if (degenerate) {
    n = m;
} else {
    secp256k1_fe_sqr(&n, &n);
}
/** If flag is true, set *r equal to *a; otherwise leave it. Constant-time. */
static void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag);
secp256k1_fe_cmov(&rr_alt, &rr, !degenerate);
secp256k1_fe_cmov(&m_alt, &m, !degenerate);
/* Now Ralt / Malt = lambda and is guaranteed not to be 0/0.
* From here on out Ralt and Malt represent the numerator
* and denominator of lambda; R and M represent the explicit
* expressions x1^2 + x2^2 + x1x2 and y1 + y2. */
secp256k1_fe_sqr(&n, &m_alt); /* n = Malt^2 (1) */
secp256k1_fe_mul(&q, &n, &t); /* q = Q = T*Malt^2 (1) */
/* These two lines use the observation that either M == Malt or M == 0,
* so M^3 * Malt is either Malt^4 (which is computed by squaring), or
* zero (which is "computed" by cmov). So the cost is one squaring
* versus two multiplications. */
secp256k1_fe_sqr(&n, &n);
secp256k1_fe_cmov(&n, &m, degenerate); /* n = M^3 * Malt (2) */
if (degenerate) {
    n = m;
} else {
    secp256k1_fe_sqr(&n, &n);
}
/** If flag is true, set *r equal to *a; otherwise leave it. Constant-time. */
static void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag);
if (flag) {
    *r = *a;
}
static SECP256K1_INLINE void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) {
    uint64_t mask0, mask1;
    mask0 = flag + ~((uint64_t)0);
    mask1 = ~mask0;
    r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1);
    r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1);
    r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1);
    r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1);
    r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1);
#ifdef VERIFY
    if (a->magnitude > r->magnitude) {
        r->magnitude = a->magnitude;
    }
    r->normalized &= a->normalized;
#endif
}
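To see why the library version is branch-free: mask0 = flag + ~((uint64_t)0) is just flag - 1 in two's complement, i.e. all ones when flag == 0 and all zeros when flag == 1, so each limb is selected by masking rather than by jumping. A minimal standalone sketch of the same selection trick (hypothetical helper name, not library code):

#include <assert.h>
#include <stdint.h>

/* Branch-free 64-bit select: returns a when flag is 1, r when flag is 0. */
static uint64_t cmov64(uint64_t r, uint64_t a, int flag) {
    uint64_t mask0 = (uint64_t)flag + ~((uint64_t)0); /* flag - 1: all ones iff flag == 0 */
    uint64_t mask1 = ~mask0;                          /* all ones iff flag == 1 */
    return (r & mask0) | (a & mask1);
}

int main(void) {
    assert(cmov64(5, 9, 0) == 5);
    assert(cmov64(5, 9, 1) == 9);
    return 0;
}

The point of doing it this way instead of with the if above is that the executed instruction stream, and therefore the timing, is identical whether or not the inputs are degenerate, which keeps secret-dependent data out of side channels.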
secp256k1_fe_sqr(&t, &rr_alt); /* t = Ralt^2 (1) */
secp256k1_fe_mul(&r->z, &a->z, &m_alt); /* r->z = Malt*Z (1) */
infinity = secp256k1_fe_normalizes_to_zero(&r->z) * (1 - a->infinity);
secp256k1_fe_mul_int(&r->z, 2); /* r->z = Z3 = 2*Malt*Z (2) */
secp256k1_fe_negate(&q, &q, 1); /* q = -Q (2) */
secp256k1_fe_add(&t, &q); /* t = Ralt^2-Q (3) */
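For reference, the identity behind the "x1^2 + x2^2 + x1x2 and y1 + y2" comment above: on y^2 = x^3 + 7, for P1 != -P2 the chord slope can be rewritten as

  lambda = (y2 - y1) / (x2 - x1) = (x1^2 + x1*x2 + x2^2) / (y1 + y2)

since (y2 - y1)*(y2 + y1) = y2^2 - y1^2 = x2^3 - x1^3 = (x2 - x1)*(x1^2 + x1*x2 + x2^2). The second form stays well-defined when P1 == P2 (where the first would be 0/0) and only breaks down when y1 + y2 == 0, which is exactly the degenerate case the cmov handling above takes care of.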
static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) {
#ifdef USE_ASM_X86_64
uint64_t l[8];
const uint64_t *pb = b->d;
__asm__ __volatile__(
/* Preload */
"movq 0(%%rdi), %%r15\n"
"movq 8(%%rdi), %%rbx\n"
"movq 16(%%rdi), %%rcx\n"
"movq 0(%%rdx), %%r11\n"
"movq 8(%%rdx), %%r9\n"
"movq 16(%%rdx), %%r10\n"
"movq 24(%%rdx), %%r8\n"
/* (rax,rdx) = a0 * b0 */
"movq %%r15, %%rax\n"
"mulq %%r11\n"
/* Extract l0 */
"movq %%rax, 0(%%rsi)\n"
/* (r14,r12,r13) = (rdx) */
"movq %%rdx, %%r14\n"
"xorq %%r12, %%r12\n"
"xorq %%r13, %%r13\n"
/* (r14,r12,r13) += a0 * b1 */
"movq %%r15, %%rax\n"
"mulq %%r9\n"
"addq %%rax, %%r14\n"
"adcq %%rdx, %%r12\n"
"movq %%rbx, %%rax\n"
"adcq $0, %%r13\n"
/* (r14,r12,r13) += a1 * b0 */
"mulq %%r11\n"
"addq %%rax, %%r14\n"
"adcq %%rdx, %%r12\n"
/* Extract l1 */
"movq %%r14, 8(%%rsi)\n"
"movq $0, %%r14\n"
/* (r12,r13,r14) += a0 * b2 */
"movq %%r15, %%rax\n"
"adcq $0, %%r13\n"
"mulq %%r10\n"
"addq %%rax, %%r12\n"
"adcq %%rdx, %%r13\n"
"movq %%rbx, %%rax\n"
"adcq $0, %%r14\n"
/* (r12,r13,r14) += a1 * b1 */
"mulq %%r9\n"
"addq %%rax, %%r12\n"
"adcq %%rdx, %%r13\n"
"movq %%rcx, %%rax\n"
"adcq $0, %%r14\n"
/* (r12,r13,r14) += a2 * b0 */
"mulq %%r11\n"
"addq %%rax, %%r12\n"
"adcq %%rdx, %%r13\n"
/* Extract l2 */
"movq %%r12, 16(%%rsi)\n"
"movq $0, %%r12\n"
/* (r13,r14,r12) += a0 * b3 */
"movq %%r15, %%rax\n"
"adcq $0, %%r14\n"
"mulq %%r8\n"
"addq %%rax, %%r13\n"
"adcq %%rdx, %%r14\n"
/* Preload a3 */
"movq 24(%%rdi), %%r15\n"
/* (r13,r14,r12) += a1 * b2 */
"movq %%rbx, %%rax\n"
"adcq $0, %%r12\n"
"mulq %%r10\n"
"addq %%rax, %%r13\n"
"adcq %%rdx, %%r14\n"
"movq %%rcx, %%rax\n"
"adcq $0, %%r12\n"
/* (r13,r14,r12) += a2 * b1 */
"mulq %%r9\n"
"addq %%rax, %%r13\n"
"adcq %%rdx, %%r14\n"
"movq %%r15, %%rax\n"
"adcq $0, %%r12\n"
/* (r13,r14,r12) += a3 * b0 */
"mulq %%r11\n"
"addq %%rax, %%r13\n"
"adcq %%rdx, %%r14\n"
/* Extract l3 */
"movq %%r13, 24(%%rsi)\n"
"movq $0, %%r13\n"
/* (r14,r12,r13) += a1 * b3 */
"movq %%rbx, %%rax\n"
"adcq $0, %%r12\n"
"mulq %%r8\n"
"addq %%rax, %%r14\n"
"adcq %%rdx, %%r12\n"
"movq %%rcx, %%rax\n"
"adcq $0, %%r13\n"
/* (r14,r12,r13) += a2 * b2 */
"mulq %%r10\n"
"addq %%rax, %%r14\n"
"adcq %%rdx, %%r12\n"
"movq %%r15, %%rax\n"
"adcq $0, %%r13\n"
/* (r14,r12,r13) += a3 * b1 */
"mulq %%r9\n"
"addq %%rax, %%r14\n"
"adcq %%rdx, %%r12\n"
"movq %%rcx, %%rax\n"
"adcq $0, %%r13\n"
/* Extract l4 */
/* "movq %%r14, 32(%%rsi)\n"*/
/* (r12,r13,r14) += a2 * b3 */
"mulq %%r8\n"
"movq %%r14, %%r11\n"
"xorq %%r14, %%r14\n"
"addq %%rax, %%r12\n"
"movq %%r15, %%rax\n"
"adcq %%rdx, %%r13\n"
"adcq $0, %%r14\n"
/* (r12,r13,r14) += a3 * b2 */
"mulq %%r10\n"
"addq %%rax, %%r12\n"
"adcq %%rdx, %%r13\n"
"movq %%r15, %%rax\n"
"adcq $0, %%r14\n"
/* Extract l5 */
/*"movq %%r12, 40(%%rsi)\n"*/
/* (r13,r14) += a3 * b3 */
"mulq %%r8\n"
"addq %%rax, %%r13\n"
"adcq %%rdx, %%r14\n"
/* Extract l6 */
/*"movq %%r13, 48(%%rsi)\n"*/
/* Extract l7 */
/*"movq %%r14, 56(%%rsi)\n"*/
: "+d"(pb)
: "S"(l), "D"(a->d)
: "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory");
__asm__ __volatile__(
/* Preload. */
/* "movq 32(%%rsi), %%r11\n" */
/* "movq 40(%%rsi), %%r12\n" */
/*"movq 48(%%rsi), %%r13\n" */
/* "movq 56(%%rsi), %%r14\n" */
"movq 0(%%rsi), %%rbx\n"
"movq %3, %%rax\n"
"movq %%rax, %%r10\n"
"xor %%ecx, %%ecx\n"
"xorq %%r15, %%r15\n"
"xorq %%r9, %%r9\n"
"xorq %%r8, %%r8\n"
"mulq %%r11\n"
"addq %%rax, %%rbx\n" /*q0 into rbx*/
"adcq %%rdx, %%rcx\n"
"addq 8(%%rsi), %%rcx\n"
"movq %%r10, %%rax\n"
"adcq %%r9, %%r15\n"
"mulq %%r12\n"
"addq %%rax, %%rcx\n" /*q1 stored to rcx*/
"adcq %%rdx, %%r15\n"
"movq %4, %%rax\n"
"adcq %%r9, %%r8\n"
"mulq %%r11\n"
"addq %%rax, %%rcx\n"
"adcq %%rdx, %%r15\n"
"adcq %%r9, %%r8\n"
"addq 16(%%rsi), %%r15\n"
"adcq %%r9, %%r8\n"
"movq %%r10, %%rax\n"
"adcq %%r9, %%r9\n"
"mulq %%r13\n"
"addq %%rax, %%r15\n"
"adcq %%rdx, %%r8\n"
"movq %4, %%rax\n"
"adcq $0, %%r9\n"
"mulq %%r12\n"
"addq %%rax, %%r15\n"
"adcq %%rdx, %%r8\n"
"adcq $0, %%r9\n"
"movq %%r10, %%rax\n"
"movq $0, %%r10\n"
"addq %%r11, %%r15\n" /*q2 into r15*/
"adcq $0, %%r8\n"
"adcq $0, %%r9\n"
"addq 24(%%rsi), %%r8\n"
"adcq $0, %%r9\n"
"adcq %%r10, %%r10\n"
"mulq %%r14\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r9\n"
"movq %4, %%rax\n"
"movq %%rax, %%rsi\n"
"adcq $0, %%r10\n"
"mulq %%r13\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r9\n"
"adcq $0, %%r10\n"
"addq %%r8, %%r12\n" /* q3 into r12*/
"adcq $0, %%r9\n"
"movq $0, %%r8\n"
"movq %%rsi, %%rax\n"
"adcq $0, %%r10\n"
"mulq %%r14\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
"adcq %%r8, %%r8\n"
"addq %%r9, %%r13\n" /*q4 into r13*/
"adcq $0, %%r10\n"
"adcq $0, %%r8\n"
"addq %%r14, %%r10\n" /* q5 into r10 */
"movq %3, %%rax\n"
"movq %%rax, %%r9\n"
"adcq $0, %%r8\n" /*q6 into r8*/
/* %q5 input for second operation is %q0 output from first / RBX as the connecting link
%q6 input for second operation is %q1 output from first / RCX as the connecting link
%q7 input for second operation is %q2 output from first / R15 as the connecting link
%q8 input for second operation is %q3 output from first / R12 as the connecting link
%q9 input for second operation is %q4 output from first / R13 as the connecting link*
%q10 input for second operation is %q5 output from first / R10 as the connecting link*
%q11 input for second operation is %q6 output from first / R8 as the connecting link */
/* Reduce 385 bits into 258. */
"mulq %%r13\n"
"xorq %%r14, %%r14\n"
"xorq %%r11, %%r11\n"
"addq %%rax, %%rbx\n" /* q0 output*/
"adcq %%rdx, %%r14\n"
"addq %%rcx, %%r14\n"
"mov $0, %%ecx\n"
"movq %%r9, %%rax\n"
"adcq %%r11, %%r11\n"
"mulq %%r10\n"
"addq %%rax, %%r14\n"
"adcq %%rdx, %%r11\n"
"movq %%rsi, %%rax\n"
"adcq %%rcx, %%rcx\n"
"mulq %%r13\n"
"addq %%rax, %%r14\n" /* q1 output */
"movq %%r9, %%rax\n"
"adcq %%rdx, %%r11\n"
"adcq $0, %%rcx\n"
"xorq %%r9, %%r9\n"
"addq %%r15, %%r11\n"
"adcq %%r9, %%rcx\n"
"movq %%rax, %%r15\n"
"adcq %%r9, %%r9\n"
"mulq %%r8\n"
"addq %%rax, %%r11\n"
"adcq %%rdx, %%rcx\n"
"movq %%rsi, %%rax\n"
"adcq $0, %%r9\n"
"mulq %%r10\n"
"addq %%rax, %%r11\n"
"adcq %%rdx, %%rcx\n"
"adcq $0, %%r9\n"
"addq %%r13, %%r11\n" /* q2 output */
"adcq $0, %%rcx\n"
"adcq $0, %%r9\n"
"addq %%r12, %%rcx\n"
"movq %%rsi, %%rax\n"
"adcq $0, %%r9\n"
"mulq %%r8\n"
"addq %%rax, %%rcx\n"
"adcq %%rdx, %%r9\n"
"addq %%r10, %%rcx\n" /* q3 output */
"adcq $0, %%r9\n"
"movq %%r15, %%rax\n"
"addq %%r8, %%r9\n" /* q4 output */
/* %q1 input for next operation is %q0 output from prior / RBX as the connecting link
%q2 input for next operation is %q1 output from prior / R14 as the connecting link
%q3 input for next operation is %q2 output from prior / R11 as the connecting link
%q4 input for next operation is %q3 output from prior / RCX as the connecting link
%q5 input for next operation is %q4 output from prior / R9 as the connecting link */
/* Reduce 258 bits into 256. */
"mulq %%r9\n"
"addq %%rbx, %%rax\n"
"adcq $0, %%rdx\n"
"movq %%rax, %%r8\n" /* 0(q2) output */
"movq %%rdx, %%r12\n"
"xorq %%r13, %%r13\n"
"addq %%r14, %%r12\n"
"movq %%rsi, %%rax\n"
"adcq %%r13, %%r13\n"
"mulq %%r9\n"
"addq %%rax, %%r12\n" /* 8(q2) output */
"adcq %%rdx, %%r13\n"
"xor %%ebx, %%ebx\n"
"addq %%r9, %%r13\n"
"adcq %%rbx, %%rbx\n"
"movq $0xffffffffffffffff, %%r14\n"
"addq %%r11, %%r13\n" /* 16(q2) output */
"movq $0, %%r11\n"
"adcq $0, %%rbx\n"
"addq %%rcx, %%rbx\n" /* 24(q2) output */
"adcq $0, %%r11\n" /* c output */
/*FINAL REDUCTION */
/* r8 carries ex 0(%%rdi),
r12 carries ex 8(%%rdi),
r13 carries ex 16(%%rdi),
rbx carries ex 24(%%rdi)
r11 carries c */
"movq $0xbaaedce6af48a03b,%%r9\n"
"movq $0xbaaedce6af48a03a,%%rcx\n"
"movq $0xbfd25e8cd0364140,%%r10\n"
"cmp %%r14 ,%%rbx\n"
"setne %%dl\n"
"cmp $0xfffffffffffffffd,%%r13\n"
"setbe %%al\n"
"or %%eax,%%edx\n"
"cmp %%rcx,%%r12\n"
"setbe %%cl\n"
"or %%edx,%%ecx\n"
"cmp %%r9,%%r12\n"
"movzbl %%dl,%%edx\n"
"seta %%r9b\n"
"cmp %%r10,%%r8\n"
"movzbl %%cl,%%ecx\n"
"seta %%r10b\n"
"not %%ecx\n"
"not %%edx\n"
"or %%r10d,%%r9d\n"
"movzbl %%r9b,%%r9d\n"
"and %%r9d,%%ecx\n"
"xor %%r9d,%%r9d\n"
"cmp %%r14,%%r13\n"
"sete %%r9b\n"
"xor %%r10d,%%r10d\n"
"and %%r9d,%%edx\n"
"or %%edx,%%ecx\n"
"xor %%edx,%%edx\n"
"add %%ecx,%%r11d\n"
"imulq %%r11,%%r15\n"
"addq %%r15,%%r8\n"
"adcq %%rdx,%%r10\n"
"imulq %%r11,%%rsi\n"
"xorq %%r15,%%r15\n"
"xor %%eax,%%eax\n"
"movq %%r8,0(%q2)\n"
"xor %%edx,%%edx\n"
"addq %%r12,%%rsi\n"
"adcq %%rdx,%%rdx\n"
"addq %%rsi,%%r10\n"
"movq %%r10,8(%q2)\n"
"adcq %%rdx,%%r15\n"
"addq %%r11,%%r13\n"
"adcq %%rax,%%rax\n"
"addq %%r15,%%r13\n"
"movq %%r13,16(%q2)\n"
"adcq $0,%%rax\n"
"addq %%rbx,%%rax\n"
"movq %%rax,24(%q2)\n"
: "=D"(r)
: "S"(l), "D"(r), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1)
: "rax", "rbx", "rcx", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory");
#else
uint64_t l[8];
secp256k1_scalar_mul_512(l, a, b);
secp256k1_scalar_reduce_512(r, l);
#endif
}
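For anyone who doesn't want to trace the register allocation: the first asm block is a plain 4x4 schoolbook multiply of two 4x64-bit scalars into eight 64-bit limbs l[0..7], accumulating each diagonal in a rotating 192-bit accumulator (the r12/r13/r14 triple), and the second block reduces the 512-bit product mod the group order using the SECP256K1_N_C_* constants (385 -> 258 -> 256 bits, then the final conditional subtraction). A hedged C sketch of just the multiply stage, assuming a compiler with unsigned __int128; the helper names are mine, not the library's:

#include <stdint.h>

/* Accumulate one 64x64->128 product into a 192-bit accumulator (c0,c1,c2). */
static void muladd(uint64_t *c0, uint64_t *c1, uint64_t *c2, uint64_t x, uint64_t y) {
    unsigned __int128 t = (unsigned __int128)x * y;
    uint64_t lo = (uint64_t)t, hi = (uint64_t)(t >> 64);
    *c0 += lo;
    hi  += (*c0 < lo);   /* carry out of c0; hi <= 2^64 - 2, so this cannot overflow */
    *c1 += hi;
    *c2 += (*c1 < hi);   /* carry out of c1 */
}

/* l[0..7] = a[0..3] * b[0..3], least-significant limb first. */
static void mul_256x256(uint64_t l[8], const uint64_t a[4], const uint64_t b[4]) {
    uint64_t c0 = 0, c1 = 0, c2 = 0;
    int i, k;
    for (k = 0; k < 7; k++) {
        for (i = 0; i <= k; i++) {
            if (i < 4 && k - i < 4) {
                muladd(&c0, &c1, &c2, a[i], b[k - i]);
            }
        }
        l[k] = c0;                 /* extract column k, rotate the accumulator */
        c0 = c1; c1 = c2; c2 = 0;
    }
    l[7] = c0;                     /* the top limb is pure carry */
}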
static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) {
#ifdef USE_ASM_X86_64
uint64_t l[8];
__asm__ __volatile__(
/* Preload */
"movq 0(%%rdi), %%r11\n"
"movq 8(%%rdi), %%r12\n"
"movq 16(%%rdi), %%rcx\n"
"movq 24(%%rdi), %%r14\n"
/* (rax,rdx) = a0 * a0 */
"movq %%r11, %%rax\n"
"mulq %%r11\n"
/* Extract l0 */
"movq %%rax, %%rbx\n" /*0(%%rsi)\n"*/
/* (r8,r9,r10) = (rdx,0) */
"movq %%rdx, %%r15\n"
"xorq %%r9, %%r9\n"
"xorq %%r10, %%r10\n"
"xorq %%r8, %%r8\n"
/* (r8,r9,r10) += 2 * a0 * a1 */
"movq %%r11, %%rax\n"
"mulq %%r12\n"
"addq %%rax, %%r15\n"
"adcq %%rdx, %%r9\n"
"adcq $0, %%r10\n"
"addq %%rax, %%r15\n" /*8 rsi in r15*/
"adcq %%rdx, %%r9\n"
"movq %%r11, %%rax\n"
"adcq $0, %%r10\n"
/* Extract l1 */
/* 8(rsi) in r15*/
/* (r9,r10,r8) += 2 * a0 * a2 */
"mulq %%rcx\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
"adcq $0, %%r8\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
"movq %%r12, %%rax\n"
"adcq $0, %%r8\n"
/* (r9,r10,r8) += a1 * a1 */
"mulq %%r12\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
/* Extract l2 */
"movq %%r9, 16(%%rsi)\n"
"movq %%r11, %%rax\n"
"movq $0, %%r9\n"
/* (r10,r8,r9) += 2 * a0 * a3 */
"adcq $0, %%r8\n"
"mulq %%r14\n"
"addq %%rax, %%r10\n"
"adcq %%rdx, %%r8\n"
"adcq $0, %%r9\n"
"addq %%rax, %%r10\n"
"adcq %%rdx, %%r8\n"
"movq %%r12, %%rax\n"
"adcq $0, %%r9\n"
/* (r10,r8,r9) += 2 * a1 * a2 */
"mulq %%rcx\n"
"addq %%rax, %%r10\n"
"adcq %%rdx, %%r8\n"
"adcq $0, %%r9\n"
"addq %%rax, %%r10\n"
"adcq %%rdx, %%r8\n"
"movq %%r10, %%r13\n"
"movq %%r12, %%rax\n"
"adcq $0, %%r9\n"
/* Extract l3 */
/*"movq %%r10, 24(%%rsi)\n"*/
/* (r8,r9,r10) += 2 * a1 * a3 */
"mulq %%r14\n"
"xorq %%r10, %%r10\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r9\n"
"adcq $0, %%r10\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r9\n"
"movq %%rcx, %%rax\n"
"adcq $0, %%r10\n"
/* (r8,r9,r10) += a2 * a2 */
"mulq %%rcx\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r9\n"
/* Extract l4 */
/*"movq %%r8, 32(%%rsi)\n"*/
"movq %%r8, %%r11\n"
"movq %%rcx, %%rax\n"
"movq $0, %%r8\n"
/* (r9,r10,r8) += 2 * a2 * a3 */
"adcq $0, %%r10\n"
"mulq %%r14\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
"adcq $0, %%r8\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
"movq %%r14, %%rax\n"
"adcq $0, %%r8\n"
/* Extract l5 */
/*"movq %%r9, 40(%%rsi)\n"*/
/* "movq %%r9, %%r12\n"*/
/* (r10,r8) += a3 * a3 */
"mulq %%r14\n"
"addq %%rax, %%r10\n"
/* Extract l6 */
/*"movq %%r10, 48(%%rsi)\n"*/
/*"movq %%r10, %%rcx\n"*/
/* Extract l7 */
/*"movq %%r8, 56(%%rsi)\n"*/
/*"movq %%r8, %%r14\n"*/
:
: "S"(l), "D"(a->d)
: "rax", "rbx", "rcx", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory");
__asm__ __volatile__(
/* Preload. */
/* "movq 32(%%rsi), %%r11\n" */
/* "movq 40(%%rsi), %%r9\n" */
/* "movq 48(%%rsi), %%r10\n" */
/* "movq 56(%%rsi), %%r8\n" */
/* "movq 0(%%rsi), %%rbx\n" */
/* "movq %%rcx, %%r13\n"*/
"movq %3, %%rax\n"
"adcq %%rdx, %%r8\n"
"mulq %%r11\n"
"xor %%ecx, %%ecx\n"
"xorq %%r12, %%r12\n"
"xorq %%r14, %%r14\n"
"addq %%rax, %%rbx\n" /*q0 into rbx*/
"adcq %%rdx, %%rcx\n"
/* "addq 8(%%rsi), %%rcx\n" */
"addq %%r15, %%rcx\n"
"mov $0, %%r15d\n"
"movq %3, %%rax\n"
"adcq %%r12, %%r15\n"
"mulq %%r9\n"
"addq %%rax, %%rcx\n" /*q1 stored to rcx*/
"adcq %%rdx, %%r15\n"
"movq %4, %%rax\n"
"adcq %%r12, %%r14\n"
"mulq %%r11\n"
"addq %%rax, %%rcx\n"
"adcq %%rdx, %%r15\n"
"adcq %%r12, %%r14\n"
"addq 16(%%rsi), %%r15\n"
"adcq %%r12, %%r14\n"
"movq %3, %%rax\n"
"adcq %%r12, %%r12\n"
"mulq %%r10\n"
"movq %4, %%rsi\n"
"addq %%rax, %%r15\n"
"adcq %%rdx, %%r14\n"
"movq %%rsi, %%rax\n"
"adcq $0, %%r12\n"
"mulq %%r9\n"
"addq %%rax, %%r15\n"
"adcq %%rdx, %%r14\n"
"adcq $0, %%r12\n"
"movq %3, %%rax\n"
"addq %%r11, %%r15\n" /*q2 into r15*/
"adcq $0, %%r14\n"
"adcq $0, %%r12\n"
"addq %%r13, %%r14\n"
"movq $0, %%r13\n"
"adcq $0, %%r12\n"
"adcq $0, %%r13\n"
"mulq %%r8\n"
"addq %%rax, %%r14\n"
"movq %%rsi, %%rax\n"
"adcq %%rdx, %%r12\n"
"adcq $0, %%r13\n"
"mulq %%r10\n"
"addq %%rax, %%r14\n"
"adcq %%rdx, %%r12\n"
"adcq $0, %%r13\n"
"addq %%r14, %%r9\n" /* q3 into r9*/
"adcq $0, %%r12\n"
"movq %%rsi, %%rax\n"
"movq $0, %%r14\n"
"adcq $0, %%r13\n"
"mulq %%r8\n"
"addq %%rax, %%r12\n"
"adcq %%rdx, %%r13\n"
"adcq %%r14, %%r14\n"
"addq %%r12, %%r10\n" /*q4 into r10*/
"adcq $0, %%r13\n"
"adcq $0, %%r14\n"
"addq %%r8, %%r13\n" /* q5 into r13 */
"movq %3, %%rax\n"
"movq %%rax, %%r12\n"
"adcq $0, %%r14\n" /*q6 into r14*/
/* %q5 input for second operation is %q0 output from first / RBX as the connecting link
%q6 input for second operation is %q1 output from first / RCX as the connecting link
%q7 input for second operation is %q2 output from first / R15 as the connecting link
%q8 input for second operation is %q3 output from first / r9 as the connecting link
%q9 input for second operation is %q4 output from first / r10 as the connecting link*
%q10 input for second operation is %q5 output from first / r13 as the connecting link*
%q11 input for second operation is %q6 output from first / r14 as the connecting link */
/* Reduce 385 bits into 258. */
"mulq %%r10\n"
"xorq %%r8, %%r8\n"
"xorq %%r11, %%r11\n"
"addq %%rax, %%rbx\n" /* q0 output*/
"adcq %%rdx, %%r8\n"
"addq %%rcx, %%r8\n"
"movq %%r12, %%rax\n"
"mov $0, %%ecx\n"
"adcq %%r11, %%r11\n"
"mulq %%r13\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r11\n"
"movq %%rsi, %%rax\n"
"adcq %%rcx, %%rcx\n"
"mulq %%r10\n"
"addq %%rax, %%r8\n" /* q1 output */
"movq %%r12, %%rax\n"
"adcq %%rdx, %%r11\n"
"adcq $0, %%rcx\n"
"xorq %%r12, %%r12\n"
"addq %%r15, %%r11\n"
"adcq %%r12, %%rcx\n"
"movq %%rax, %%r15\n"
"adcq %%r12, %%r12\n"
"mulq %%r14\n"
"addq %%rax, %%r11\n"
"adcq %%rdx, %%rcx\n"
"movq %%rsi, %%rax\n"
"adcq $0, %%r12\n"
"mulq %%r13\n"
"addq %%rax, %%r11\n"
"adcq %%rdx, %%rcx\n"
"adcq $0, %%r12\n"
"addq %%r10, %%r11\n" /* q2 output */
"adcq $0, %%rcx\n"
"adcq $0, %%r12\n"
"addq %%r9, %%rcx\n"
"movq %%rsi, %%rax\n"
"adcq $0, %%r12\n"
"mulq %%r14\n"
"addq %%rax, %%rcx\n"
"adcq %%rdx, %%r12\n"
"addq %%r13, %%rcx\n" /* q3 output */
"adcq $0, %%r12\n"
"movq %%r15, %%rax\n"
"addq %%r14, %%r12\n" /* q4 output */
/* %q1 input for next operation is %q0 output from prior / RBX as the connecting link
%q2 input for next operation is %q1 output from prior / r8 as the connecting link
%q3 input for next operation is %q2 output from prior / R11 as the connecting link
%q4 input for next operation is %q3 output from prior / RCX as the connecting link
%q5 input for next operation is %q4 output from prior / r12 as the connecting link */
/* Reduce 258 bits into 256. */
"mulq %%r12\n"
"addq %%rbx, %%rax\n"
"adcq $0, %%rdx\n"
"movq %%rax, %%r14\n" /* 0(q2) output */
"movq %%rdx, %%r9\n"
"xorq %%r10, %%r10\n"
"addq %%r8, %%r9\n"
"movq %%rsi, %%rax\n"
"adcq %%r10, %%r10\n"
"mulq %%r12\n"
"addq %%rax, %%r9\n" /* 8(q2) output */
"adcq %%rdx, %%r10\n"
"xor %%ebx, %%ebx\n"
"addq %%r12, %%r10\n"
"adcq %%rbx, %%rbx\n"
"movq $0xffffffffffffffff, %%r8\n"
"addq %%r11, %%r10\n" /* 16(q2) output */
"movq $0, %%r11\n"
"adcq $0, %%rbx\n"
"addq %%rcx, %%rbx\n" /* 24(q2) output */
"adcq $0, %%r11\n" /* c output */
/*FINAL REDUCTION */
/* r14 carries ex 0(%%rdi),
r9 carries ex 8(%%rdi),
r10 carries ex 16(%%rdi),
rbx carries ex 24(%%rdi)
r11 carries c */
"movq $0xbaaedce6af48a03b,%%r12\n"
"movq $0xbaaedce6af48a03a,%%rcx\n"
"movq $0xbfd25e8cd0364140,%%r13\n"
"cmp %%r8 ,%%rbx\n"
"setne %%dl\n"
"cmp $0xfffffffffffffffd,%%r10\n"
"setbe %%al\n"
"or %%eax,%%edx\n"
"cmp %%rcx,%%r9\n"
"setbe %%cl\n"
"or %%edx,%%ecx\n"
"cmp %%r12,%%r9\n"
"movzbl %%dl,%%edx\n"
"seta %%r12b\n"
"cmp %%r13,%%r14\n"
"movzbl %%cl,%%ecx\n"
"seta %%r13b\n"
"not %%ecx\n"
"not %%edx\n"
"or %%r13d,%%r12d\n"
"movzbl %%r12b,%%r12d\n"
"and %%r12d,%%ecx\n"
"xor %%r12d,%%r12d\n"
"cmp %%r8,%%r10\n"
"sete %%r12b\n"
"xor %%r13d,%%r13d\n"
"and %%r12d,%%edx\n"
"or %%edx,%%ecx\n"
"xor %%edx,%%edx\n"
"add %%ecx,%%r11d\n"
"imulq %%r11,%%r15\n"
"addq %%r15,%%r14\n"
"adcq %%rdx,%%r13\n"
"imulq %%r11,%%rsi\n"
"xorq %%r15,%%r15\n"
"xor %%eax,%%eax\n"
"movq %%r14,0(%q2)\n"
"xor %%edx,%%edx\n"
"addq %%r9,%%rsi\n"
"adcq %%rdx,%%rdx\n"
"addq %%rsi,%%r13\n"
"movq %%r13,8(%q2)\n"
"adcq %%rdx,%%r15\n"
"addq %%r11,%%r10\n"
"adcq %%rax,%%rax\n"
"addq %%r15,%%r10\n"
"movq %%r10,16(%q2)\n"
"adcq $0,%%rax\n"
"addq %%rbx,%%rax\n"
"movq %%rax,24(%q2)\n"
: "=D"(r)
: "S"(l), "D"(r), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1)
: "rax", "rbx", "rcx", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory");
#else
uint64_t l[8];
secp256k1_scalar_sqr_512(l, a);
secp256k1_scalar_reduce_512(r, l);
#endif
}
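The squaring routine wins over the generic multiply because every cross product a[i]*a[j] with i != j occurs twice in the square, so only 10 distinct 64x64 multiplications are needed (4 squares plus 6 cross terms) instead of 16. A sketch in the same style, reusing the hypothetical muladd helper from the multiply sketch above; the cross term is simply accumulated twice, which is what the asm does when it adds rax:rdx into the accumulator a second time:

/* l[0..7] = a[0..3]^2: squares once, cross products twice. */
static void sqr_256(uint64_t l[8], const uint64_t a[4]) {
    uint64_t c0 = 0, c1 = 0, c2 = 0;
    int i, k;
    for (k = 0; k < 7; k++) {
        for (i = 0; 2 * i <= k; i++) {
            int j = k - i;
            if (j > 3) continue;
            muladd(&c0, &c1, &c2, a[i], a[j]);
            if (i != j) {
                muladd(&c0, &c1, &c2, a[i], a[j]);  /* cross term counts twice */
            }
        }
        l[k] = c0;
        c0 = c1; c1 = c2; c2 = 0;
    }
    l[7] = c0;
}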
10:03 < tonikt> Like being able to download a block in fragments
10:03 < sipa> tonikt: BIP37 allows that, in a way
[...]
10:05 < gmaxwell> tonikt: Not a very worthwhile thing in my opinion, and as sipa points out its already possible.
/**********************************************************************
* Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille *
* Distributed under the MIT software license, see the accompanying *
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
**********************************************************************/
/**
* Changelog:
* - March 2013, Diederik Huys: original version
* - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
* - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
*/
#ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
#define _SECP256K1_FIELD_INNER5X52_IMPL_H_
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
/**
* Registers: rdx:rax = multiplication accumulator
* r9:r8 = c
* r15:rcx = d
* r10-r14 = a0-a4
* rbx = b
* rdi = r
* rsi = a / t?
*/
uint64_t tmp1, tmp2;
__asm__ __volatile__(
"movq 24(%%rsi),%%r13\n"
"movq 0(%%rbx),%%rax\n"
"movq 32(%%rsi),%%r14\n"
/* d += a3 * b0 */
"mulq %%r13\n"
"movq 0(%%rsi),%%r10\n"
"movq 8(%%rsi),%%r11\n"
"movq %%rax,%%r9\n"
"movq 16(%%rsi),%%r12\n"
"movq 8(%%rbx),%%rax\n"
"movq %%rdx,%%rsi\n"
/* d += a2 * b1 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"movq 16(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b2 */
"mulq %%r11\n"
"movq $0x1000003d10,%%rcx\n"
"movq $0xfffffffffffff,%%r15\n"
"addq %%rax,%%r9\n"
"movq 24(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d = a0 * b3 */
"mulq %%r10\n"
"addq %%rax,%%r9\n"
"movq 32(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* c = a4 * b4 */
"mulq %%r14\n"
"movq %%rax,%%r8\n"
"shrdq $52,%%rdx,%%r8\n" /* c >>= 52 (%%r8 only) */
/* d += (c & M) * R */
"andq %%r15,%%rax\n"
"mulq %%rcx\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* t3 (tmp1) = d & M */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,%q1\n"
/* d >>= 52 */
"movq 0(%%rbx),%%rax\n"
"shrdq $52,%%rsi,%%r9\n"
"xor %%esi,%%esi\n"
/* d += a4 * b0 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq 8(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b1 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq 16(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b2 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"movq 24(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b3 */
"mulq %%r11\n"
"addq %%rax,%%r9\n"
"movq 32(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a0 * b4 */
"mulq %%r10\n"
"addq %%rax,%%r9\n"
/* d += c * R */
"movq %%rcx,%%rax\n"
"adcq %%rdx,%%rsi\n"
"mulq %%r8\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* t4 = d & M (%%r15) */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
/* d >>= 52 */
"shrdq $52,%%rsi,%%r9\n"
"xor %%esi,%%esi\n"
/* tx = t4 >> 48 (tmp3) */
"movq %%rax,%%r15\n"
"shrq $48,%%r15\n" /*Q3*/
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"movq %%rax,%q2\n"
/*"movq %q2,%%r15\n" */
"movq 0(%%rbx),%%rax\n"
/* c = a0 * b0 */
"mulq %%r10\n"
"movq %%rax,%%r8\n"
"movq 8(%%rbx),%%rax\n"
"movq %%rdx,%%rcx\n"
/* d += a4 * b1 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq 16(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b2 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq 24(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b3 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"movq 32(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b4 */
"mulq %%r11\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
"movq %%r15,%%rax\n" /*Q3 transfered*/
/* u0 = d & M (%%r15) */
"movq %%r9,%%rdx\n"
"shrdq $52,%%rsi,%%r9\n"
"movq $0xfffffffffffff,%%r15\n"
"xor %%esi, %%esi\n"
"andq %%r15,%%rdx\n"
/* d >>= 52 */
/* u0 = (u0 << 4) | tx (%%r15) */
"shlq $4,%%rdx\n"
"orq %%rax,%%rdx\n"
/* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* r[0] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,0(%%rdi)\n"
/* c >>= 52 */
"movq 0(%%rbx),%%rax\n"
"shrdq $52,%%rcx,%%r8\n"
"xor %%ecx,%%ecx\n"
/* c += a1 * b0 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq 8(%%rbx),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a0 * b1 */
"mulq %%r10\n"
"addq %%rax,%%r8\n"
"movq 16(%%rbx),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a4 * b2 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq 24(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b3 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq 32(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b4 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* c += (d & M) * R */
"movq %%r9,%%rax\n"
"movq $0x1000003d10,%%rdx\n"
"andq %%r15,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* d >>= 52 */
"shrdq $52,%%rsi,%%r9\n"
/* r[1] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,8(%%rdi)\n"
/* c >>= 52 */
"movq 0(%%rbx),%%rax\n"
"shrdq $52,%%rcx,%%r8\n"
"xor %%ecx,%%ecx\n"
/* c += a2 * b0 */
"mulq %%r12\n"
"addq %%rax,%%r8\n"
"movq 8(%%rbx),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a1 * b1 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq 16(%%rbx),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a0 * b2 (last use of %%r10 = a0) */
"mulq %%r10\n"
"addq %%rax,%%r8\n"
/* fetch t3 (%%r10, overwrites a0), t4 (%%r15) */
"movq 24(%%rbx),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a4 * b3 */
"mulq %%r14\n"
"movq %q1,%%r10\n"
"xor %%esi, %%esi\n"
"addq %%rax,%%r9\n"
"movq 32(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b4 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq $0x1000003d10,%%r11\n"
"adcq %%rdx,%%rsi\n"
/* c += (d & M) * R */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* d >>= 52 (%%r9 only) */
"shrdq $52,%%rsi,%%r9\n"
/* r[2] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %q2,%%rsi\n"
"movq %%rax,16(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%rcx,%%r8\n"
/* c += t3 */
"xor %%ecx,%%ecx\n"
"movq %%r9,%%rax\n"
"addq %%r10,%%r8\n"
/* c += d * R */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* r[3] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%rcx,%%r8\n"
/* c += t4 (%%r8 only) */
"addq %%rsi,%%r8\n"
/* r[4] = c */
"movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1), "=m"(tmp2)
: "b"(b), "D"(r)
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory");
}
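Some context for the magic constants here, as I read the code: a field element is five 52-bit limbs (M = 0xfffffffffffff masks one limb), and the field prime is p = 2^256 - 0x1000003D1, so 2^260 == 0x1000003D1 << 4 == 0x1000003D10 (mod p). That is the R the comments keep multiplying by: anything accumulated at limb position 5 or above (units of 2^260) can be folded back down with one multiply by R. A minimal sketch of just that fold, on a value already split into ten 52-bit limbs; the real routine interleaves the fold with the schoolbook multiply and guarantees the bounds via magnitude tracking:

#include <stdint.h>

#define M52 0xFFFFFFFFFFFFFULL  /* 52-bit limb mask */
#define R52 0x1000003D10ULL     /* 2^260 mod p, with p = 2^256 - 0x1000003D1 */

/* Fold t[0..9] (value = sum of t[k] * 2^(52k)) into five limbs mod p. */
static void fold_10x52(uint64_t r[5], const uint64_t t[10]) {
    unsigned __int128 c = 0;
    int k;
    /* 2^(52*(k+5)) = 2^(52k) * 2^260 == 2^(52k) * R52 (mod p) */
    for (k = 0; k < 5; k++) {
        c += (unsigned __int128)t[k] + (unsigned __int128)t[k + 5] * R52;
        r[k] = (uint64_t)c & M52;
        c >>= 52;
    }
    /* The leftover carry is in units of 2^260, so it folds the same way. */
    while (c != 0) {
        c *= R52;
        for (k = 0; k < 5; k++) {
            c += r[k];
            r[k] = (uint64_t)c & M52;
            c >>= 52;
        }
    }
    /* r is now < 2^260 and congruent to the input mod p; normalizing to a
     * canonical value below p is a separate step in the real code. */
}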
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
/**
* Registers: rdx:rax = multiplication accumulator
* r9:r8 = c
* rcx:rbx = d
* r10-r14 = a0-a4
* r15 = M (0xfffffffffffff)
* rdi = r
* rsi = a / t?
*/
uint64_t tmp1a;
__asm__ __volatile__(
"movq 0(%%rsi),%%r10\n"
"movq 8(%%rsi),%%r11\n"
"movq 16(%%rsi),%%r12\n"
"movq 24(%%rsi),%%r13\n"
"movq 32(%%rsi),%%r14\n"
"leaq (%%r10,%%r10,1),%%rax\n"
"movq $0xfffffffffffff,%%r15\n"
/* d = (a0*2) * a3 */
"mulq %%r13\n"
"movq %%rax,%%rbx\n"
"leaq (%%r11,%%r11,1),%%rax\n"
"movq %%rdx,%%rcx\n"
/* d += (a1*2) * a2 */
"mulq %%r12\n"
"addq %%rax,%%rbx\n"
"movq %%r14,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c = a4 * a4 */
"mulq %%r14\n"
"movq %%rax,%%r8\n"
"movq %%rdx,%%r9\n"
/* d += (c & M) * R */
"movq $0x1000003d10,%%rdx\n"
"andq %%r15,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n"
/* t3 (tmp1) = d & M */
"movq %%rbx,%%rsi\n"
"andq %%r15,%%rsi\n" /*Q1 became rsi*/
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
/* a4 *= 2 */
"movq %%r10,%%rax\n"
"addq %%r14,%%r14\n"
/* d += a0 * a4 */
"mulq %%r14\n"
"xor %%ecx,%%ecx\n"
"addq %%rax,%%rbx\n"
"leaq (%%r11,%%r11,1),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d+= (a1*2) * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"movq %%r12,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a2 * a2 */
"mulq %%r12\n"
"addq %%rax,%%rbx\n"
/* d += c * R */
"movq %%r8,%%rax\n"
"movq $0x1000003d10,%%r8\n"
"adcq %%rdx,%%rcx\n"
"mulq %%r8\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* t4 = d & M (%%rsi) */
"movq %%rbx,%%rdx\n"
"andq %%r15,%%rdx\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* tx = t4 >> 48 (tmp3) */
"movq %%rdx,%%r15\n"
"shrq $48,%%r15\n" /*Q3=R15*/
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rax\n"
"andq %%rax,%%rdx\n"
"movq %%rdx,%q1\n"/*Q2 OUT - renamed to q1*/
/* c = a0 * a0 */
"movq %%r10,%%rax\n"
"mulq %%r10\n"
"movq %%rax,%%r8\n"
"movq %%r11,%%rax\n"
"movq %%rdx,%%r9\n"
/* d += a1 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"leaq (%%r12,%%r12,1),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += (a2*2) * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* u0 = d & M (%%rsi) */
"movq %%rbx,%%rdx\n"
"movq $0xfffffffffffff,%%rax\n"
"andq %%rax,%%rdx\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* u0 = (u0 << 4) | tx (%%rsi) */
"shlq $4,%%rdx\n"
"orq %%r15,%%rdx\n" /*Q3 - R15 RETURNS*/
/* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n"
"movq $0xfffffffffffff,%%r15\n" /*R15 back in its place*/
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* r[0] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,0(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
/* a0 *= 2 */
"addq %%r10,%%r10\n"
/* c += a0 * a1 */
"movq %%r10,%%rax\n"
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%r12,%%rax\n"
"adcq %%rdx,%%r9\n"
/* d += a2 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"movq %%r13,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a3 * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c += (d & M) * R */
"movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* r[1] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,8(%%rdi)\n"
/* c >>= 52 */
"movq %%r10,%%rax\n"
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
/* c += a0 * a2 (last use of %%r10) */
"mulq %%r12\n"
"addq %%rax,%%r8\n"
"movq %%r11,%%rax\n"
"movq %q1,%%r12\n" /*Q2 RETURNS*/
"adcq %%rdx,%%r9\n"
/* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
/*"movq %q1,%%r10\n" */
/* c += a1 * a1 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%r13,%%rax\n"
"adcq %%rdx,%%r9\n"
/* d += a3 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c += (d & M) * R */
"movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n"
"movq $0x1000003d10,%%r13\n"
"mulq %%r13\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 (%%rbx only) */
"shrdq $52,%%rcx,%%rbx\n"
/* r[2] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,16(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r14,%%r14\n"
/* c += t3 */
"movq %%rbx,%%rax\n"
"addq %%rsi,%%r8\n" /*RSI = Q1*/
/* c += d * R */
"mulq %%r13\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r14\n"
/* r[3] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r14,%%r8\n"
/* c += t4 (%%r8 only) */
"addq %%r12, %%r8\n"
/* r[4] = c */
"movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1a)
: "D"(r)
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory");
}
#endif
/**********************************************************************
* Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille *
* Distributed under the MIT software license, see the accompanying *
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
**********************************************************************/
/**
* Changelog:
* - March 2013, Diederik Huys: original version
* - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
* - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
*/
#ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
#define _SECP256K1_FIELD_INNER5X52_IMPL_H_
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
/**
* Registers: rdx:rax = multiplication accumulator
* r9:r8 = c
* r15:rcx = d
* r10-r14 = a0-a4
* rbx = b
* rdi = r
* rsi = a / t?
*/
/* xmm0 = q1 xmm6=q2 */
/* This has 17 memory accesses + 17 xmm uses vs. 35 memory accesses and no xmm use */
__asm__ __volatile__(
"push %%rbx\n"
"movq %%rsp, %%xmm1\n"
"movq %%rbp, %%xmm2\n"
"movq %%rdi, %%xmm3\n"
"movq 0(%%rbx),%%rdi\n"
"movq 8(%%rbx),%%rbp\n"
"movq 16(%%rbx),%%rsp\n"
"movq %%rdi,%%xmm4\n"
"movq 24(%%rsi),%%r13\n"
"movq %%rdi,%%rax\n"
"movq 32(%%rsi),%%r14\n"
/* d += a3 * b0 */
"mulq %%r13\n"
"movq 0(%%rsi),%%r10\n"
"movq %%rax,%%r9\n"
"movq 8(%%rsi),%%r11\n"
"movq 16(%%rsi),%%r12\n"
"movq %%rbp,%%rax\n"
"movq %%rdx,%%rsi\n"
/* d += a2 * b1 */
"mulq %%r12\n"
"movq 24(%%rbx),%%rcx\n"
"movq 32(%%rbx),%%rbx\n"
"addq %%rax,%%r9\n"
"movq %%rsp,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b2 */
"mulq %%r11\n"
"addq %%rax,%%r9\n"
"movq %%rcx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d = a0 * b3 */
"mulq %%r10\n"
"addq %%rax,%%r9\n"
"movq %%rbx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* c = a4 * b4 */
"mulq %%r14\n"
"movq $0xfffffffffffff,%%r15\n"
"movq %%rax,%%r8\n"
/* d += (c & M) * R */
"andq %%r15,%%rax\n"
"shrdq $52,%%rdx,%%r8\n" /* c >>= 52 (%%r8 only) */
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* t3 (tmp1) = d & M */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,%%xmm0\n"
/* d >>= 52 */
"movq %%rdi,%%rax\n"
"shrdq $52,%%rsi,%%r9\n"
"xor %%esi,%%esi\n"
/* d += a4 * b0 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq %%rbp,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b1 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq %%rsp,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b2 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"movq %%rcx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b3 */
"mulq %%r11\n"
"addq %%rax,%%r9\n"
"movq %%rbx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a0 * b4 */
"mulq %%r10\n"
"addq %%rax,%%r9\n"
/* d += c * R */
"movq $0x1000003d10,%%rax\n"
"adcq %%rdx,%%rsi\n"
"mulq %%r8\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* t4 = d & M (%%r15) */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
/* d >>= 52 */
"shrdq $52,%%rsi,%%r9\n"
"xor %%esi,%%esi\n"
/* tx = t4 >> 48 (tmp3) */
"movq %%rax,%%r15\n"
"shrq $48,%%r15\n" /*Q3*/
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"movq %%rax,%%xmm6\n"
/*"movq %q2,%%r15\n" */
"movq %%rdi,%%rax\n"
/* c = a0 * b0 */
"mulq %%r10\n"
"movq %%rcx,%%xmm5\n"
"movq %%rax,%%r8\n"
"movq %%rbp,%%rax\n"
"movq %%rdx,%%rcx\n"
/* d += a4 * b1 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq %%rsp,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b2 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq %%xmm5,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b3 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"movq %%rbx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b4 */
"mulq %%r11\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
"movq %%r15,%%rax\n" /*Q3 transfered*/
/* u0 = d & M (%%r15) */
"movq %%r9,%%rdx\n"
"shrdq $52,%%rsi,%%r9\n"
"movq $0xfffffffffffff,%%r15\n"
"xor %%esi, %%esi\n"
"andq %%r15,%%rdx\n"
/* d >>= 52 */
/* u0 = (u0 << 4) | tx (%%r15) */
"shlq $4,%%rdx\n"
"orq %%rax,%%rdx\n"
/* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* r[0] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,%%rdx\n"
/* c >>= 52 */
"movq %%rdi,%%rax\n"
"movq %%xmm3, %%rdi\n"
"shrdq $52,%%rcx,%%r8\n"
"xor %%ecx,%%ecx\n"
"movq %%rdx,0(%%rdi)\n"
/* c += a1 * b0 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%rbp,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a0 * b1 */
"mulq %%r10\n"
"addq %%rax,%%r8\n"
"movq %%rsp,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a4 * b2 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq %%xmm5,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b3 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq %%rbx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b4 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* c += (d & M) * R */
"movq %%r9,%%rax\n"
"movq $0x1000003d10,%%rdx\n"
"andq %%r15,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* d >>= 52 */
"shrdq $52,%%rsi,%%r9\n"
/* r[1] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,8(%%rdi)\n"
/* c >>= 52 */
"movq %%xmm4,%%rax\n"
"shrdq $52,%%rcx,%%r8\n"
"xor %%ecx,%%ecx\n"
/* c += a2 * b0 */
"mulq %%r12\n"
"addq %%rax,%%r8\n"
"movq %%rbp,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a1 * b1 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%rsp,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a0 * b2 (last use of %%r10 = a0) */
"mulq %%r10\n"
"addq %%rax,%%r8\n"
/* fetch t3 (%%r10, overwrites a0), t4 (%%r15) */
"movq %%xmm5,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a4 * b3 */
"mulq %%r14\n"
"movq %%xmm0,%%r10\n"
"xor %%esi, %%esi\n"
"addq %%rax,%%r9\n"
"movq %%rbx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b4 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq $0x1000003d10,%%rbx\n"
"adcq %%rdx,%%rsi\n"
/* c += (d & M) * R */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
"mulq %%rbx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* d >>= 52 (%%r9 only) */
"shrdq $52,%%rsi,%%r9\n"
/* r[2] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,16(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%rcx,%%r8\n"
/* c += t3 */
"movq %%r9,%%rax\n"
"addq %%r10,%%r8\n"
"xor %%ecx,%%ecx\n"
/* c += d * R */
"mulq %%rbx\n"
"movq %%xmm1, %%rsp\n"
"movq %%xmm2, %%rbp\n"
"movq %%xmm6,%%rsi\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* r[3] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%rcx,%%r8\n"
/* c += t4 (%%r8 only) */
"addq %%rsi,%%r8\n"
/* r[4] = c */
"movq %%r8,32(%%rdi)\n"
"pop %%rbx\n"
: "+S"(a)
: "b"(b), "D"(r)
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory");
}
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
/**
* Registers: rdx:rax = multiplication accumulator
* r9:r8 = c
* rcx:rbx = d
* r10-r14 = a0-a4
* r15 = M (0xfffffffffffff)
* rdi = r
* rsi = a / t?
*/
/* tmp1a = xmm0 */
__asm__ __volatile__(
"movq %%rsp, %%xmm1\n"
"movq %%rbp, %%xmm2\n"
"movq 0(%%rsi),%%r10\n"
"movq 8(%%rsi),%%r11\n"
"movq 16(%%rsi),%%r12\n"
"movq 24(%%rsi),%%r13\n"
"movq 32(%%rsi),%%r14\n"
"leaq (%%r10,%%r10,1),%%rax\n"
"movq $0xfffffffffffff,%%r15\n"
/* d = (a0*2) * a3 */
"mulq %%r13\n"
"movq %%rax,%%rbx\n"
"leaq (%%r11,%%r11,1),%%rax\n"
"movq %%rdx,%%rcx\n"
/* d += (a1*2) * a2 */
"mulq %%r12\n"
"addq %%rax,%%rbx\n"
"movq %%r14,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c = a4 * a4 */
"mulq %%r14\n"
"movq %%rax,%%r8\n"
"movq %%rdx,%%r9\n"
/* d += (c & M) * R */
"movq $0x1000003d10,%%rsp\n"
"andq %%r15,%%rax\n"
"mulq %%rsp\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n"
/* t3 (tmp1) = d & M */
"movq %%rbx,%%rsi\n"
"andq %%r15,%%rsi\n" /*Q1 OUT*/
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
/* a4 *= 2 */
"movq %%r10,%%rax\n"
"addq %%r14,%%r14\n"
/* d += a0 * a4 */
"mulq %%r14\n"
"xor %%ecx,%%ecx\n"
"addq %%rax,%%rbx\n"
"leaq (%%r11,%%r11,1),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d+= (a1*2) * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"movq %%r12,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a2 * a2 */
"mulq %%r12\n"
"addq %%rax,%%rbx\n"
/* d += c * R */
"movq %%r8,%%rax\n"
"adcq %%rdx,%%rcx\n"
"mulq %%rsp\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* t4 = d & M (%%rsi) */
"movq %%rbx,%%rdx\n"
"andq %%r15,%%rdx\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* tx = t4 >> 48 (tmp3) */
"movq %%rdx,%%rbp\n"
"shrq $48,%%rbp\n" /*Q3 OUT*/
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rax\n"
"andq %%rax,%%rdx\n"
"movq %%rdx,%%xmm0\n"/*Q2 OUT*/
/* c = a0 * a0 */
"movq %%r10,%%rax\n"
"mulq %%r10\n"
"movq %%rax,%%r8\n"
"movq %%r11,%%rax\n"
"movq %%rdx,%%r9\n"
/* d += a1 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"leaq (%%r12,%%r12,1),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += (a2*2) * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* u0 = d & M (%%rsi) */
"movq %%rbx,%%rdx\n"
"movq %%r15,%%rax\n"
"andq %%rax,%%rdx\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* u0 = (u0 << 4) | tx (%%rsi) */
"shlq $4,%%rdx\n"
"orq %%rbp,%%rdx\n" /*Q3 RETURNS*/
/* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* r[0] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,0(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
/* a0 *= 2 */
"addq %%r10,%%r10\n"
/* c += a0 * a1 */
"movq %%r10,%%rax\n"
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%r12,%%rax\n"
"adcq %%rdx,%%r9\n"
/* d += a2 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"movq %%r13,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a3 * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c += (d & M) * R */
"movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n"
"mulq %%rsp\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* r[1] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,8(%%rdi)\n"
/* c >>= 52 */
"movq %%r10,%%rax\n"
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
/* c += a0 * a2 (last use of %%r10) */
"mulq %%r12\n"
"addq %%rax,%%r8\n"
"movq %%r11,%%rax\n"
"movq %%xmm0,%%r12\n" /*Q2 RETURNS*/
"adcq %%rdx,%%r9\n"
/* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
/*"movq %q1,%%r10\n" */
/* c += a1 * a1 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%r13,%%rax\n"
"adcq %%rdx,%%r9\n"
/* d += a3 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c += (d & M) * R */
"movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n"
"mulq %%rsp\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 (%%rbx only) */
"shrdq $52,%%rcx,%%rbx\n"
/* r[2] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,16(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r14,%%r14\n"
/* c += t3 */
"movq %%rbx,%%rax\n"
"addq %%rsi,%%r8\n" /*RSI = Q1 RETURNS*/
/* c += d * R */
"mulq %%rsp\n"
"movq %%xmm1, %%rsp\n"
"movq %%xmm2, %%rbp\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r14\n"
/* r[3] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r14,%%r8\n"
/* c += t4 (%%r8 only) */
"addq %%r12, %%r8\n"
/* r[4] = c */
"movq %%r8,32(%%rdi)\n"
: "+S"(a)
: "D"(r)
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory");
}
#endif
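The trick in this second version (the "17 mem accesses + 17 xmm uses vs 35" comment) is to park values in idle SSE registers with movq instead of spilling them to the stack: rsp, rbp and rdi go into xmm1/xmm2/xmm3 at the top and come back at the end, which frees three extra general-purpose registers to hold b's limbs. A minimal, hedged illustration of just the spill/restore pattern (standalone demo, not library code):

#include <stdint.h>
#include <stdio.h>

/* Round-trip a value through xmm0 with no memory traffic. */
static uint64_t roundtrip_via_xmm(uint64_t x) {
    uint64_t y;
    __asm__ __volatile__(
        "movq %1, %%xmm0\n"   /* spill: GPR -> XMM */
        "movq %%xmm0, %0\n"   /* restore: XMM -> GPR */
        : "=r"(y)
        : "r"(x)
        : "xmm0");
    return y;
}

int main(void) {
    printf("%llu\n", (unsigned long long)roundtrip_via_xmm(42));
    return 0;
}

One caveat worth stating: while rsp is parked in xmm1 the thread has no usable stack pointer, so anything that interrupts the block and pushes onto the stack (a signal handler, for instance) will corrupt memory. That fragility is a large part of why repurposing rsp like this is hard to justify upstream even when it benches faster.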
/** If flag is true, set *r equal to *a; otherwise leave it. Constant-time. */
static void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag);
field_get_b32: min 0.647us / avg 0.666us / max 0.751us
field_set_b32: min 0.551us / avg 0.571us / max 0.624us
becomes
field_get_b32: min 0us / avg 0.0000000477us / max 0.000000238us
field_set_b32: min 0us / avg 0.0000000238us / max 0.000000238us
Times for tests:
gcc version 6.3.1 20170109 (GCC)
1) CFLAGS -g -O2
real 0m14.365s
user 0m14.357s
sys 0m0.007s
2) CFLAGS -O3 -march=skylake
real 0m13.549s
user 0m13.547s
sys 0m0.000s
3) CFLAGS -O3 -march=skylake & USE_ENDOMORPHISM 1
real 0m15.660s
user 0m15.660s
sys 0m0.000s
4) CFLAGS -g -O2 & USE_ENDOMORPHISM 1
real 0m16.139s
user 0m16.137s
sys 0m0.000s
5) CFLAGS -g -O2 & undef USE_ASM_X86_64
real 0m14.849s
user 0m14.847s
sys 0m0.000s
6) CFLAGS -O3 -march=skylake & undef USE_ASM_X86_64
real 0m14.520s
user 0m14.517s
sys 0m0.000s
r->n[0] = (uint64_t)a[31]
        | (uint64_t)a[30] << 8
        | (uint64_t)a[29] << 16
        | (uint64_t)a[28] << 24
        | (uint64_t)a[27] << 32
        | (uint64_t)a[26] << 40
        | (uint64_t)(a[25] & 0xF) << 48;
r->n[1] = (uint64_t)((a[25] >> 4) & 0xF)
        | (uint64_t)a[24] << 4
        | (uint64_t)a[23] << 12
        | (uint64_t)a[22] << 20
        | (uint64_t)a[21] << 28
        | (uint64_t)a[20] << 36
        | (uint64_t)a[19] << 44;
r->n[2] = (uint64_t)a[18]
        | (uint64_t)a[17] << 8
        | (uint64_t)a[16] << 16
        | (uint64_t)a[15] << 24
        | (uint64_t)a[14] << 32
        | (uint64_t)a[13] << 40
        | (uint64_t)(a[12] & 0xF) << 48;
r->n[3] = (uint64_t)((a[12] >> 4) & 0xF)
        | (uint64_t)a[11] << 4
        | (uint64_t)a[10] << 12
        | (uint64_t)a[9] << 20
        | (uint64_t)a[8] << 28
        | (uint64_t)a[7] << 36
        | (uint64_t)a[6] << 44;
r->n[4] = (uint64_t)a[5]
        | (uint64_t)a[4] << 8
        | (uint64_t)a[3] << 16
        | (uint64_t)a[2] << 24
        | (uint64_t)a[1] << 32
        | (uint64_t)a[0] << 40;
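A quick sanity check of that packing (hypothetical test, not library code): the input bytes are big-endian, n[0] holds the low 52 bits and n[4] the top 48, so 4*52 + 48 = 256 bits total. The loop below builds the same limbs generically and spot-checks them against the unrolled expressions above.

#include <assert.h>
#include <stdint.h>

/* Pack 32 big-endian bytes into five 52-bit limbs (same layout as above). */
static void set_b32(uint64_t n[5], const uint8_t a[32]) {
    int i, k;
    for (k = 0; k < 5; k++) n[k] = 0;
    for (i = 0; i < 256; i += 8) {          /* i = bit position of byte a[31 - i/8] */
        uint64_t v = a[31 - i / 8];
        k = i / 52;
        n[k] |= (v << (i % 52)) & 0xFFFFFFFFFFFFFULL;
        if (i % 52 > 44) {                  /* byte straddles a limb boundary */
            n[k + 1] |= v >> (52 - i % 52);
        }
    }
}

int main(void) {
    uint8_t a[32];
    uint64_t n[5];
    int i;
    for (i = 0; i < 32; i++) a[i] = (uint8_t)(i + 1);
    set_b32(n, a);
    assert(n[0] == ((uint64_t)a[31] | (uint64_t)a[30] << 8 | (uint64_t)a[29] << 16
                    | (uint64_t)a[28] << 24 | (uint64_t)a[27] << 32
                    | (uint64_t)a[26] << 40 | (uint64_t)(a[25] & 0xF) << 48));
    assert(n[4] == ((uint64_t)a[5] | (uint64_t)a[4] << 8 | (uint64_t)a[3] << 16
                    | (uint64_t)a[2] << 24 | (uint64_t)a[1] << 32
                    | (uint64_t)a[0] << 40));
    return 0;
}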