/**********************************************************************
* Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille *
* Distributed under the MIT software license, see the accompanying *
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
**********************************************************************/
/**
* Changelog:
* - March 2013, Diederik Huys: original version
* - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
* - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
*/
#ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
#define _SECP256K1_FIELD_INNER5X52_IMPL_H_
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
/**
 * Registers: rdx:rax = multiplication accumulator
 * rcx:r8 = c
 * rsi:r9 = d
 * r10-r14 = a0-a4
 * r15 = M (0xfffffffffffff), briefly reused for tx
 * rbx = b
 * rdi = r
 * rsi = a on entry, then the high half of d
 */
uint64_t tmp1, tmp2;
__asm__ __volatile__(
"movq 24(%%rsi),%%r13\n"
"movq 0(%%rbx),%%rax\n"
"movq 32(%%rsi),%%r14\n"
/* d = a3 * b0 */
"mulq %%r13\n"
"movq 0(%%rsi),%%r10\n"
"movq 8(%%rsi),%%r11\n"
"movq %%rax,%%r9\n"
"movq 16(%%rsi),%%r12\n"
"movq 8(%%rbx),%%rax\n"
"movq %%rdx,%%rsi\n"
/* d += a2 * b1 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"movq 16(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b2 */
"mulq %%r11\n"
"movq $0x1000003d10,%%rcx\n"
"movq $0xfffffffffffff,%%r15\n"
"addq %%rax,%%r9\n"
"movq 24(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a0 * b3 */
"mulq %%r10\n"
"addq %%rax,%%r9\n"
"movq 32(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* c = a4 * b4 */
"mulq %%r14\n"
"movq %%rax,%%r8\n"
"shrdq $52,%%rdx,%%r8\n" /* c >>= 52 (%%r8 only) */
/* d += (c & M) * R */
"andq %%r15,%%rax\n"
"mulq %%rcx\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* t3 (tmp1) = d & M */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,%q1\n"
/* d >>= 52 */
"movq 0(%%rbx),%%rax\n"
"shrdq $52,%%rsi,%%r9\n"
"xor %%esi,%%esi\n"
/* d += a4 * b0 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq 8(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b1 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq 16(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b2 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"movq 24(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b3 */
"mulq %%r11\n"
"addq %%rax,%%r9\n"
"movq 32(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a0 * b4 */
"mulq %%r10\n"
"addq %%rax,%%r9\n"
/* d += c * R */
"movq %%rcx,%%rax\n"
"adcq %%rdx,%%rsi\n"
"mulq %%r8\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* t4 = d & M (%%r15) */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
/* d >>= 52 */
"shrdq $52,%%rsi,%%r9\n"
"xor %%esi,%%esi\n"
/* tx = t4 >> 48 (tmp3) */
"movq %%rax,%%r15\n"
"shrq $48,%%r15\n" /*Q3*/
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"movq %%rax,%q2\n"
/*"movq %q2,%%r15\n" */
"movq 0(%%rbx),%%rax\n"
/* c = a0 * b0 */
"mulq %%r10\n"
"movq %%rax,%%r8\n"
"movq 8(%%rbx),%%rax\n"
"movq %%rdx,%%rcx\n"
/* d += a4 * b1 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq 16(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b2 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq 24(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b3 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"movq 32(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b4 */
"mulq %%r11\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
"movq %%r15,%%rax\n" /*Q3 transferred*/
/* u0 = d & M (%%r15) */
"movq %%r9,%%rdx\n"
"shrdq $52,%%rsi,%%r9\n"
"movq $0xfffffffffffff,%%r15\n"
"xor %%esi, %%esi\n"
"andq %%r15,%%rdx\n"
/* d >>= 52 */
/* u0 = (u0 << 4) | tx (%%r15) */
"shlq $4,%%rdx\n"
"orq %%rax,%%rdx\n"
/* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* r[0] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,0(%%rdi)\n"
/* c >>= 52 */
"movq 0(%%rbx),%%rax\n"
"shrdq $52,%%rcx,%%r8\n"
"xor %%ecx,%%ecx\n"
/* c += a1 * b0 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq 8(%%rbx),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a0 * b1 */
"mulq %%r10\n"
"addq %%rax,%%r8\n"
"movq 16(%%rbx),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a4 * b2 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq 24(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b3 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq 32(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b4 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* c += (d & M) * R */
"movq %%r9,%%rax\n"
"movq $0x1000003d10,%%rdx\n"
"andq %%r15,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* d >>= 52 */
"shrdq $52,%%rsi,%%r9\n"
/* r[1] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,8(%%rdi)\n"
/* c >>= 52 */
"movq 0(%%rbx),%%rax\n"
"shrdq $52,%%rcx,%%r8\n"
"xor %%ecx,%%ecx\n"
/* c += a2 * b0 */
"mulq %%r12\n"
"addq %%rax,%%r8\n"
"movq 8(%%rbx),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a1 * b1 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq 16(%%rbx),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a0 * b2 (last use of %%r10 = a0) */
"mulq %%r10\n"
"addq %%rax,%%r8\n"
/* fetch t3 (%%r10, overwrites a0), t4 (%%r15) */
"movq 24(%%rbx),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a4 * b3 */
"mulq %%r14\n"
"movq %q1,%%r10\n"
"xor %%esi, %%esi\n"
"addq %%rax,%%r9\n"
"movq 32(%%rbx),%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b4 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq $0x1000003d10,%%r11\n"
"adcq %%rdx,%%rsi\n"
/* c += (d & M) * R */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* d >>= 52 (%%r9 only) */
"shrdq $52,%%rsi,%%r9\n"
/* r[2] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %q2,%%rsi\n"
"movq %%rax,16(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%rcx,%%r8\n"
/* c += t3 */
"xor %%ecx,%%ecx\n"
"movq %%r9,%%rax\n"
"addq %%r10,%%r8\n"
/* c += d * R */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* r[3] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%rcx,%%r8\n"
/* c += t4 (%%r8 only) */
"addq %%rsi,%%r8\n"
/* r[4] = c */
"movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1), "=m"(tmp2)
: "b"(b), "D"(r)
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory");
}
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
/**
* Registers: rdx:rax = multiplication accumulator
* r9:r8 = c
* rcx:rbx = d
* r10-r14 = a0-a4
* r15 = M (0xfffffffffffff)
* rdi = r
* rsi = a / t?
*/
uint64_t tmp1a;
__asm__ __volatile__(
"movq 0(%%rsi),%%r10\n"
"movq 8(%%rsi),%%r11\n"
"movq 16(%%rsi),%%r12\n"
"movq 24(%%rsi),%%r13\n"
"movq 32(%%rsi),%%r14\n"
"leaq (%%r10,%%r10,1),%%rax\n"
"movq $0xfffffffffffff,%%r15\n"
/* d = (a0*2) * a3 */
"mulq %%r13\n"
"movq %%rax,%%rbx\n"
"leaq (%%r11,%%r11,1),%%rax\n"
"movq %%rdx,%%rcx\n"
/* d += (a1*2) * a2 */
"mulq %%r12\n"
"addq %%rax,%%rbx\n"
"movq %%r14,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c = a4 * a4 */
"mulq %%r14\n"
"movq %%rax,%%r8\n"
"movq %%rdx,%%r9\n"
/* d += (c & M) * R */
"movq $0x1000003d10,%%rdx\n"
"andq %%r15,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n"
/* t3 (tmp1) = d & M */
"movq %%rbx,%%rsi\n"
"andq %%r15,%%rsi\n" /*Q1 became rsi*/
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
/* a4 *= 2 */
"movq %%r10,%%rax\n"
"addq %%r14,%%r14\n"
/* d += a0 * a4 */
"mulq %%r14\n"
"xor %%ecx,%%ecx\n"
"addq %%rax,%%rbx\n"
"leaq (%%r11,%%r11,1),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d+= (a1*2) * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"movq %%r12,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a2 * a2 */
"mulq %%r12\n"
"addq %%rax,%%rbx\n"
/* d += c * R */
"movq %%r8,%%rax\n"
"movq $0x1000003d10,%%r8\n"
"adcq %%rdx,%%rcx\n"
"mulq %%r8\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* t4 = d & M (%%rsi) */
"movq %%rbx,%%rdx\n"
"andq %%r15,%%rdx\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* tx = t4 >> 48 (tmp3) */
"movq %%rdx,%%r15\n"
"shrq $48,%%r15\n" /*Q3=R15*/
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rax\n"
"andq %%rax,%%rdx\n"
"movq %%rdx,%q1\n"/*Q2 OUT - renamed to q1*/
/* c = a0 * a0 */
"movq %%r10,%%rax\n"
"mulq %%r10\n"
"movq %%rax,%%r8\n"
"movq %%r11,%%rax\n"
"movq %%rdx,%%r9\n"
/* d += a1 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"leaq (%%r12,%%r12,1),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += (a2*2) * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* u0 = d & M (%%rsi) */
"movq %%rbx,%%rdx\n"
"movq $0xfffffffffffff,%%rax\n"
"andq %%rax,%%rdx\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* u0 = (u0 << 4) | tx (%%rsi) */
"shlq $4,%%rdx\n"
"orq %%r15,%%rdx\n" /*Q3 - R15 RETURNS*/
/* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n"
"movq $0xfffffffffffff,%%r15\n" /*R15 back in its place*/
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* r[0] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,0(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
/* a0 *= 2 */
"addq %%r10,%%r10\n"
/* c += a0 * a1 */
"movq %%r10,%%rax\n"
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%r12,%%rax\n"
"adcq %%rdx,%%r9\n"
/* d += a2 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"movq %%r13,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a3 * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c += (d & M) * R */
"movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* r[1] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,8(%%rdi)\n"
/* c >>= 52 */
"movq %%r10,%%rax\n"
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
/* c += a0 * a2 (last use of %%r10) */
"mulq %%r12\n"
"addq %%rax,%%r8\n"
"movq %%r11,%%rax\n"
"movq %q1,%%r12\n" /*Q2 RETURNS*/
"adcq %%rdx,%%r9\n"
/* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
/*"movq %q1,%%r10\n" */
/* c += a1 * a1 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%r13,%%rax\n"
"adcq %%rdx,%%r9\n"
/* d += a3 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c += (d & M) * R */
"movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n"
"movq $0x1000003d10,%%r13\n"
"mulq %%r13\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 (%%rbx only) */
"shrdq $52,%%rcx,%%rbx\n"
/* r[2] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,16(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r14,%%r14\n"
/* c += t3 */
"movq %%rbx,%%rax\n"
"addq %%rsi,%%r8\n" /*RSI = Q1*/
/* c += d * R */
"mulq %%r13\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r14\n"
/* r[3] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r14,%%r8\n"
/* c += t4 (%%r8 only) */
"addq %%r12, %%r8\n"
/* r[4] = c */
"movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1a)
: "D"(r)
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory");
}
#endif
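A note on the magic constants, since both implementations lean on them: field elements are five 52-bit limbs (M = 0xfffffffffffff is the limb mask), and because p = 2^256 - 0x1000003d1, any bits at or above 2^256 fold back multiplied by 0x1000003d1. A carry out of bit 260 (= 256 + 4) therefore folds back multiplied by R = 0x1000003d1 << 4 = 0x1000003d10, the constant the asm multiplies c and d by. A minimal plain-C sketch of the fold, for orientation only (the real normalize repeats the pass and finishes with a conditional subtraction):
#include <stdint.h>
/* Reduce a 5x52 value modulo p = 2^256 - 0x1000003d1.
 * Bit 256 of the 260-bit representation is bit 48 of limb 4, so
 * everything above it folds back multiplied by 2^256 mod p. */
static void fe_reduce_sketch(uint64_t n[5]) {
    const uint64_t M = 0xfffffffffffffULL; /* 52-bit limb mask */
    uint64_t x = n[4] >> 48;               /* bits >= 2^256 */
    n[4] &= 0xffffffffffffULL;             /* keep the low 48 bits */
    n[0] += x * 0x1000003d1ULL;            /* fold the excess back in */
    /* propagate carries through the 52-bit limbs */
    n[1] += n[0] >> 52; n[0] &= M;
    n[2] += n[1] >> 52; n[1] &= M;
    n[3] += n[2] >> 52; n[2] &= M;
    n[4] += n[3] >> 52; n[3] &= M;
}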
/**********************************************************************
* Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille *
* Distributed under the MIT software license, see the accompanying *
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
**********************************************************************/
/**
* Changelog:
* - March 2013, Diederik Huys: original version
* - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
* - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
*/
#ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
#define _SECP256K1_FIELD_INNER5X52_IMPL_H_
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
/**
 * Registers: rdx:rax = multiplication accumulator
 * rcx:r8 = c (rcx holds b3 until it is parked in xmm5)
 * rsi:r9 = d
 * r10-r14 = a0-a4
 * r15 = M (0xfffffffffffff), briefly reused for tx
 * rdi/rbp/rsp = b0/b1/b2 (originals saved in xmm3/xmm2/xmm1; rdi is restored to r for the stores)
 * rbx = b on entry, b4 after the loads
 * rsi = a on entry, then the high half of d
 */
/* Spill slots: xmm0 = t3 (q1), xmm4 = b0, xmm5 = b3, xmm6 = t4 (q2) */
/* This has 17 memory accesses + 17 xmm uses vs. 35 memory accesses and no xmm use */
__asm__ __volatile__(
"push %%rbx\n"
"movq %%rsp, %%xmm1\n"
"movq %%rbp, %%xmm2\n"
"movq %%rdi, %%xmm3\n"
"movq 0(%%rbx),%%rdi\n"
"movq 8(%%rbx),%%rbp\n"
"movq 16(%%rbx),%%rsp\n"
"movq %%rdi,%%xmm4\n"
"movq 24(%%rsi),%%r13\n"
"movq %%rdi,%%rax\n"
"movq 32(%%rsi),%%r14\n"
/* d = a3 * b0 */
"mulq %%r13\n"
"movq 0(%%rsi),%%r10\n"
"movq %%rax,%%r9\n"
"movq 8(%%rsi),%%r11\n"
"movq 16(%%rsi),%%r12\n"
"movq %%rbp,%%rax\n"
"movq %%rdx,%%rsi\n"
/* d += a2 * b1 */
"mulq %%r12\n"
"movq 24(%%rbx),%%rcx\n"
"movq 32(%%rbx),%%rbx\n"
"addq %%rax,%%r9\n"
"movq %%rsp,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b2 */
"mulq %%r11\n"
"addq %%rax,%%r9\n"
"movq %%rcx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a0 * b3 */
"mulq %%r10\n"
"addq %%rax,%%r9\n"
"movq %%rbx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* c = a4 * b4 */
"mulq %%r14\n"
"movq $0xfffffffffffff,%%r15\n"
"movq %%rax,%%r8\n"
/* d += (c & M) * R */
"andq %%r15,%%rax\n"
"shrdq $52,%%rdx,%%r8\n" /* c >>= 52 (%%r8 only) */
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* t3 (tmp1) = d & M */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,%%xmm0\n"
/* d >>= 52 */
"movq %%rdi,%%rax\n"
"shrdq $52,%%rsi,%%r9\n"
"xor %%esi,%%esi\n"
/* d += a4 * b0 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq %%rbp,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b1 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq %%rsp,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b2 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"movq %%rcx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b3 */
"mulq %%r11\n"
"addq %%rax,%%r9\n"
"movq %%rbx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a0 * b4 */
"mulq %%r10\n"
"addq %%rax,%%r9\n"
/* d += c * R */
"movq $0x1000003d10,%%rax\n"
"adcq %%rdx,%%rsi\n"
"mulq %%r8\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* t4 = d & M (%%r15) */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
/* d >>= 52 */
"shrdq $52,%%rsi,%%r9\n"
"xor %%esi,%%esi\n"
/* tx = t4 >> 48 (tmp3) */
"movq %%rax,%%r15\n"
"shrq $48,%%r15\n" /*Q3*/
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"movq %%rax,%%xmm6\n"
/*"movq %q2,%%r15\n" */
"movq %%rdi,%%rax\n"
/* c = a0 * b0 */
"mulq %%r10\n"
"movq %%rcx,%%xmm5\n"
"movq %%rax,%%r8\n"
"movq %%rbp,%%rax\n"
"movq %%rdx,%%rcx\n"
/* d += a4 * b1 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq %%rsp,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b2 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq %%xmm5,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b3 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"movq %%rbx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a1 * b4 */
"mulq %%r11\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
"movq %%r15,%%rax\n" /*Q3 transferred*/
/* u0 = d & M (%%r15) */
"movq %%r9,%%rdx\n"
"shrdq $52,%%rsi,%%r9\n"
"movq $0xfffffffffffff,%%r15\n"
"xor %%esi, %%esi\n"
"andq %%r15,%%rdx\n"
/* d >>= 52 */
/* u0 = (u0 << 4) | tx (%%r15) */
"shlq $4,%%rdx\n"
"orq %%rax,%%rdx\n"
/* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* r[0] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,%%rdx\n"
/* c >>= 52 */
"movq %%rdi,%%rax\n"
"movq %%xmm3, %%rdi\n"
"shrdq $52,%%rcx,%%r8\n"
"xor %%ecx,%%ecx\n"
"movq %%rdx,0(%%rdi)\n"
/* c += a1 * b0 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%rbp,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a0 * b1 */
"mulq %%r10\n"
"addq %%rax,%%r8\n"
"movq %%rsp,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a4 * b2 */
"mulq %%r14\n"
"addq %%rax,%%r9\n"
"movq %%xmm5,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b3 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq %%rbx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a2 * b4 */
"mulq %%r12\n"
"addq %%rax,%%r9\n"
"adcq %%rdx,%%rsi\n"
/* c += (d & M) * R */
"movq %%r9,%%rax\n"
"movq $0x1000003d10,%%rdx\n"
"andq %%r15,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* d >>= 52 */
"shrdq $52,%%rsi,%%r9\n"
/* r[1] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,8(%%rdi)\n"
/* c >>= 52 */
"movq %%xmm4,%%rax\n"
"shrdq $52,%%rcx,%%r8\n"
"xor %%ecx,%%ecx\n"
/* c += a2 * b0 */
"mulq %%r12\n"
"addq %%rax,%%r8\n"
"movq %%rbp,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a1 * b1 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%rsp,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c += a0 * b2 (last use of %%r10 = a0) */
"mulq %%r10\n"
"addq %%rax,%%r8\n"
/* fetch t3 (%%r10, overwrites a0), t4 (%%r15) */
"movq %%xmm5,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a4 * b3 */
"mulq %%r14\n"
"movq %%xmm0,%%r10\n"
"xor %%esi, %%esi\n"
"addq %%rax,%%r9\n"
"movq %%rbx,%%rax\n"
"adcq %%rdx,%%rsi\n"
/* d += a3 * b4 */
"mulq %%r13\n"
"addq %%rax,%%r9\n"
"movq $0x1000003d10,%%rbx\n"
"adcq %%rdx,%%rsi\n"
/* c += (d & M) * R */
"movq %%r9,%%rax\n"
"andq %%r15,%%rax\n"
"mulq %%rbx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* d >>= 52 (%%r9 only) */
"shrdq $52,%%rsi,%%r9\n"
/* r[2] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,16(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%rcx,%%r8\n"
/* c += t3 */
"movq %%r9,%%rax\n"
"addq %%r10,%%r8\n"
"xor %%ecx,%%ecx\n"
/* c += d * R */
"mulq %%rbx\n"
"movq %%xmm1, %%rsp\n"
"movq %%xmm2, %%rbp\n"
"movq %%xmm6,%%rsi\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%rcx\n"
/* r[3] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%rcx,%%r8\n"
/* c += t4 (%%r8 only) */
"addq %%rsi,%%r8\n"
/* r[4] = c */
"movq %%r8,32(%%rdi)\n"
"pop %%rbx\n"
: "+S"(a)
: "b"(b), "D"(r)
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "cc", "memory");
}
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
/**
* Registers: rdx:rax = multiplication accumulator
* r9:r8 = c
* rcx:rbx = d
* r10-r14 = a0-a4
* r15 = M (0xfffffffffffff)
* rdi = r
* rsi = a / t?
*/
/* Spill slots: xmm0 = t4 (was tmp1a), xmm1 = saved rsp, xmm2 = saved rbp; rsp is reused for R (0x1000003d10), rbp for tx */
__asm__ __volatile__(
"movq %%rsp, %%xmm1\n"
"movq %%rbp, %%xmm2\n"
"movq 0(%%rsi),%%r10\n"
"movq 8(%%rsi),%%r11\n"
"movq 16(%%rsi),%%r12\n"
"movq 24(%%rsi),%%r13\n"
"movq 32(%%rsi),%%r14\n"
"leaq (%%r10,%%r10,1),%%rax\n"
"movq $0xfffffffffffff,%%r15\n"
/* d = (a0*2) * a3 */
"mulq %%r13\n"
"movq %%rax,%%rbx\n"
"leaq (%%r11,%%r11,1),%%rax\n"
"movq %%rdx,%%rcx\n"
/* d += (a1*2) * a2 */
"mulq %%r12\n"
"addq %%rax,%%rbx\n"
"movq %%r14,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* c = a4 * a4 */
"mulq %%r14\n"
"movq %%rax,%%r8\n"
"movq %%rdx,%%r9\n"
/* d += (c & M) * R */
"movq $0x1000003d10,%%rsp\n"
"andq %%r15,%%rax\n"
"mulq %%rsp\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n"
/* t3 (tmp1) = d & M */
"movq %%rbx,%%rsi\n"
"andq %%r15,%%rsi\n" /*Q1 OUT*/
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
/* a4 *= 2 */
"movq %%r10,%%rax\n"
"addq %%r14,%%r14\n"
/* d += a0 * a4 */
"mulq %%r14\n"
"xor %%ecx,%%ecx\n"
"addq %%rax,%%rbx\n"
"leaq (%%r11,%%r11,1),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d+= (a1*2) * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"movq %%r12,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a2 * a2 */
"mulq %%r12\n"
"addq %%rax,%%rbx\n"
/* d += c * R */
"movq %%r8,%%rax\n"
"adcq %%rdx,%%rcx\n"
"mulq %%rsp\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* t4 = d & M (%%rsi) */
"movq %%rbx,%%rdx\n"
"andq %%r15,%%rdx\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* tx = t4 >> 48 (tmp3) */
"movq %%rdx,%%rbp\n"
"shrq $48,%%rbp\n" /*Q3 OUT*/
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rax\n"
"andq %%rax,%%rdx\n"
"movq %%rdx,%%xmm0\n"/*Q2 OUT*/
/* c = a0 * a0 */
"movq %%r10,%%rax\n"
"mulq %%r10\n"
"movq %%rax,%%r8\n"
"movq %%r11,%%rax\n"
"movq %%rdx,%%r9\n"
/* d += a1 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"leaq (%%r12,%%r12,1),%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += (a2*2) * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* u0 = d & M (%%rsi) */
"movq %%rbx,%%rdx\n"
"movq %%r15,%%rax\n"
"andq %%rax,%%rdx\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* u0 = (u0 << 4) | tx (%%rsi) */
"shlq $4,%%rdx\n"
"orq %%rbp,%%rdx\n" /*Q3 RETURNS*/
/* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* r[0] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,0(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
/* a0 *= 2 */
"addq %%r10,%%r10\n"
/* c += a0 * a1 */
"movq %%r10,%%rax\n"
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%r12,%%rax\n"
"adcq %%rdx,%%r9\n"
/* d += a2 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"movq %%r13,%%rax\n"
"adcq %%rdx,%%rcx\n"
/* d += a3 * a3 */
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c += (d & M) * R */
"movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n"
"mulq %%rsp\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xor %%ecx,%%ecx\n"
/* r[1] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,8(%%rdi)\n"
/* c >>= 52 */
"movq %%r10,%%rax\n"
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
/* c += a0 * a2 (last use of %%r10) */
"mulq %%r12\n"
"addq %%rax,%%r8\n"
"movq %%r11,%%rax\n"
"movq %%xmm0,%%r12\n" /*Q2 RETURNS*/
"adcq %%rdx,%%r9\n"
/* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
/*"movq %q1,%%r10\n" */
/* c += a1 * a1 */
"mulq %%r11\n"
"addq %%rax,%%r8\n"
"movq %%r13,%%rax\n"
"adcq %%rdx,%%r9\n"
/* d += a3 * a4 */
"mulq %%r14\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c += (d & M) * R */
"movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n"
"mulq %%rsp\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 (%%rbx only) */
"shrdq $52,%%rcx,%%rbx\n"
/* r[2] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,16(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r14,%%r14\n"
/* c += t3 */
"movq %%rbx,%%rax\n"
"addq %%rsi,%%r8\n" /*RSI = Q1 RETURNS*/
/* c += d * R */
"mulq %%rsp\n"
"movq %%xmm1, %%rsp\n"
"movq %%xmm2, %%rbp\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r14\n"
/* r[3] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r14,%%r8\n"
/* c += t4 (%%r8 only) */
"addq %%r12, %%r8\n"
/* r[4] = c */
"movq %%r8,32(%%rdi)\n"
: "+S"(a)
: "D"(r)
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "%xmm0", "%xmm1", "%xmm2", "cc", "memory");
}
#endif
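The trick behind the second pair of functions: movq between a general-purpose register and an xmm register touches no memory, so once all fifteen usable GPRs (rsp and rbp included) are claimed, the xmm file acts as a set of free spill slots. A toy round-trip showing the pattern (illustrative, not part of the patch); note that any xmm register used this way has to appear in the asm's clobber list, since the compiler is otherwise free to keep live values there:
#include <stdint.h>
static uint64_t xmm_spill_demo(uint64_t v) {
    uint64_t out;
    __asm__ __volatile__(
        "movq %1,%%xmm7\n" /* GPR -> xmm: spill without touching memory */
        "movq %%xmm7,%0\n" /* xmm -> GPR: reload */
        : "=r"(out)
        : "r"(v)
        : "%xmm7");
    return out;
}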
/** If flag is true, set *r equal to *a; otherwise leave it. Constant-time. */
static void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag);
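For context, a minimal mask-based sketch of such a constant-time move (the upstream implementation is unrolled and differs in detail, but the idea is the same):
#include <stdint.h>
typedef struct { uint64_t n[5]; } fe_sketch; /* minimal stand-in for secp256k1_fe */
static void fe_cmov_sketch(fe_sketch *r, const fe_sketch *a, int flag) {
    /* mask0 is all ones when flag == 0 and all zeros when flag == 1 */
    uint64_t mask0 = (uint64_t)flag + ~((uint64_t)0);
    uint64_t mask1 = ~mask0;
    int i;
    for (i = 0; i < 5; i++) {
        /* branch-free select: no data-dependent branches or memory accesses */
        r->n[i] = (r->n[i] & mask0) | (a->n[i] & mask1);
    }
}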
field_get_b32: min 0.647us / avg 0.666us / max 0.751us
field_set_b32: min 0.551us / avg 0.571us / max 0.624us
becomes
field_get_b32: min 0us / avg 0.0000000477us / max 0.000000238us
field_set_b32: min 0us / avg 0.0000000238us / max 0.000000238us
Times for tests:
gcc version 6.3.1 20170109 (GCC)
1) CFLAGS -g -O2
real 0m14.365s
user 0m14.357s
sys 0m0.007s
2) CFLAGS -O3 -march=skylake
real 0m13.549s
user 0m13.547s
sys 0m0.000s
3) CFLAGS -O3 -march=skylake & USE_ENDOMORPHISM 1
real 0m15.660s
user 0m15.660s
sys 0m0.000s
4) CFLAGS -g -O2 & USE_ENDOMORPHISM 1
real 0m16.139s
user 0m16.137s
sys 0m0.000s
5) CFLAGS -g -O2 & undef USE_ASM_X86_64
real 0m14.849s
user 0m14.847s
sys 0m0.000s
6) CFLAGS -O3 -march=skylake & undef USE_ASM_X86_64
real 0m14.520s
user 0m14.517s
sys 0m0.000s
/* field_set_b32: unpack the 32 big-endian bytes of a into five 52-bit limbs */
r->n[0] = (uint64_t)a[31]
| (uint64_t)a[30] << 8
| (uint64_t)a[29] << 16
| (uint64_t)a[28] << 24
| (uint64_t)a[27] << 32
| (uint64_t)a[26] << 40
| (uint64_t)(a[25] & 0xF) << 48;
r->n[1] = (uint64_t)((a[25] >> 4) & 0xF)
| (uint64_t)a[24] << 4
| (uint64_t)a[23] << 12
| (uint64_t)a[22] << 20
| (uint64_t)a[21] << 28
| (uint64_t)a[20] << 36
| (uint64_t)a[19] << 44;
r->n[2] = (uint64_t)a[18]
| (uint64_t)a[17] << 8
| (uint64_t)a[16] << 16
| (uint64_t)a[15] << 24
| (uint64_t)a[14] << 32
| (uint64_t)a[13] << 40
| (uint64_t)(a[12] & 0xF) << 48;
r->n[3] = (uint64_t)((a[12] >> 4) & 0xF)
| (uint64_t)a[11] << 4
| (uint64_t)a[10] << 12
| (uint64_t)a[9] << 20
| (uint64_t)a[8] << 28
| (uint64_t)a[7] << 36
| (uint64_t)a[6] << 44;
r->n[4] = (uint64_t)a[5]
| (uint64_t)a[4] << 8
| (uint64_t)a[3] << 16
| (uint64_t)a[2] << 24
| (uint64_t)a[1] << 32
| (uint64_t)a[0] << 40;
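The unrolled assignments above implement "limb i holds bits [52*i, 52*i+51] of the 256-bit big-endian input", with limb 4 taking the top 48 bits. A compact (slow) loop computing the same limbs, useful for cross-checking (illustrative, not the code being replaced):
#include <stdint.h>
static void fe_set_b32_reference(uint64_t n[5], const unsigned char a[32]) {
    int i;
    for (i = 0; i < 5; i++) n[i] = 0;
    for (i = 0; i < 256; i++) {
        /* bit i of the big-endian integer lives in byte a[31 - i/8], bit i%8 */
        uint64_t bit = (a[31 - (i >> 3)] >> (i & 7)) & 1;
        n[i / 52] |= bit << (i % 52);
    }
}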