While we're waiting for the RTX 5090, here's a really fast kangaroo jumper for 64-bit CPUs.
This is 100% working code — I'm using it to test that my CUDA kernel jumps correctly. I needed it to be as fast as possible so I don't grow old waiting for results to validate.
This uses libsecp256k1 internal headers with inlined field and group basic operations, and does batched addition with non-dependent tree inversion loops (translation: a good compiler will use unrolling, SIMD and other CPU instructions to speed things up).
Throughput is around 15–20 million group operations per second per thread on a high-end Intel CPU.
Compile with "-march=native" for best results.
No, this is not a fully-working puzzle breaker. You need to use your brain to add DP logic, saving, and collision checks. This is just the lowest-level detail: a very fast CPU kangaroo jumper for secp256k1.
This function also assumes that a jumped kangaroo can never land on a point in the set of jump points, nor on the opposite of one. My Kangaroo algorithm guarantees this by design, so point-doubling and point-at-infinity logic is not needed at all.
#include "field_impl.h" // field operations
#include "group_impl.h" // group operations
/* Thin wrappers over libsecp256k1's internal field primitives so the hot
 * loops below read like plain field arithmetic. The *_inner variants work
 * directly on the limb arrays (.n); the LOW_52 jump index below suggests
 * the 5x52 field representation — confirm against the configured backend. */
#define FE_INV(r, x) secp256k1_fe_impl_inv_var(&(r), &(x))                  // r = 1/x, variable time
#define FE_MUL(r, a, b) secp256k1_fe_mul_inner((r).n, (a).n, (b).n)         // r = a * b
#define FE_SQR(r, x) secp256k1_fe_sqr_inner((r).n, (x).n)                   // r = x^2
#define FE_ADD(r, d) secp256k1_fe_impl_add(&(r), &(d))                      // r += d
#define FE_NEG(r, a, m) secp256k1_fe_impl_negate_unchecked(&(r), &(a), (m)) // r = -a; m = input magnitude bound
/*
 * Advance a batch of kangaroos by one jump each using batched affine point
 * addition: all per-point modular inversions are replaced by ONE field
 * inversion plus a product tree of multiplications.
 *
 * ge         in/out: batch_size affine points; ge[i] becomes ge[i] + jp[jIdx],
 *            where jIdx is derived from ge[i].x.
 * jp         table of NUM_JUMP_POINTS precomputed jump points.
 * xz         scratch: product tree of the (x1 - x2) denominators — batch_size
 *            leaves followed by batch_size-1 internal nodes (2*batch_size - 1
 *            field elements total).
 * xzOut      scratch of the same size: receives the per-node inverses.
 * batch_size number of points to jump.
 *
 * Assumes (see the note above) that a kangaroo never equals a jump point or
 * its negation, so the doubling and point-at-infinity cases cannot occur.
 * The "m = ..." trailing comments track field-element magnitude bounds.
 */
static
void jump_batch(
secp256k1_ge * ge,
const secp256k1_ge * jp,
secp256k1_fe * xz, // product tree leafs + parent nodes
secp256k1_fe * xzOut,
U32 batch_size
) {
secp256k1_fe t1, t2, t3;
int64_t i;
/* Pass 1: select each point's jump target from the low bits of its x
 * coordinate and store the denominator x1 - x2 as a product-tree leaf. */
for (i = 0; i < batch_size; i++) {
uint8_t jIdx;
#if JUMP_FUNC == JUMP_FUNC_LOW_52
jIdx = ge[i].x.n[0] % NUM_JUMP_POINTS;
#elif JUMP_FUNC == JUMP_FUNC_LOW_64
jIdx = (ge[i].x.n[0] | (ge[i].x.n[1] << 52)) % NUM_JUMP_POINTS;
#endif
xz[i] = ge[i].x;
FE_NEG(t1, jp[jIdx].x, 1);
FE_ADD(xz[i], t1); // XZ[i] = x1 - x2
}
/* Build the internal tree nodes: node batch_size+i is the product of its
 * children 2i and 2i+1; node 2*batch_size-2 ends up as the root (product
 * of all leaves). Iterations are independent, so the compiler can unroll
 * and vectorize. */
for (i = 0; i < batch_size - 1; i++) {
FE_MUL(xz[batch_size + i], xz[i * 2], xz[i * 2 + 1]);
}
/* The single field inversion for the whole batch: invert the root. */
FE_INV(xzOut[batch_size * 2 - 2], xz[2 * batch_size - 2]);
/* Walk back down: a node's inverse times one child's value gives the
 * other child's inverse. The two multiplications per node do not depend
 * on each other. After this, xzOut[i] = 1 / (x1 - x2) for each leaf i. */
for (i = batch_size - 2; i >= 0; i--) {
FE_MUL(xzOut[i * 2], xz[i * 2 + 1], xzOut[batch_size + i]);
FE_MUL(xzOut[i * 2 + 1], xz[i * 2], xzOut[batch_size + i]);
}
/* Pass 2: finish each affine addition using the precomputed inverse.
 * jIdx is recomputed from ge[i].x, which is still unmodified at the top
 * of each iteration (x is only overwritten further down). */
secp256k1_ge * _a = ge;
const secp256k1_fe * _inv = xzOut;
for (i = 0; i < batch_size; i++) {
uint8_t jIdx;
#if JUMP_FUNC == JUMP_FUNC_LOW_52
jIdx = ge[i].x.n[0] % NUM_JUMP_POINTS;
#elif JUMP_FUNC == JUMP_FUNC_LOW_64
jIdx = (ge[i].x.n[0] | (ge[i].x.n[1] << 52)) % NUM_JUMP_POINTS;
#endif
const secp256k1_ge * _b = &jp[jIdx];
FE_NEG(t1, _b->y, 1); // T1 = -y2
FE_ADD(_a->y, t1); // Y1 = y1 - y2 m = max_y + 2(1)
FE_MUL(_a->y, _a->y, *_inv); // Y1 = m = (y1 - y2) / (x1 - x2) m = 1
FE_SQR(t2, _a->y); // T2 = m**2 m = 1
FE_NEG(t3, _b->x, 1); // T3 = -x2
FE_ADD(t2, t3); // T2 = m**2 - x2 m = 1 + 2(1) = 3(2)
FE_NEG(_a->x, _a->x, 1); // X1 = -x1 m = max_x + 1
FE_ADD(_a->x, t2); // X1 = x3 = m**2 - x1 - x2 max_x = 3 + max_x + 1
secp256k1_fe_normalize_weak(&_a->x);
FE_NEG(t2, _a->x, 1); // T2 = -x3 m = 1 + 1 = 2
FE_ADD(t2, _b->x); // T2 = x2 - x3 m = 2 + 1 = 3
FE_MUL(_a->y, _a->y, t2); // Y1 = m * (x2 - x3) m = 1
FE_ADD(_a->y, t1); // Y1 = y3 = m * (x2 - x3) - y2 m = 1 + 2 = 3
secp256k1_fe_normalize_weak(&_a->y);
++_a;
++_inv;
}
}
This is easy to parallelize, so let's add a wrapper that jumps a specific buffer of kangaroos:
/*
 * Run num_jumps sequential jump iterations over a buffer of batch_size
 * kangaroos. Allocates the two scratch product trees that jump_batch()
 * needs and frees them when done.
 *
 * ge         in/out: batch_size kangaroo points, jumped in place.
 * jp         jump-point table (NUM_JUMP_POINTS entries).
 * batch_size number of kangaroos in this buffer.
 * num_jumps  how many times to jump the whole buffer.
 *
 * On allocation failure the function returns without jumping (silent, as
 * in the original contract — callers see unmoved points).
 */
static
void computeBatchJump(
secp256k1_ge * ge,
const secp256k1_ge * jp,
U32 batch_size,
U32 num_jumps
) {
// Widen to size_t BEFORE multiplying so a large batch_size cannot wrap
// in 32-bit arithmetic and under-allocate the tree.
size_t tree_sz = ((size_t) batch_size * 2 - 1) * sizeof(secp256k1_fe);
secp256k1_fe * xz_1 = malloc(tree_sz);
if (NULL == xz_1) return;
secp256k1_fe * xz_2 = malloc(tree_sz);
if (NULL == xz_2) {
free(xz_1); // the original leaked xz_1 on this path
return;
}
for (uint32_t loop = 0; loop < num_jumps; loop++) {
jump_batch(ge, jp, xz_1, xz_2, batch_size);
}
free(xz_1);
free(xz_2);
}
And now, once you have a really big buffer of kangaroos, you can run the jumps on all of your physical cores:
#define JUMPS_PER_STAGE 32768
secp256k1_ge * secp_ge = malloc(numElements * sizeof(secp256k1_ge));
secp256k1_ge * secp_jp = malloc(NUM_JUMP_POINTS * sizeof(secp256k1_ge));
// init the jump points, init the kangaroos to your needs
// ...
int numLaunches = 1; // extra multiplier for the total number of jumps
int numThr = omp_get_max_threads();
// use the max amount of threads that exactly divides the number of items
while (numThr > 0 && numElements % numThr) numThr--;
U64 gePerPart = numElements / numThr;
printf("\tThreads: %u; elements/thread: %lu\n", numThr, gePerPart);
double ompStartTime = omp_get_wtime();
for (U32 launchIdx = 0; launchIdx < numLaunches; launchIdx++) {
#pragma omp parallel for
for (U32 tIdx = 0; tIdx < numThr; tIdx++) {
U64 offset = tIdx * gePerPart;
secp256k1_ge * localGE = secp_ge + offset;
computeBatchJump(localGE, secp_jp, gePerPart, JUMPS_PER_STAGE);
}
}
double ompEndTime = omp_get_wtime();
elapsedTime = ompEndTime - ompStartTime;
speed = (double) totalCount / elapsedTime;
Good luck.