Author

Topic: GPUCompute.h => ComputeKeysSEARCH_MODE_SA Rewrited for better performance (Read 132 times)

?
Activity: -
Merit: -
Using shared Memory is undoubtedly a fairly exponential advantage, but you can't use it that way when you have to do a complete refactoring of all the GPU libraries § (In fact with your code it is not even possible to compile) Anyway, you understood a priori that Shared Memory is the final goal  Grin

I thought you knew how to code, I gave you the basics, if you can't code it to make it work you are an  bad Coder.
?
Activity: -
Merit: -
something like this would be better!

Code:
__device__ void ComputeKeysSEARCH_MODE_SA(uint32_t mode, uint64_t* startx, uint64_t* starty,
    uint32_t* hash160, uint32_t maxFound, uint32_t* out)
{
    const int group_size = GRP_SIZE / 2 + 1;
    __shared__ uint64_t dx[group_size][4];
    __shared__ uint64_t gx_local[HSIZE][4], gy_local[HSIZE][4];
    uint64_t px[4], py[4], pyn[4], sx[4], sy[4], dy[4], _s[4], _p2[4];

    __syncthreads();

    Load256A(sx, startx);
    Load256A(sy, starty);
    Load256(px, sx);
    Load256(py, sy);

    __syncthreads();

    for (uint32_t i = 0; i < HSIZE; ++i) {
        Load256(gx_local[i], Gx + 4 * i);
        Load256(gy_local[i], Gy + 4 * i);
        ModSub256(dx[i], gx_local[i], sx);
    }
    ModSub256(dx[HSIZE], _2Gnx, sx);

    __syncthreads();
    _ModInvGrouped(dx);

    __syncthreads();
    CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2);

    ModNeg256(pyn, py);

    #pragma unroll
    for (uint32_t i = 0; i < HSIZE; ++i) {
uint64_t gx_local_cache[4], gy_local_cache[4];
        Load256(gx_local_cache, Gx + 4 * i);
        Load256(gy_local_cache, Gy + 4 * i);

        uint64_t px_local[4], py_local[4];
        Load256(px_local, sx);
        Load256(py_local, sy);

        uint64_t dy_cache[4], s_cache[4], p2_cache[4];
        for (int sign = 1; sign >= -1; sign -= 2) {
            Load256(px, px_local);
            Load256(py, py_local);

            if (sign == 1) {
                ModSub256(dy_cache, gy_local_cache, py);
            } else {
                ModSub256(dy_cache, pyn, gy_local_cache);
            }

            s_cache[0] = __umul64hi(dy_cache[0], dx[i][0]);
            p2_cache[0] = __umul64hi(s_cache[0], s_cache[0]);

            ModSub256(px, p2_cache, px);
            ModSub256(px, gx_local_cache);

            ModSub256(py, gx_local_cache, px);
            _ModMult(py, s_cache);
            ModSub256(py, gy_local_cache);

            if (sign == 1) {
                CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 + (i + 1));
            } else {
                CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 - (i + 1));
            }
        }
    }
}

Using shared Memory is undoubtedly a fairly exponential advantage, but you can't use it that way when you have to do a complete refactoring of all the GPU libraries § (In fact with your code it is not even possible to compile) Anyway, you understood a priori that Shared Memory is the final goal  Grin

I published this first version because it works. However, I am working on an enhanced version of these libraries using Shared Memory.
?
Activity: -
Merit: -
something like this would be better!

Code:
__device__ void ComputeKeysSEARCH_MODE_SA(uint32_t mode, uint64_t* startx, uint64_t* starty,
    uint32_t* hash160, uint32_t maxFound, uint32_t* out)
{
    const int group_size = GRP_SIZE / 2 + 1;
    __shared__ uint64_t dx[group_size][4];
    __shared__ uint64_t gx_local[HSIZE][4], gy_local[HSIZE][4];
    uint64_t px[4], py[4], pyn[4], sx[4], sy[4], dy[4], _s[4], _p2[4];

    __syncthreads();

    Load256A(sx, startx);
    Load256A(sy, starty);
    Load256(px, sx);
    Load256(py, sy);

    __syncthreads();

    for (uint32_t i = 0; i < HSIZE; ++i) {
        Load256(gx_local[i], Gx + 4 * i);
        Load256(gy_local[i], Gy + 4 * i);
        ModSub256(dx[i], gx_local[i], sx);
    }
    ModSub256(dx[HSIZE], _2Gnx, sx);

    __syncthreads();
    _ModInvGrouped(dx);

    __syncthreads();
    CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2);

    ModNeg256(pyn, py);

    #pragma unroll
    for (uint32_t i = 0; i < HSIZE; ++i) {
uint64_t gx_local_cache[4], gy_local_cache[4];
        Load256(gx_local_cache, Gx + 4 * i);
        Load256(gy_local_cache, Gy + 4 * i);

        uint64_t px_local[4], py_local[4];
        Load256(px_local, sx);
        Load256(py_local, sy);

        uint64_t dy_cache[4], s_cache[4], p2_cache[4];
        for (int sign = 1; sign >= -1; sign -= 2) {
            Load256(px, px_local);
            Load256(py, py_local);

            if (sign == 1) {
                ModSub256(dy_cache, gy_local_cache, py);
            } else {
                ModSub256(dy_cache, pyn, gy_local_cache);
            }

            s_cache[0] = __umul64hi(dy_cache[0], dx[i][0]);
            p2_cache[0] = __umul64hi(s_cache[0], s_cache[0]);

            ModSub256(px, p2_cache, px);
            ModSub256(px, gx_local_cache);

            ModSub256(py, gx_local_cache, px);
            _ModMult(py, s_cache);
            ModSub256(py, gy_local_cache);

            if (sign == 1) {
                CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 + (i + 1));
            } else {
                CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 - (i + 1));
            }
        }
    }
}
?
Activity: -
Merit: -
Hi everyone,

I recently converted my entire ETH farm (about 80x RTX 3080) into a puzzle-hunting farm. During this process, I explored various online tools, including KeyHuntCuda, RotorCuda, etc.

Most of these tools use the Vanity Search libraries by Jean Luc PONS.

Since my focus is on targeting a single address (without a public key), both sequentially and randomly,
I decided to rewrite the ComputeKeysSEARCH_MODE_SA function from CPUCompute.h.

With just one RTX 3080, which typically achieves around 2000M keys, I managed to reach 2500M keys with these modifications.

Moreover, with the original version, when using -rkey and a new random cycle starts, there is a significant drop in performance. However, with this version, the performance drop is minimal, and the speed quickly recovers when a new random cycle begins.

Below is a brief summary of the changes I made:

  • Additional Synchronization (__syncthreads()) to ensure consistency between threads.
  • Loop Unrolling to improve loop efficiency.
  • Local Caching of values (Gx, Gy, sx, sy) to reduce global memory latency.
  • Combined Calculation of P + i*G and P - i*G to reduce code duplication.
  • Prefetching values from global memory into local registers for better performance.
  • Optimized Multiplications using __umul64hi for faster 64-bit calculations.
  • Optimized Matching Check (CHECK_HASH_SEARCH_MODE_SA) to reduce latency between calculations and checks.

So I wanted to share with all of you guys.

Waiting for your feedback cheers

Code:
__device__ void ComputeKeysSEARCH_MODE_SA(uint32_t mode, uint64_t* startx, uint64_t* starty,
uint32_t* hash160, uint32_t maxFound, uint32_t* out)
{

    uint64_t dx[GRP_SIZE / 2 + 1][4];
    uint64_t px[4];
    uint64_t py[4];
    uint64_t pyn[4];
    uint64_t sx[4];
    uint64_t sy[4];
    uint64_t dy[4];
    uint64_t _s[4];
    uint64_t _p2[4];

    __syncthreads();
    // Load starting key
    Load256A(sx, startx);
    Load256A(sy, starty);
    Load256(px, sx);
    Load256(py, sy);

    __syncthreads();
    // Fill group with delta x
    uint32_t i;
    for (i = 0; i < HSIZE; i++) {
        ModSub256(dx[i], Gx + 4 * i, sx);
    }
    ModSub256(dx[i], Gx + 4 * i, sx);   // For the first point
    ModSub256(dx[i + 1], _2Gnx, sx);    // For the next center point

    __syncthreads();
    // Compute modular inverse
    _ModInvGrouped(dx);

    // We use the fact that P + i*G and P - i*G has the same deltax, so the same inverse
    // We compute key in the positive and negative way from the center of the group

    __syncthreads();
    // Check starting point
    CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2);

    ModNeg256(pyn, py);

    __syncthreads();
    // Loop unrolling and variable caching for optimization
    #pragma unroll
    for (i = 0; i < HSIZE; i++) {
        // Pre-fetching Gx and Gy values to reduce memory latency
        uint64_t gx_local[4], gy_local[4];
        Load256(gx_local, Gx + 4 * i);
        Load256(gy_local, Gy + 4 * i);

        // Pre-fetching starting key values to reuse in both P + i*G and P - i*G calculations
        uint64_t px_local[4], py_local[4];
        Load256(px_local, sx);
        Load256(py_local, sy);

        // Compute P + i*G and P - i*G within a single iteration
        uint64_t dy_local[4], _s_local[4], _p2_local[4];
        for (int sign = 1; sign >= -1; sign -= 2) {
            Load256(px, px_local);
            Load256(py, py_local);

            if (sign == 1) {
                ModSub256(dy_local, gy_local, py);  // P + i*G
            } else {
                ModSub256(dy_local, pyn, gy_local); // P - i*G
            }

            _s_local[0] = __umul64hi(dy_local[0], dx[i][0]);    //  s = (p2.y - p1.y) * inverse(p2.x - p1.x)
            _p2_local[0] = __umul64hi(_s_local[0], _s_local[0]);           // _p2 = pow2(s)

            ModSub256(px, _p2_local, px);
            ModSub256(px, gx_local);                // px = pow2(s) - p1.x - p2.x;

            ModSub256(py, gx_local, px);
            _ModMult(py, _s_local);                 // py = - s * (ret.x - p2.x)
            ModSub256(py, gy_local);                // py = - p2.y - s * (ret.x - p2.x);

            if (sign == 1) {
                CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 + (i + 1));
            } else {
                CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 - (i + 1));
            }
        }
    }
Jump to: