I recently converted my entire ETH farm (about 80x RTX 3080) into a puzzle-hunting farm. During this process, I explored various online tools, including KeyHuntCuda, RotorCuda, etc.
Most of these tools are based on the VanitySearch code by Jean-Luc Pons.
Since my focus is on targeting a single address (without a known public key), both sequentially and with random keys,
I decided to rewrite the ComputeKeysSEARCH_MODE_SA function from GPUCompute.h.
With just one RTX 3080, which typically achieves around 2000 Mkeys/s, I managed to reach 2500 Mkeys/s with these modifications.
Moreover, with the original version there is a significant drop in performance whenever -rkey starts a new random cycle. With this version the drop is minimal, and the speed recovers quickly when a new cycle begins.
Below is a brief summary of the changes I made:
- Additional Synchronization (__syncthreads()) to ensure consistency between threads.
- Loop Unrolling to improve loop efficiency.
- Local Caching of values (Gx, Gy, sx, sy) to reduce global memory latency.
- Combined Calculation of P + i*G and P - i*G to reduce code duplication.
- Prefetching values from global memory into local registers for better performance.
- Optimized Multiplications using __umul64hi for faster 64-bit partial products (see the sketch after this list).
- Optimized Matching Check (CHECK_HASH_SEARCH_MODE_SA) to reduce latency between calculations and checks.
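For context on the __umul64hi point above, here is a minimal sketch (my own illustration, not code taken from the kernel; the helper name mul64x64_128 is hypothetical) of the 64x64 -> 128-bit multiply that __umul64hi provides. Field routines such as _ModMult/_ModSqr are built from partial products of exactly this shape:

// Illustrative helper (hypothetical name, not from the original sources):
// one 64x64 -> 128-bit partial product, the basic step of a 256-bit field multiply.
__device__ __forceinline__ void mul64x64_128(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi)
{
    lo = a * b;            // low 64 bits of the product
    hi = __umul64hi(a, b); // high 64 bits of the product
}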
So I wanted to share the modified ComputeKeysSEARCH_MODE_SA with all of you; the code is below.
Waiting for your feedback, cheers!
uint32_t* hash160, uint32_t maxFound, uint32_t* out)
{
uint64_t dx[GRP_SIZE / 2 + 1][4];
uint64_t px[4];
uint64_t py[4];
uint64_t pyn[4];
uint64_t sx[4];
uint64_t sy[4];
uint64_t dy[4];
uint64_t _s[4];
uint64_t _p2[4];
__syncthreads();
// Load starting key
Load256A(sx, startx);
Load256A(sy, starty);
Load256(px, sx);
Load256(py, sy);
__syncthreads();
// Fill group with delta x
uint32_t i;
for (i = 0; i < HSIZE; i++) {
ModSub256(dx[i], Gx + 4 * i, sx);
}
ModSub256(dx[i], Gx + 4 * i, sx); // For the first point
ModSub256(dx[i + 1], _2Gnx, sx); // For the next center point
__syncthreads();
// Compute modular inverse
_ModInvGrouped(dx);
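// _ModInvGrouped inverts every dx entry with a single batch (grouped) inversion,
// so only one expensive modular inverse is needed for the whole group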
// We use the fact that P + i*G and P - i*G has the same deltax, so the same inverse
// We compute key in the positive and negative way from the center of the group
__syncthreads();
// Check starting point
CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2);
ModNeg256(pyn, py); // pyn = -py, reused below for every P - i*G computation
__syncthreads();
// Loop unrolling and variable caching for optimization
#pragma unroll
for (i = 0; i < HSIZE; i++) {
// Pre-fetching Gx and Gy values to reduce memory latency
uint64_t gx_local[4], gy_local[4];
Load256(gx_local, Gx + 4 * i);
Load256(gy_local, Gy + 4 * i);
// Pre-fetching starting key values to reuse in both P + i*G and P - i*G calculations
uint64_t px_local[4], py_local[4];
Load256(px_local, sx);
Load256(py_local, sy);
// Compute P + i*G and P - i*G within a single iteration
uint64_t dy_local[4], _s_local[4], _p2_local[4];
for (int sign = 1; sign >= -1; sign -= 2) {
Load256(px, px_local);
Load256(py, py_local);
if (sign == 1) {
ModSub256(dy_local, gy_local, py); // P + i*G
} else {
ModSub256(dy_local, pyn, gy_local); // P - i*G
}
_ModMult(_s_local, dy_local, dx[i]); // s = (p2.y - p1.y) * inverse(p2.x - p1.x), full 256-bit field multiply
_ModSqr(_p2_local, _s_local); // _p2 = pow2(s)
ModSub256(px, _p2_local, px);
ModSub256(px, gx_local); // px = pow2(s) - p1.x - p2.x;
if (sign == 1) {
ModSub256(py, gx_local, px);
_ModMult(py, _s_local); // py = s * (p2.x - ret.x)
ModSub256(py, gy_local); // py = - p2.y - s * (ret.x - p2.x);
}
else {
ModSub256(py, px, gx_local);
_ModMult(py, _s_local); // py = s * (ret.x - p2.x)
ModSub256(py, gy_local, py); // py = Gy - s * (ret.x - p2.x) (the added point here is (Gx, -Gy))
}
if (sign == 1) {
CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 + (i + 1));
} else {
CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 - (i + 1));
}
}
}