
Topic: VanitySearch (Yet another address prefix finder) - page 54. (Read 32966 times)

sr. member
Activity: 462
Merit: 701
Another report from a user using CUDA 8 and gcc 4.8 on a GeForce GTX 460: it works.
sr. member
Activity: 462
Merit: 701
Don't worry, CUDA 8 needs g++ 4.9; that's the problem.

I use g++ 4.8/CUDA 8 with my old Quadro and it works.

About the performance, I think most people use only compressed addresses.

If you do a specific ComputeKeys for only compressed keys (don't compute y at all!):

Yes, you're right. I will make a second kernel optimized for compressed addresses only.
legendary
Activity: 1948
Merit: 2097
Yes, I already did it.

It's driving me crazy.
It works on my 2 configs, and a user on GitHub just posted a report on a GeForce GTX 1080 Ti (ccap=6.1) running Ubuntu 18.04: it works fine (he uses CUDA 10).

Don't worry, CUDA 8 needs g++ 4.9; that's the problem.


About the performance, I think most people use only compressed addresses.

If you do a specific ComputeKeys for only compressed keys (don't compute y at all!):

Code:
    for (uint32_t i = 0; i < HSIZE; i++) {

      // P = StartPoint + i*G
      Load256(px, sx);
      Load256(py, sy);
      ModSub256(dy, Gy[i], py);

      _ModMult(_s, dy, dx[i]);      //  s = (p2.y-p1.y)*inverse(p2.x-p1.x)
      //_ModMult(_p2, _s, _s);        // _p = pow2(s)
      _ModSqr(_p2, _s);

      ModSub256(px, _p2,px);
      ModSub256(px, Gx[i]);         // px = pow2(s) - p1.x - p2.x;
      /*
      ModSub256(py, Gx[i], px);
      _ModMult(py, _s);             // py = - s*(ret.x-p2.x)
      ModSub256(py, Gy[i]);         // py = - p2.y - s*(ret.x-p2.x);  
      */
      CHECK_PREFIX(GRP_SIZE / 2 + (i + 1));
      
      // P = StartPoint - i*G, if (x,y) = i*G then (x,-y) = -i*G
      Load256(px, sx);
      Load256(py, sy);
      //ModNeg256(dy,Gy[i]);
      //ModSub256(dy, py);
      ModSub256(dy, pyn, Gy[i]);

      _ModMult(_s, dy, dx[i]);      //  s = (p2.y-p1.y)*inverse(p2.x-p1.x)
      //_ModMult(_p2, _s, _s);        // _p = pow2(s)
      _ModSqr(_p2, _s);

      ModSub256(px, _p2, px);
      ModSub256(px, Gx[i]);         // px = pow2(s) - p1.x - p2.x;
      /*
      ModSub256(py, Gx[i], px);
      _ModMult(py, _s);             // py = - s*(ret.x-p2.x)
      
      ModAdd256(py, Gy[i]);         // py = - p2.y - s*(ret.x-p2.x);  

      //ModSub256(py, sx, px);
      //_ModMult(py, _s);             // py = - s*(ret.x-p2.x)
      //ModSub256(py, sy);
      */
      CHECK_PREFIX(GRP_SIZE / 2 - (i + 1));

    }
    
    // First point (startP - (GRP_SIZE/2)*G)
    Load256(px, sx);
    Load256(py, sy);
    ModNeg256(dy, Gy[i]);
    ModSub256(dy, py);

    _ModMult(_s, dy, dx[i]);      //  s = (p2.y-p1.y)*inverse(p2.x-p1.x)
    //_ModMult(_p2, _s, _s);        // _p = pow2(s)
    _ModSqr(_p2, _s);

    ModSub256(px, _p2, px);
    ModSub256(px, Gx[i]);         // px = pow2(s) - p1.x - p2.x;
    /*
    ModSub256(py, Gx[i], px);
    _ModMult(py, _s);             // py = - s*(ret.x-p2.x)
    
    ModAdd256(py, Gy[i]);         // py = - p2.y - s*(ret.x-p2.x);  
    */
    CHECK_PREFIX(0);

    i++;

    // Next start point (startP + GRP_SIZE*G)
    Load256(px, sx);
    Load256(py, sy);
    ModSub256(dy, _2Gny, py);

    _ModMult(_s, dy, dx[i]);      //  s = (p2.y-p1.y)*inverse(p2.x-p1.x)
    //_ModMult(_p2, _s, _s);        // _p = pow2(s)
    _ModSqr(_p2, _s);

    ModSub256(px, _p2, px);
    ModSub256(px, _2Gnx);         // px = pow2(s) - p1.x - p2.x;

    ModSub256(py, _2Gnx, px);
    _ModMult(py, _s);             // py = - s*(ret.x-p2.x)
    //_ModSqr(py, _s);
    ModSub256(py, _2Gny);         // py = - p2.y - s*(ret.x-p2.x);  

    Load256(sx, px);
    Load256(sy, py);

  }

  // Update starting point
  __syncthreads();
  Store256A(startx, sx);

you can save time. Then: SHA256("02" + x) and SHA256("03" + x) (without considering the y value at all).

On my system I got about an 8% performance increase.

Obviously, at the end you have to do a double check to know whether the correct private key for the found address is k or n-k, but only for the address that was found.
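
A compact way to see why that final check is cheap (my summary, not the poster's wording): negating a point only flips y, so k and n-k give the same x coordinate,

kG = (x, y)   and   (n-k)*G = -kG = (x, p - y)

and since p is odd, y and p-y have opposite parity. Once an address matches, recomputing kG a single time and checking whether its y is even (prefix byte 02) or odd (prefix byte 03) tells you whether the private key is k or n-k.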
sr. member
Activity: 462
Merit: 701
Yes, I already did it.

It's driving me crazy.
It works on my 2 configs, and a user on GitHub just posted a report on a GeForce GTX 1080 Ti (ccap=6.1) running Ubuntu 18.04: it works fine (he uses CUDA 10).
legendary
Activity: 1948
Merit: 2097
You may have noticed that I changed the Makefile.
Now you should call it like this:

Code:
make gpu=1 ccap=50 all

And also set the right variables:
Code:
CUDA       = /usr/local/cuda-8.0
CXXCUDA    = /usr/bin/g++-4.8

The README is up to date.

Yes, I already did it.
sr. member
Activity: 462
Merit: 701
You may have noticed that I changed the Makefile.
Now you should call it like this:

Code:
make gpu=1 ccap=50 all

And also set the right variables:
Code:
CUDA       = /usr/local/cuda-8.0
CXXCUDA    = /usr/bin/g++-4.8

The README is up to date.
legendary
Activity: 1948
Merit: 2097
Unfortunately all wrong!!!

That's strange. Maybe I introduced another bug.
Does it work if you restore the volatile?


No.
sr. member
Activity: 462
Merit: 701
Unfortunately all wrong!!!

That's strange. Maybe I introduced another bug.
Does it work if you restore the volatile?
legendary
Activity: 1948
Merit: 2097
I removed the volatile again and added "memory" to the clobber list of the inline assembly. This should prevent the compiler from permuting instructions (for pipelining optimization) and losing a carry or getting an unexpected one.

Please test the source on GitHub and tell me if you still get the errors.

This is my last idea...

Unfortunately all wrong!!!

Code:
CPU found 1577 items
GPU: point   correct [0/243]
GPU: endo #1 correct [0/251]
GPU: endo #2 correct [0/268]
GPU: sym/point   correct [0/257]
GPU: sym/endo #1 correct [0/256]
GPU: sym/endo #2 correct [0/302]
GPU/CPU check Failed !
sr. member
Activity: 462
Merit: 701

You can delete:

and delete u0, u1, u2, u3, r0, r1, r2, r3


I committed your mods, removed the unused variables, and changed the squaring a bit: I replaced the reset of variables t1 and t2 by UADD(t1, 0x0ULL, 0x0ULL);. With this it is no longer necessary to reset t1 or t2 to 0; t1 is set from the carry flag.
I also added my reduction, which uses the MADC instruction (multiply and add).
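
A rough sketch of those two tricks in the style of the PTX wrapper macros (my own illustration; the exact macro definitions and operand constraints in the source may differ):

Code:
// addc with two zero operands drops the pending carry flag into t1, so t1
// no longer needs to be cleared to 0 beforehand:
#define UADD(r, a, b)  asm volatile ("addc.u64 %0, %1, %2;" : "=l"(r) : "l"(a), "l"(b))
// UADD(t1, 0x0ULL, 0x0ULL);   // t1 = carry out of the previous add

// madc fuses multiply and add while keeping the carry chain alive, which is
// what the MADC-based reduction relies on (hypothetical wrapper name):
#define MADCLO(r, a, b, c) asm volatile ("madc.lo.cc.u64 %0, %1, %2, %3;" : "=l"(r) : "l"(a), "l"(b), "l"(c))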

You can try both implementations by changing, at GPUEngine.cu:665,
Code:
#if 1
to
Code:
#if 0


I also ported your ModSqr to the CPU release in IntMod.cpp.

On my hardware there is no significant performance increase: the square is ~10% faster than the classic mult, so on the global process there is no measurable gain.

I removed the volatile again and added "memory" to the clobber list of the inline assembly. This should prevent the compiler from permuting instructions (for pipelining optimization) and losing a carry or getting an unexpected one.

Please test the source on GitHub and tell me if you still get the errors.

This is my last idea...
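
For reference, a minimal sketch of what the change looks like on one of the add-with-carry wrappers (hypothetical shape, not the exact source): the "memory" entry in the clobber list tells the compiler the asm may touch memory, so it will not shuffle surrounding statements across the carry-linked blocks.

Code:
#define UADDO1(r, a) asm ("add.cc.u64  %0, %0, %1;" : "+l"(r) : "l"(a) : "memory")
#define UADDC1(r, a) asm ("addc.cc.u64 %0, %0, %1;" : "+l"(r) : "l"(a) : "memory")
#define UADD1(r, a)  asm ("addc.u64    %0, %0, %1;" : "+l"(r) : "l"(a) : "memory")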
sr. member
Activity: 462
Merit: 701
It can be due to a wrong optimization concerning a carry somewhere, which could explain why it works only from time to time.
I had a similar problem with the CPU release when I compiled with gcc 6; gcc 7 or Visual C++ work flawlessly.
The patch (a volatile as well) is at IntMod.cpp:859 and IntMod.cpp:915.
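
As an illustration of that kind of patch (a made-up helper, not the actual code at those lines): routing the partial sum of an add-with-carry through a volatile temporary is enough to stop the compiler from merging the adds and losing the carry test.

Code:
#include <cstdint>

// Hypothetical add-with-carry helper; the volatile local keeps the partial
// sum alive so gcc cannot fold the two carry checks away.
static inline uint64_t AddC64(uint64_t a, uint64_t b, uint64_t &carry) {
  volatile uint64_t s = a + b;   // partial sum
  uint64_t c1 = (s < a);         // carry out of a + b
  uint64_t r  = s + carry;       // add incoming carry (0 or 1)
  carry = c1 | (r < s);          // total carry out
  return r;
}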
sr. member
Activity: 462
Merit: 701
You can also try VanitySearch -u -check
It will perform the check using uncompressed addresses and so use the CheckHashUncomp() function, which is similar except that it calls GetHash160() instead of GetHash160Comp().
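
If the GPU code is built, the same self-test can be run against the GPU kernels as well; something like the following (flags per the README, adjust to your build):

Code:
./VanitySearch -u -check
./VanitySearch -gpu -u -check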

sr. member
Activity: 462
Merit: 701
endo and sym are computed in CheckHashComp() in GPUCompute.h.
I quoted my last post and added a few comments.
The point (px,py) is always OK, so there are no errors before CHECK_POINT(h, incr, 0);
The errors randomly appear after this line.
It seems that nvcc generates (in your case) incorrect code.


Code:
__device__ __noinline__ void CheckHashComp(prefix_t *prefix, uint64_t *px, uint64_t *py,
  int32_t incr, uint32_t tid, uint32_t *lookup32, uint32_t *out) {

  uint32_t   h[20];
  uint64_t   pe1x[4];
  uint64_t   pe2x[4];

  // Point
  _GetHash160Comp(px, py, (uint8_t *)h);
  CHECK_POINT(h, incr, 0);                         <-- 100% Ok up to here, means that (px,py) is good

  // Endo #1  if (x, y) = k * G, then (beta*x, y) = lambda*k*G
  _ModMult(pe1x, px, _beta);
  _GetHash160Comp(pe1x, py, (uint8_t *)h);   <-- 50% Wrong from here
  CHECK_POINT(h, incr, 1);

  // Endo #2 if (x, y) = k * G, then (beta2*x, y) = lambda2*k*G
  _ModMult(pe2x, px, _beta2);
  _GetHash160Comp(pe2x, py, (uint8_t *)h);
  CHECK_POINT(h, incr, 2);

  ModNeg256(py);

  // Symmetric points

  _GetHash160Comp(px, py, (uint8_t *)h);
  CHECK_POINT(h, -incr, 0);
  _GetHash160Comp(pe1x, py, (uint8_t *)h);
  CHECK_POINT(h, -incr, 1);
  _GetHash160Comp(pe2x, py, (uint8_t *)h);
  CHECK_POINT(h, -incr, 2);

}

legendary
Activity: 1948
Merit: 2097
OK thanks, it works :)

On my 645 GTX, same performance. Sqr brings a few more spill moves (there are more temp variables than in ModMult).
I haven't tried yet on the old Quadro 600.
I will see if I can save a few registers.

With Sqr
1>    33280 bytes stack frame, 128 bytes spill stores, 436 bytes spill loads
Without Sqr
1>    33280 bytes stack frame, 120 bytes spill stores, 424 bytes spill loads

You can delete:

Code:
  //uint64_t r0 = 0x0ULL;
  //uint64_t r1 = 0x0ULL;
  //uint64_t r3 = 0x0ULL;
  //uint64_t r4 = 0x0ULL;

  uint64_t r0;
  uint64_t r1;
  uint64_t r3 ;
  uint64_t r4 ;

and delete u0, u1, u2, u3, r0, r1, r2, r3
legendary
Activity: 1948
Merit: 2097
Code:
CPU found 1559 items
GPU: point   correct [249/249]
GPU: endo #1 correct [203/281]
GPU: endo #2 correct [220/286]
GPU: sym/point   correct [102/246]
GPU: sym/endo #1 correct [180/248]
GPU: sym/endo #2 correct [179/249]
GPU/CPU check Failed !

Where do you compute endo and sym?
Without "volatile" I always get these errors.
sr. member
Activity: 462
Merit: 701
OK thanks, it works :)

On my 645 GTX, same performance. Sqr brings a few more spill moves (there are more temp variables than in ModMult).
I haven't tried yet on the old Quadro 600.
I will see if I can save a few registers.

With Sqr
1>    33280 bytes stack frame, 128 bytes spill stores, 436 bytes spill loads
Without Sqr
1>    33280 bytes stack frame, 120 bytes spill stores, 424 bytes spill loads
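
Those stack-frame/spill lines are the statistics ptxas prints in verbose mode; a hedged example of asking nvcc for them (path and arch flag are guesses, adapt to the Makefile):

Code:
nvcc --ptxas-options=-v -gencode=arch=compute_50,code=sm_50 -o GPUEngine.o -c GPU/GPUEngine.cu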
legendary
Activity: 1948
Merit: 2097
From 153 MKeys/s to 160 MKeys/s

using a _ModSqr instead of _ModMult

Thanks, I tried it but the -check failed.
I will have a look at it.
I committed the patch with a few of your mods; I also reviewed the main loop a bit.


There were errors. Now it should work:
Code:
__device__ void _ModSqr(uint64_t *rp, const uint64_t *up) {


  uint64_t u0 = up[0];
  uint64_t u1 = up[1];
  uint64_t u2 = up[2];
  uint64_t u3 = up[3];


  uint64_t u10, u11;


  uint64_t r0 = 0x0ULL;
  uint64_t r1 = 0x0ULL;
  uint64_t r3 = 0x0ULL;
  uint64_t r4 = 0x0ULL;

  uint64_t t1 = 0x0ULL;
  uint64_t t2 = 0x0ULL;

  uint64_t s0, s1, s2, s3, s4, s5, s6, s7;
  uint64_t z1, z2, z3, z4, z5, z6, z7, z8;

  //k=0
  UMULLO(s0, u0, u0);
  UMULHI(r1, u0, u0);

  //k=1
  UMULLO(r3, u0, u1);
  UMULHI(r4, u0, u1);
  UADDO1(r3, r3);
  UADDC1(r4, r4);
  UADD1(t1, 0x0ULL);
  UADDO1(r3, r1);
  UADDC1(r4, 0x0ULL);
  UADD1(t1, 0x0ULL);
  s1 = r3;



  //k=2
  UMULLO(r0, u0, u2);
  UMULHI(r1, u0, u2);
  UADDO1(r0, r0);
  UADDC1(r1, r1);
  UADD1(t2, 0x0ULL);
  UMULLO(u10, u1, u1);
  UMULHI(u11, u1, u1);
  UADDO1(r0, u10);
  UADDC1(r1, u11);
  UADD1(t2, 0x0ULL);
  UADDO1(r0, r4);
  UADDC1(r1, t1);
  UADD1(t2, 0x0ULL);


  s2 = r0;
  t1 = 0;
 
  //k=3
  UMULLO(r3, u0, u3);
  UMULHI(r4, u0, u3);
  UMULLO(u10, u1, u2);
  UMULHI(u11, u1, u2);
  UADDO1(r3, u10);
  UADDC1(r4, u11);
  UADD1(t1, 0x0ULL);
  t1 += t1;
  UADDO1(r3, r3);
  UADDC1(r4, r4);
  UADD1(t1, 0x0ULL);
  UADDO1(r3, r1);
  UADDC1(r4, t2);
  UADD1(t1, 0x0ULL);

  s3 = r3;
  t2 = 0;

  //k=4

  UMULLO(r0, u1, u3);
  UMULHI(r1, u1, u3);
  UADDO1(r0, r0);
  UADDC1(r1, r1);
  UADD1(t2, 0x0ULL);
  UMULLO(u10, u2, u2);
  UMULHI(u11, u2, u2);
  UADDO1(r0, u10);
  UADDC1(r1, u11);
  UADD1(t2, 0x0ULL);
  UADDO1(r0, r4);
  UADDC1(r1, t1);
  UADD1(t2, 0x0ULL);

  s4 = r0;
  t1 = 0;

  //k=5
  UMULLO(r3, u2, u3);
  UMULHI(r4, u2, u3);
  UADDO1(r3, r3);
  UADDC1(r4, r4);
  UADD1(t1, 0x0ULL);
  UADDO1(r3, r1);
  UADDC1(r4, t2);
  UADD1(t1, 0x0ULL);

  s5 = r3;



  //k=6
  UMULLO(r0, u3, u3);
  UMULHI(r1, u3, u3);
  UADDO1(r0, r4);
  UADD1(r1, t1);
  s6 = r0;
 
  //k=7
  s7 = r1;
 
  //Reduction
  UMULLO(z3, s5, 0x1000003d1ULL);
  UMULHI(z4, s5, 0x1000003d1ULL);
  UMULLO(z5, s6, 0x1000003d1ULL);
  UMULHI(z6, s6, 0x1000003d1ULL);
  UMULLO(z7, s7, 0x1000003d1ULL);
  UMULHI(z8, s7, 0x1000003d1ULL);
  UMULLO(z1, s4, 0x1000003d1ULL);
  UMULHI(z2, s4, 0x1000003d1ULL);
  UADDO1(z1, s0);
  UADD1(z2, 0x0ULL);


  UADDO1(z2, s1);
  UADDC1(z4, s2);
  UADDC1(z6, s3);
  UADD1(z8, 0x0ULL);

 
  //uint64_t c = 0;


  UADDO1(z3, z2);
  UADDC1(z5, z4);
  UADDC1(z7, z6);
  UADD1(z8, 0x0ULL);
 
  UMULLO(u10, z8, 0x1000003d1ULL);
  UMULHI(u11, z8, 0x1000003d1ULL);
  UADDO1(z1, u10);
  UADDC1(z3, u11);
  UADDC1(z5, 0x0ULL);
  UADD1(z7, 0x0ULL);

  /*
  UADD1(c, 0x0ULL);   
     
  rp[0] = z1;
  rp[1] = z3;
  if(c == 1){

     UADDO1(z5, 0x1ULL);
     UADD1(z7, 0x0ULL);

  }
  */

  rp[0] = z1;
  rp[1] = z3;
  rp[2] = z5;
  rp[3] = z7;
 
 
}
sr. member
Activity: 462
Merit: 701
From 153 MKeys/s to 160 MKeys/s

using a _ModSqr instead of _ModMult

Thanks, I tried it but the -check failed.
I will have a look at it.
I committed the patch with a few of your mods; I also reviewed the main loop a bit.
sr. member
Activity: 462
Merit: 701
Still errors.

OK, thanks for testing. I give up for the moment; I have run out of ideas.
I'll leave the volatile in.
I hope I will manage to reproduce this.
legendary
Activity: 1948
Merit: 2097
From 153 MKeys/s to 160 MKeys/s

using a _ModSqr instead of _ModMult

in GPUCompute.h, __device__ void ComputeKeys
Code:
      //_ModMult(_p2, _s, _s);        // _p = pow2(s)
      _ModSqr(_p2, _s);


      //_ModMult(py, _s);            
      _ModSqr(py, _s);


in GPUEngine.cu:
Code:
__device__ void _ModSqr(uint64_t *rp, const uint64_t *up) {


  uint64_t u0 = up[0];
  uint64_t u1 = up[1];
  uint64_t u2 = up[2];
  uint64_t u3 = up[3];


  uint64_t u10, u11;


  uint64_t r0 = 0;
  uint64_t r1 = 0;
  uint64_t r3 = 0;
  uint64_t r4 = 0;

  uint64_t t1 = 0;
  uint64_t t2 = 0;


  uint64_t s0, s1, s2, s3, s4, s5, s6, s7;
  uint64_t z1, z2, z3, z4, z5, z6, z7, z8;

  z1 = z2 = 0;

  //k=0
  UMULLO(s0, u0, u0);
  UMULHI(r1, u0, u0);

  //k=1
  UMULLO(r3, u0, u1);
  UMULHI(r4, u0, u1);
  UADDO1(r4, r4);
  UADDC1(u0, u0);
  UADDC1(r4, u1);
  UADDC1(u0, 0x0ULL);
  UADD1(r3, 0x0ULL);
  s1 = r3;



  //k=2
  UMULLO(r0, u0, u2);
  UMULHI(r1, u0, u2);
  UADDO1(r0, r0);
  UADDC1(r1, r1);
  UADD1(t2, 0x0ULL);
  UMULLO(u10, u1, u1);
  UMULHI(u11, u1, u1);
  UADDO1(r0, u10);
  UADDC1(r1, u11);
  UADD1(t2, 0x0ULL);
  UADDO1(r0, r4);
  UADDC1(r1, t1);
  UADD1(t2, 0x0ULL);


  s2 = r0;
  t1 = 0;
  
  //k=3
  UMULLO(r3, u0, u3);
  UMULHI(r4, u0, u3);
  UMULLO(u10, u1, u2);
  UMULHI(u11, u1, u2);
  UADDO1(r3, u10);
  UADDC1(r4, u11);
  UADD1(t1, 0x0ULL);
  t1 += t1;
  UADDO1(r3, r3);
  UADDC1(r4, r4);
  UADD1(t1, 0x0ULL);
  UADDO1(r3, r1);
  UADDC1(r4, t2);
  UADD1(t1, 0x0ULL);

  s3 = r3;
  t2 = 0;

  //k=4

  UMULLO(r0, u1, u3);
  UMULHI(r1, u1, u3);
  UADDO1(r0, r0);
  UADDC1(r1, r1);
  UADD1(t2, 0x0ULL);
  UMULLO(u10, u2, u2);
  UMULHI(u11, u2, u2);
  UADDO1(r0, u10);
  UADDC1(r1, u11);
  UADD1(t2, 0x0ULL);
  UADDO1(r0, r4);
  UADDC1(r1, t1);
  UADD1(t2, 0x0ULL);

  s4 = r0;
  t1 = 0;

  //k=5
  UMULLO(r3, u2, u3);
  UMULHI(r4, u2, u3);
  UADDO1(r3, r3);
  UADDC1(r4, r4);
  UADD1(t1, 0x0ULL);
  UADDO1(r3, r1);
  UADDC1(r4, t2);
  UADD1(t1, 0x0ULL);

  s5 = r3;



  //k=6
  UMULLO(r0, u3, u3);
  UMULHI(r1, u3, u3);
  UADDO1(r0, r4);
  UADD1(r1, t1);
  s6 = r0;
  
  //k=7
  s7 = r1;
  
  //Reduction
  UMULLO(z3, s5, 0x1000003d1);
  UMULHI(z4, s5, 0x1000003d1);
  UMULLO(z5, s6, 0x1000003d1);
  UMULHI(z6, s6, 0x1000003d1);
  UMULLO(z7, s7, 0x1000003d1);
  UMULHI(z8, s7, 0x1000003d1);
  UMULLO(z1, s4, 0x1000003d1ULL);
  UMULHI(z2, s4, 0x1000003d1ULL);
  UADDO1(z1, s0);
  UADD1(z2, 0x0ULL);


  UADDO1(z2, s1);
  UADDC1(z4, s2);
  UADDC1(z6, s3);
  UADD1(z8, 0x0ULL);

  
  uint64_t c = 0;


  UADDO1(z3, z2);
  UADDC1(z5, z4);
  UADDC1(z7, z6);
  UADD1(z8, 0x0ULL);
 
  UMULLO(u10, z8, 0x1000003d1ULL);
  UMULHI(u11, z8, 0x1000003d1ULL);
  UADDO1(z1, u10);
  UADDC1(z3, u11);
  UADD1(c, 0x0ULL);  
      
  rp[0] = z1;
  rp[1] = z3;
  if(c == 1){

     UADDC1(z5, 0x1ULL);
     UADD1(z7, 0x0ULL);

  }

  rp[2] = z5;
  rp[3] = z7;
  
 
}