It's driving me crazy.
It works on both of my configs, and a user on GitHub just posted a report from a GeForce GTX 1080 Ti (ccap=6.1) running Ubuntu 18.04 where it also works fine (he uses CUDA 10).
make gpu=1 ccap=50 all
CUDA = /usr/local/cuda-8.0
CXXCUDA = /usr/bin/g++-4.8
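For reference, my understanding (an assumption from how ccap is used above, so double-check against the Makefile) is that ccap must match the card's compute capability, so for the GTX 1080 Ti (compute capability 6.1) the build line would be:
make gpu=1 ccap=61 all
with CUDA pointing at the CUDA 10 install (typically /usr/local/cuda-10.0).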
CPU found 1577 items
GPU: point correct [0/243]
GPU: endo #1 correct [0/251]
GPU: endo #2 correct [0/268]
GPU: sym/point correct [0/257]
GPU: sym/endo #1 correct [0/256]
GPU: sym/endo #2 correct [0/302]
GPU/CPU check Failed !
#if 1
#if 0
__device__ __noinline__ void CheckHashComp(prefix_t *prefix, uint64_t *px, uint64_t *py,
int32_t incr, uint32_t tid, uint32_t *lookup32, uint32_t *out) {
uint32_t h[20];
uint64_t pe1x[4];
uint64_t pe2x[4];
// Point
_GetHash160Comp(px, py, (uint8_t *)h);
CHECK_POINT(h, incr, 0); // <-- 100% OK up to here, meaning that (px,py) is good
// Endo #1 if (x, y) = k * G, then (beta*x, y) = lambda*k*G
_ModMult(pe1x, px, _beta);
_GetHash160Comp(pe1x, py, (uint8_t *)h); // <-- 50% wrong from here
CHECK_POINT(h, incr, 1);
// Endo #2 if (x, y) = k * G, then (beta2*x, y) = lambda2*k*G
_ModMult(pe2x, px, _beta2);
_GetHash160Comp(pe2x, py, (uint8_t *)h);
CHECK_POINT(h, incr, 2);
ModNeg256(py);
// Symmetric points
_GetHash160Comp(px, py, (uint8_t *)h);
CHECK_POINT(h, -incr, 0);
_GetHash160Comp(pe1x, py, (uint8_t *)h);
CHECK_POINT(h, -incr, 1);
_GetHash160Comp(pe2x, py, (uint8_t *)h);
CHECK_POINT(h, -incr, 2);
}
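For context, the six checks follow from the secp256k1 endomorphism plus point symmetry. With beta a cube root of unity mod p and lambda a cube root of unity mod n (the standard secp256k1 constants, with beta2 = beta^2 and lambda2 = lambda^2):
if (x, y) = k*G, then (beta*x, y) = lambda*k*G and (beta^2*x, y) = lambda^2*k*G
and negating y maps k*G to (n-k)*G, so each computed point yields six candidate keys for the price of one point addition. Since the plain point always passes and the endo hashes start failing right after the _ModMult calls, the hashing itself is exonerated and the suspect is the modular multiplication.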
//uint64_t r0 = 0x0ULL;
//uint64_t r1 = 0x0ULL;
//uint64_t r3 = 0x0ULL;
//uint64_t r4 = 0x0ULL;
uint64_t r0;
uint64_t r1;
uint64_t r3;
uint64_t r4;
CPU found 1559 items
GPU: point correct [249/249]
GPU: endo #1 correct [203/281]
GPU: endo #2 correct [220/286]
GPU: sym/point correct [102/246]
GPU: sym/endo #1 correct [180/248]
GPU: sym/endo #2 correct [179/249]
GPU/CPU check Failed !
__device__ void _ModSqr(uint64_t *rp, const uint64_t *up) {
uint64_t u0 = up[0];
uint64_t u1 = up[1];
uint64_t u2 = up[2];
uint64_t u3 = up[3];
uint64_t u10, u11;
uint64_t r0 = 0x0ULL;
uint64_t r1 = 0x0ULL;
uint64_t r3 = 0x0ULL;
uint64_t r4 = 0x0ULL;
uint64_t t1 = 0x0ULL;
uint64_t t2 = 0x0ULL;
uint64_t s0, s1, s2, s3, s4, s5, s6, s7;
uint64_t z1, z2, z3, z4, z5, z6, z7, z8;
//k=0
UMULLO(s0, u0, u0);
UMULHI(r1, u0, u0);
//k=1
UMULLO(r3, u0, u1);
UMULHI(r4, u0, u1);
UADDO1(r3, r3);
UADDC1(r4, r4);
UADD1(t1, 0x0ULL);
UADDO1(r3, r1);
UADDC1(r4, 0x0ULL);
UADD1(t1, 0x0ULL);
s1 = r3;
//k=2
UMULLO(r0, u0, u2);
UMULHI(r1, u0, u2);
UADDO1(r0, r0);
UADDC1(r1, r1);
UADD1(t2, 0x0ULL);
UMULLO(u10, u1, u1);
UMULHI(u11, u1, u1);
UADDO1(r0, u10);
UADDC1(r1, u11);
UADD1(t2, 0x0ULL);
UADDO1(r0, r4);
UADDC1(r1, t1);
UADD1(t2, 0x0ULL);
s2 = r0;
t1 = 0;
//k=3
UMULLO(r3, u0, u3);
UMULHI(r4, u0, u3);
UMULLO(u10, u1, u2);
UMULHI(u11, u1, u2);
UADDO1(r3, u10);
UADDC1(r4, u11);
UADD1(t1, 0x0ULL);
t1 += t1;
UADDO1(r3, r3);
UADDC1(r4, r4);
UADD1(t1, 0x0ULL);
UADDO1(r3, r1);
UADDC1(r4, t2);
UADD1(t1, 0x0ULL);
s3 = r3;
t2 = 0;
//k=4
UMULLO(r0, u1, u3);
UMULHI(r1, u1, u3);
UADDO1(r0, r0);
UADDC1(r1, r1);
UADD1(t2, 0x0ULL);
UMULLO(u10, u2, u2);
UMULHI(u11, u2, u2);
UADDO1(r0, u10);
UADDC1(r1, u11);
UADD1(t2, 0x0ULL);
UADDO1(r0, r4);
UADDC1(r1, t1);
UADD1(t2, 0x0ULL);
s4 = r0;
t1 = 0;
//k=5
UMULLO(r3, u2, u3);
UMULHI(r4, u2, u3);
UADDO1(r3, r3);
UADDC1(r4, r4);
UADD1(t1, 0x0ULL);
UADDO1(r3, r1);
UADDC1(r4, t2);
UADD1(t1, 0x0ULL);
s5 = r3;
//k=6
UMULLO(r0, u3, u3);
UMULHI(r1, u3, u3);
UADDO1(r0, r4);
UADD1(r1, t1);
s6 = r0;
//k=7
s7 = r1;
//Reduction
UMULLO(z3, s5, 0x1000003d1ULL);
UMULHI(z4, s5, 0x1000003d1ULL);
UMULLO(z5, s6, 0x1000003d1ULL);
UMULHI(z6, s6, 0x1000003d1ULL);
UMULLO(z7, s7, 0x1000003d1ULL);
UMULHI(z8, s7, 0x1000003d1ULL);
UMULLO(z1, s4, 0x1000003d1ULL);
UMULHI(z2, s4, 0x1000003d1ULL);
UADDO1(z1, s0);
UADD1(z2, 0x0ULL);
UADDO1(z2, s1);
UADDC1(z4, s2);
UADDC1(z6, s3);
UADD1(z8, 0x0ULL);
//uint64_t c = 0;
UADDO1(z3, z2);
UADDC1(z5, z4);
UADDC1(z7, z6);
UADD1(z8, 0x0ULL);
UMULLO(u10, z8, 0x1000003d1ULL);
UMULHI(u11, z8, 0x1000003d1ULL);
UADDO1(z1, u10);
UADDC1(z3, u11);
UADDC1(z5, 0x0ULL);
UADD1(z7, 0x0ULL);
/*
UADD1(c, 0x0ULL);
rp[0] = z1;
rp[1] = z3;
if(c == 1){
UADDO1(z5, 0x1ULL);
UADD1(z7, 0x0ULL);
}
*/
rp[0] = z1;
rp[1] = z3;
rp[2] = z5;
rp[3] = z7;
}
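To follow the reduction: the secp256k1 prime is p = 2^256 - 0x1000003D1, so 2^256 ≡ 0x1000003D1 (mod p) and the upper four limbs s4..s7 can be folded back into the lower ones by multiplying them by 0x1000003D1. Below is a minimal, slow sketch of the same folding in plain CUDA, with comparison-based carries instead of the PTX add.cc chains; fold512 and its layout are illustrative, not taken from the source:
#include <stdint.h>
// Reduce a 512-bit value s[0..7] (little-endian 64-bit limbs) modulo
// p = 2^256 - 0x1000003D1, using 2^256 = 0x1000003D1 (mod p).
__device__ void fold512(uint64_t r[4], const uint64_t s[8]) {
  const uint64_t K = 0x1000003D1ULL; // 2^256 mod p, fits in 33 bits
  // m = K * s[4..7], a 5-limb product; hi halves are < 2^33 since K < 2^33,
  // so hi + carry never overflows a limb
  uint64_t m[5] = { 0, 0, 0, 0, 0 };
  for (int i = 0; i < 4; i++) {
    uint64_t lo = s[4 + i] * K;
    uint64_t hi = __umul64hi(s[4 + i], K);
    m[i] += lo;
    m[i + 1] = hi + (m[i] < lo); // carry out of limb i
  }
  // t = s[0..3] + m, a 320-bit sum
  uint64_t t[5], c = 0;
  for (int i = 0; i < 4; i++) {
    uint64_t a = s[i] + c;
    uint64_t c1 = (a < c);
    t[i] = a + m[i];
    c = c1 + (t[i] < a);
  }
  t[4] = m[4] + c; // m[4] < 2^34, cannot overflow
  // fold the fifth limb once more: r = t[0..3] + K * t[4]
  uint64_t lo = t[4] * K;
  uint64_t hi = __umul64hi(t[4], K);
  r[0] = t[0] + lo;     c = (r[0] < lo);
  uint64_t a1 = hi + c; // tiny, cannot wrap
  r[1] = t[1] + a1;     c = (r[1] < a1);
  r[2] = t[2] + c;      c = (r[2] < c);
  r[3] = t[3] + c;      c = (r[3] < c);
  // a carry here means we crossed 2^256 once more: add K a last time
  if (c) {
    r[0] += K; c = (r[0] < K);
    r[1] += c; c = (r[1] < c);
    r[2] += c; c = (r[2] < c);
    r[3] += c; // the wrapped value is tiny, no further carry possible
  }
  // r is congruent to s mod p but may still be >= p; a final conditional
  // subtract of p would fully normalize it
}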
//_ModMult(_p2, _s, _s); // _p = pow2(s)
_ModSqr(_p2, _s);
//_ModMult(py, _s);
_ModSqr(py, _s);
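Side note: swapping _ModMult(x, s, s) for a dedicated _ModSqr is the usual squaring optimization. For 4 limbs a general multiply needs all 16 limb products, while a square only needs the 10 with i <= j, since u_i*u_j = u_j*u_i and the cross terms can be computed once and doubled; that is exactly what the doubling UADDO1(r3, r3) / UADDC1(r4, r4) pairs in the k=1..5 steps above do.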
__device__ void _ModSqr(uint64_t *rp, const uint64_t *up) {
uint64_t u0 = up[0];
uint64_t u1 = up[1];
uint64_t u2 = up[2];
uint64_t u3 = up[3];
uint64_t u10, u11;
uint64_t r0 = 0;
uint64_t r1 = 0;
uint64_t r3 = 0;
uint64_t r4 = 0;
uint64_t t1 = 0;
uint64_t t2 = 0;
uint64_t s0, s1, s2, s3, s4, s5, s6, s7;
uint64_t z1, z2, z3, z4, z5, z6, z7, z8;
z1 = z2 = 0;
//k=0
UMULLO(s0, u0, u0);
UMULHI(r1, u0, u0);
//k=1
UMULLO(r3, u0, u1);
UMULHI(r4, u0, u1);
UADDO1(r4, r4);
UADDC1(u0, u0);
UADDC1(r4, u1);
UADDC1(u0, 0x0ULL);
UADD1(r3, 0x0ULL);
s1 = r3;
//k=2
UMULLO(r0, u0, u2);
UMULHI(r1, u0, u2);
UADDO1(r0, r0);
UADDC1(r1, r1);
UADD1(t2, 0x0ULL);
UMULLO(u10, u1, u1);
UMULHI(u11, u1, u1);
UADDO1(r0, u10);
UADDC1(r1, u11);
UADD1(t2, 0x0ULL);
UADDO1(r0, r4);
UADDC1(r1, t1);
UADD1(t2, 0x0ULL);
s2 = r0;
t1 = 0;
//k=3
UMULLO(r3, u0, u3);
UMULHI(r4, u0, u3);
UMULLO(u10, u1, u2);
UMULHI(u11, u1, u2);
UADDO1(r3, u10);
UADDC1(r4, u11);
UADD1(t1, 0x0ULL);
t1 += t1;
UADDO1(r3, r3);
UADDC1(r4, r4);
UADD1(t1, 0x0ULL);
UADDO1(r3, r1);
UADDC1(r4, t2);
UADD1(t1, 0x0ULL);
s3 = r3;
t2 = 0;
//k=4
UMULLO(r0, u1, u3);
UMULHI(r1, u1, u3);
UADDO1(r0, r0);
UADDC1(r1, r1);
UADD1(t2, 0x0ULL);
UMULLO(u10, u2, u2);
UMULHI(u11, u2, u2);
UADDO1(r0, u10);
UADDC1(r1, u11);
UADD1(t2, 0x0ULL);
UADDO1(r0, r4);
UADDC1(r1, t1);
UADD1(t2, 0x0ULL);
s4 = r0;
t1 = 0;
//k=5
UMULLO(r3, u2, u3);
UMULHI(r4, u2, u3);
UADDO1(r3, r3);
UADDC1(r4, r4);
UADD1(t1, 0x0ULL);
UADDO1(r3, r1);
UADDC1(r4, t2);
UADD1(t1, 0x0ULL);
s5 = r3;
//k=6
UMULLO(r0, u3, u3);
UMULHI(r1, u3, u3);
UADDO1(r0, r4);
UADD1(r1, t1);
s6 = r0;
//k=7
s7 = r1;
//Reduction
UMULLO(z3, s5, 0x1000003d1ULL);
UMULHI(z4, s5, 0x1000003d1ULL);
UMULLO(z5, s6, 0x1000003d1ULL);
UMULHI(z6, s6, 0x1000003d1ULL);
UMULLO(z7, s7, 0x1000003d1ULL);
UMULHI(z8, s7, 0x1000003d1ULL);
UMULLO(z1, s4, 0x1000003d1ULL);
UMULHI(z2, s4, 0x1000003d1ULL);
UADDO1(z1, s0);
UADD1(z2, 0x0ULL);
UADDO1(z2, s1);
UADDC1(z4, s2);
UADDC1(z6, s3);
UADD1(z8, 0x0ULL);
uint64_t c = 0;
UADDO1(z3, z2);
UADDC1(z5, z4);
UADDC1(z7, z6);
UADD1(z8, 0x0ULL);
UMULLO(u10, z8, 0x1000003d1ULL);
UMULHI(u11, z8, 0x1000003d1ULL);
UADDO1(z1, u10);
UADDC1(z3, u11);
UADD1(c, 0x0ULL);
rp[0] = z1;
rp[1] = z3;
if(c == 1){
UADDC1(z5, 0x1ULL);
UADD1(z7, 0x0ULL);
}
rp[2] = z5;
rp[3] = z7;
}
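One difference worth flagging at the end: in the first version the carry out of z3 is pushed straight through with UADDC1(z5, 0x0ULL) / UADD1(z7, 0x0ULL), while this version records it in c and adds it to z5/z7 only inside the if (c == 1) branch. Assuming UADDC1 consumes the PTX carry flag, the UADDC1 inside that branch depends on a flag set before the two stores and the branch itself, which may not survive; that is my reading of the macros, so double-check it.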
#define SET0(a) asm volatile ("mov.u64 %0,0;" : "=l"(a))
// ---------------------------------------------------------------------------------------
// Compute a*b*(mod n)
// a and b must be lower than n
// ---------------------------------------------------------------------------------------
__device__ void _ModMult(uint64_t *r, uint64_t *a, uint64_t *b) {
uint64_t r512[8];
uint64_t t[NBBLOCK];
uint64_t ah,al;
SET0(r512[5]);
SET0(r512[6]);
SET0(r512[7]);
// 256*256 multiplier
// Reduce from 512 to 320
- UMult(t,(r512 + 4), 0x1000003D1ULL);
+ UMULLO(t[0],r512[4],0x1000003D1ULL);
+ UMULLO(t[1],r512[5],0x1000003D1ULL);
+ MADDO(t[1], r512[4],0x1000003D1ULL,t[1]);
+ UMULLO(t[2],r512[6],0x1000003D1ULL);
+ MADDC(t[2],r512[5],0x1000003D1ULL, t[2]);
+ UMULLO(t[3],r512[7],0x1000003D1ULL);
+ MADDC(t[3],r512[6],0x1000003D1ULL, t[3]);
+ MADD(t[4],r512[7],0x1000003D1ULL, 0ULL);
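As far as I can tell the patch replaces the one-shot UMult helper, which builds the whole 5-limb t = 0x1000003D1 * r512[4..7] internally, with an explicit chain: each UMULLO produces the low half of one limb product and, assuming MADDO/MADDC/MADD wrap the PTX mad.hi.cc.u64 / madc.hi.cc.u64 / madc.hi.u64 forms, each MADD folds the high half of the previous limb product into the next limb while propagating the carry, with MADD(t[4], r512[7], 0x1000003D1ULL, 0ULL) picking up the final high half plus carry. Functionally it should be the same folding, just with the multiply and add fused.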