Author

Topic: [ANN] cudaMiner & ccMiner CUDA based mining applications [Windows/Linux/MacOSX] - page 718. (Read 3426976 times)

legendary
Activity: 1151
Merit: 1001
I put the original scanhash function from blakecoin cpuminer (block.c) into cudaminer
and even then the shares were rejected with the same reason.

This leads me to believe that there may be subtle differences in the stratum implementation
that I need to track down.

Christian

Sorry, I was wrong; I had messed up the conf file. Actually, changing just the .cl file was not enough. Sad

I think all the differences are in .cl code.
I tried sgminer4.1 +6850 (version compiled 19-02), there is blake.cl in there, 9K big, author Thomas Pornin - all shares rejected with same error as above.
Then I downloaded the custom cgminer for blake; its kernel file is blake256.cl - 6K big.
Just putting blake256.cl into the kernel folder and setting it as the algo - bam, sgminer started sending shares which were accepted, so all the differences are in the .cl file.


Code of blake256.cl
Code:
// BLAKE-256 hash algorithm in OpenCL, 8 rounds, second block for blakecoin

// Switch the byte-addressable-store extension off whenever the compiler
// advertises it. No byte-sized stores appear in this file: the 8-bit tables
// below are only read.
#ifdef cl_khr_byte_addressable_store
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable
#endif

// Byte type used by the small index tables (sigma, steps).
typedef unsigned char  uint8_t;

// Working word type. It is a plain 32-bit uint by default; when VECTORS2 or
// VECTORS4 is defined at build time it becomes a 2- or 4-lane vector, so each
// work-item carries that many words side by side.
#if defined(VECTORS4)
typedef uint4 uint32_t;
#elif defined(VECTORS2)
typedef uint2 uint32_t;
#else
typedef uint uint32_t;
#endif

// Reverse the byte order of a 32-bit word. The operators used also work on
// the vector flavours of uint32_t, in which case each lane is swapped.
// Note: n is evaluated four times, so pass an expression without side effects.
#define SWAP32_V(n) \
            ( ((n) >> 24)               | \
             (((n) >> 8) & 0xff00)      | \
             (((n) & 0xff00) << 8)      | \
              ((n) << 24) )

// Hash state passed to blake256_compress_block().
typedef struct
{
  uint32_t h[8];  // chaining value; the search kernel seeds it from its h0..h7 arguments
  uint t;         // counter value that the compression function XORs into v[12] and v[13]
} state256;


// Round count named in the file header ("8 rounds").
// NOTE(review): nothing in this file references NB_ROUNDS32; the compression
// function hard-codes 7 full rounds plus a partial 8th instead.
#define NB_ROUNDS32 8

// Permutation table indexed as sigma[round][position]. The G macro uses
// sigma[r][i] and sigma[r][i+1] to pick the message word and the u256
// constant for each mixing step. Rows 10..15 repeat rows 0..5; the
// compression function in this file only reads rows 0..7.
constant uint8_t sigma[16][16] =
{
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
  {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
  {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
  {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
  { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
  {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
  {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};

// Constant words of the hash. Entries 0..7 initialise v[8..15] in the
// compression function; all 16 are XORed with message words inside G,
// selected through sigma.
constant uint u256[16] =
{
  0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,
  0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
  0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,
  0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917
};

// Argument list for the eight G invocations of one round, so that a round can
// be written as a loop. Each row is { a, b, c, d, i }: four indices into the
// working state v[] followed by the column offset into sigma[r].
constant uint8_t steps[8][5] =
{
  /* column step */
  { 0,  4,  8, 12,  0 },
  { 1,  5,  9, 13,  2 },
  { 2,  6, 10, 14,  4 },
  { 3,  7, 11, 15,  6 },
  /* diagonal step */
  { 0,  5, 10, 15,  8 },
  { 1,  6, 11, 12, 10 },
  { 2,  7,  8, 13, 12 },
  { 3,  4,  9, 14, 14 }
};

// 32-bit helper operations on the working word type (scalar or vector).
//
// ROT32(x, n) rotates x RIGHT by n bits, i.e. ((x) << (32 - n)) | ((x) >> n).
// The OpenCL rotate() built-in rotates left, hence the 32 - n amount.
// Both arguments are fully parenthesized so that compound expressions such as
// ROT32(a ^ b, k + 1) expand as intended.
#define ROT32(x,n)   (rotate((uint32_t)(x), (uint32_t)(32 - (n))))
// ADD32 / XOR32: addition and exclusive-or, with the result cast to uint32_t.
#define ADD32(x,y)   ((uint32_t)((x) + (y)))
#define XOR32(x,y)   ((uint32_t)((x) ^ (y)))

// Mixing step applied to four words of the working state: v[a], v[b], v[c]
// and v[d]. i is the column offset into sigma[r]; sigma[r][i] and
// sigma[r][i+1] select the two message words from m and the two constants
// from u256 that are folded in. The rotation amounts are 16, 12, 8 and 7.
//
// The macro is not self-contained: it reads `v`, `m` and the round index `r`
// from the enclosing scope, and its arguments are evaluated several times, so
// they must be free of side effects.
#define G(a,b,c,d,i) \
do {\
    v[a] += XOR32(m[sigma[r][i]], u256[sigma[r][i+1]]) + v[b];\
    v[d]  = ROT32(XOR32(v[d],v[a]),16);\
    v[c] += v[d];\
    v[b]  = ROT32(XOR32(v[b],v[c]),12);\
    v[a] += XOR32(m[sigma[r][i+1]], u256[sigma[r][i]]) + v[b]; \
    v[d]  = ROT32(XOR32(v[d],v[a]), 8);\
    v[c] += v[d];\
    v[b]  = ROT32(XOR32(v[b],v[c]), 7);\
  } while (0)


// compress a block
// Compress one 16-word message block.
//
//   S  hash state; S->h supplies the chaining value and S->t the counter.
//   m  the 16 message words of the block.
//
// Only S->h[7] is updated (h[7] ^= v[7] ^ v[15]); the other seven output
// words are never computed, because the search kernel inspects h[7] alone.
// The local names `v` and `r`, and the parameter name `m`, are required by
// the G macro and must not be renamed.
void blake256_compress_block( private state256 *S, private uint32_t *m)
{
  private uint32_t v[16];
  const uint counter = S->t;

  // Working state: chaining value in the low half, constants in the high half.
  #pragma unroll 8
  for (int k = 0; k < 8; ++k)
  {
    v[k]     = S->h[k];
    v[k + 8] = u256[k];
  }

  v[12] ^= counter;
  v[13] ^= counter;

  // Rounds 0..6 in full: four column steps followed by four diagonal steps,
  // with the G arguments taken from the steps[] table.
  #pragma unroll 7
  for (int r = 0; r < 7; ++r)
  {
    #pragma unroll 8
    for (int step = 0; step < 8; ++step)
      G( steps[step][0], steps[step][1], steps[step][2], steps[step][3], steps[step][4] );
  }

  // Round 7 stops one step early: the omitted step (steps[7]) only touches
  // v[3], v[4], v[9] and v[14], none of which feed into h[7] below.
  const int r = 7;
  #pragma unroll 7
  for (int step = 0; step < 7; ++step)
    G( steps[step][0], steps[step][1], steps[step][2], steps[step][3], steps[step][4] );

  S->h[7] ^= v[7] ^ v[15];
}


// Index of the slot in the output buffer that holds the number of nonces
// reported so far.
#define FOUND (0xFF)

// Store Xnonce in the next free slot of `output` and bump the count kept at
// output[FOUND]. Expects a buffer named `output` to be in scope. The argument
// and the whole expansion are parenthesized so the macro behaves as a single
// expression wherever it is used.
// NOTE(review): the read-increment-write of output[FOUND] is not atomic, so
// two work-items reporting at the same instant could claim the same slot;
// confirm whether the host code tolerates that.
#define SETFOUND(Xnonce) (output[output[FOUND]++] = (Xnonce))

// Work-group size required by the kernel; 64 unless overridden at build time.
#ifndef WORKSIZE
#define WORKSIZE 64
#endif


// Nonce search kernel.
//
// Each work-item builds the second message block from the three trailing
// message words (in16..in18) plus a nonce derived from its global id, runs
// blake256_compress_block() on top of the precomputed chaining value h0..h7,
// and reports the nonce through SETFOUND when the resulting h[7] is zero.
//
//   output      result buffer: nonces are appended from index 0 upwards and
//               output[FOUND] holds how many have been stored.
//   h0..h7      chaining value after the first part of the message.
//   in16..in18  last 12 bytes of the original message.
//
// In VECTORS2 / VECTORS4 builds every work-item tests 2 / 4 nonces at once;
// the extra lanes use global_id + k * global_size as their nonce.
__attribute__((vec_type_hint(uint32_t)))
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(
volatile __global uint * restrict output,
// precalc hash from first part of message
const uint h0,
const uint h1,
const uint h2,
const uint h3,
const uint h4,
const uint h5,
const uint h6,
const uint h7,
// last 12 bytes of original message
const uint in16,
const uint in17,
const uint in18
)
{
  private state256 S;

  S.h[0] = h0;
  S.h[1] = h1;
  S.h[2] = h2;
  S.h[3] = h3;
  S.h[4] = h4;
  S.h[5] = h5;
  S.h[6] = h6;
  S.h[7] = h7;

  // Nonce(s) handled by this work-item.
#if defined(VECTORS4)
  uint32_t gx = get_global_id(0);
  const uint gs = get_global_size(0);
  gx.y += gs;
  gx.z += gs*2;
  gx.w += gs*3;
#elif defined(VECTORS2)
  uint32_t gx = get_global_id(0);
  gx.y += get_global_size(0);
#else
  const uint32_t gx = get_global_id(0);
#endif

  // Second message block: 12 message bytes, the byte-swapped nonce, then
  // fixed words. NOTE(review): 0x80000000, the 1 in m[13] and the 640 in
  // m[15] look like the hash padding (marker bit, final-block flag and the
  // bit length of an 80-byte message, 80 * 8 = 640); S.t carries the same
  // 640. Confirm against the BLAKE-256 specification.
  private uint32_t m[16];
  S.t = 640;
  m[0] = in16;
  m[1] = in17;
  m[2] = in18;
  m[3] = SWAP32_V(gx);
  m[4] = 0x80000000;
  #pragma unroll 8
  for (int i = 5; i < 13; i++) m[i] = 0;
  m[13] = 1;
  m[14] = 0;
  m[15] = 640;

  // Pass the array itself (it decays to uint32_t *). The previous `&m` had
  // type uint32_t (*)[16], which does not match the parameter type.
  blake256_compress_block( &S, m );

#if defined(VECTORS4)
  bool result = any(S.h[7] == 0);

  if (result) {
    if (S.h[7].x == 0)
      SETFOUND(gx.x);
    if (S.h[7].y == 0)
      SETFOUND(gx.y);
    if (S.h[7].z == 0)
      SETFOUND(gx.z);
    if (S.h[7].w == 0)
      SETFOUND(gx.w);
  }
#elif defined(VECTORS2)
  bool result = any(S.h[7] == 0);

  if (result) {
    if (S.h[7].x == 0)
      SETFOUND(gx.x);
    if (S.h[7].y == 0)
      SETFOUND(gx.y);
  }
#else
  // Author's note, kept for reference: a looser test such as
  //   if (S.h[7] <= 0x000000FF)
  // would also report low-difficulty shares (0 to 255 and above); h[7] may
  // need to be byte-swapped first.
  if (S.h[7] == 0)
    SETFOUND(gx);
#endif
}
hero member
Activity: 756
Merit: 502
I just ran the blake algo on the blakecoin pool
The gtx780ti seems to run at 971Mhash/s, however all shares are rejected.
Debug gives this reject reason: H-not-zero

I put the original scanhash function from blakecoin cpuminer (block.c) into cudaminer
and even then the shares were rejected with the same reason.

This leads me to believe that there may be subtle differences in the stratum implementation
that I need to track down.

Christian
full member
Activity: 193
Merit: 100
Guys, what we got today about speed, comparing to similar level GPUs between ATI and Nvidia?
sr. member
Activity: 408
Merit: 250
ded
are we going to see any speed increases with CUDA 6.0?  Grin

Getting ahead of yourself there aren't you?  Roll Eyes

can't help it  Grin

didn't see the cuda update in Arch and accidentally updated to 6.0 Tongue

are we going to see any speed increases with CUDA 6.0?  Grin

I don't think so...with current hardware at least not.  Cry
Christian said so a few posts back.


Sad oh well

Christian has already done quite a bit of work getting us to where we are now. Big thanks to him.
hero member
Activity: 756
Merit: 502
Christian, tell me please, is there any chance of adapting cudaminer to the quark algos, or to qubit and dark?
Who knows, maybe it will be a great benefit?
Thx

these coins use a mix of many algos, which is too much work for me to implement in CUDA.

Currently I focus on single algorithms, like for example Blake256.

Christian
full member
Activity: 193
Merit: 100
Christian, tell me please, is there any chance of adapting cudaminer to the quark algos, or to qubit and dark?
Who knows, maybe it will be a great benefit?
Thx
hero member
Activity: 812
Merit: 1000
are we going to see any speed increases with CUDA 6.0?  Grin

I don't think so...with current hardware at least not.  Cry
Christian said so a few posts back.
full member
Activity: 168
Merit: 100
are we going to see any speed increases with CUDA 6.0?  Grin

Getting ahead of yourself there aren't you?  Roll Eyes
sr. member
Activity: 408
Merit: 250
ded
are we going to see any speed increases with CUDA 6.0?  Grin
newbie
Activity: 26
Merit: 0
Any suggestions on a vertcoin pool for someone only doing 90Kh? I don't recognize any of the names on the vertcoin.com pool page but dedicatedpool, and that's a 2% fee. I picked a P2P node with low latency at random, I don't know that going P2P is going to be the right path for me.  Tongue

EDIT: I let cudaminer autotune, and it picked K4x32 for my 650ti boost, but I'm running about a 2% rejection rate at 64°C and getting occasional "result did not validate on CPU" errors.
legendary
Activity: 1400
Merit: 1000
Quote
I take no credit in this. Found it days ago and saved it in a file. I do believe credit goes to cbuchner1

--algo=keccak -i 0 -l T1024x24 -C 2 -L 1024 -m 1

I am getting 161k to 163k on my 750ti overclocked.


Thank you for this! I still am only 153mh per card but that is at least twice as fast as I was running before.

I use EVGA Precision and have 1 card overclocked 110MHz on gpu clock and 325MHz on mem clock (this one is a non factory overclocked card). I have seen people say they have the memory overclocked by as much as 600.
newbie
Activity: 19
Merit: 0
Quote
I take no credit in this. Found it days ago and saved it in a file. I do believe credit goes to cbuchner1

--algo=keccak -i 0 -l T1024x24 -C 2 -L 1024 -m 1

I am getting 161k to 163k on my 750ti overclocked.


Thank you for this! I still am only 153mh per card but that is at least twice as fast as I was running before.
legendary
Activity: 1694
Merit: 1024
Is it possible to set up two pools in one config with CUDAMiner? I'm looking to rent out my rig at Betarigs, and to do that I'd need to use two pool addresses and logins in my config.
How does this renting business work?
Does the person who rents a rig access the machine (through ssh or vpn) and control it, or do you just have to point the machine where he asks?
Basically you add the BetaRig's IP, username and password to your CGMiner config before your other IP, username and password (for your main mining pool). Then, when someone rents out your rig, it'll switch to the BetaRig's credentials automatically, and when the rental period is up it switches back to your original mining pool.

Basically it doesn't send your machine work until someone rents your PC, and then stops sending work when the rental has finished.
legendary
Activity: 1400
Merit: 1000
What is the maximum of GTX750 Ti in Maxcoin (keccak SHA3).

Can hit 200 mh/s max overclocked?
My 750Ti can +/- 165 mh with 2014-02-28

What settings are u using to get this hashrate with a 750 ti? I only get ~70mh/s with -l K1000x24 -H 0 -C 1
You are using settings that seem more appropriate for something like a 780, have you tried lowering the x24 to x5?

I take no credit in this. Found it days ago and saved it in a file. I do believe credit goes to cbuchner1

--algo=keccak -i 0 -l T1024x24 -C 2 -L 1024 -m 1

I am getting 161k to 163k on my 750ti overclocked.
full member
Activity: 182
Merit: 100
What is the maximum of GTX750 Ti in Maxcoin (keccak SHA3).

Can hit 200 mh/s max overclocked?
My 750Ti can +/- 165 mh with 2014-02-28

What settings are u using to get this hashrate with a 750 ti? I only get ~70mh/s with -l K1000x24 -H 0 -C 1
You are using settings that seem more appropriate for something like a 780, have you tried lowering the x24 to x5?
member
Activity: 66
Merit: 10
What is the maximum of GTX750 Ti in Maxcoin (keccak SHA3).

Can hit 200 mh/s max overclocked?
My 750Ti can +/- 165 mh with 2014-02-28

What settings are u using to get this hashrate with a 750 ti? I only get ~70mh/s with -l K1000x24 -H 0 -C 1
full member
Activity: 182
Merit: 100
After a full day of testing I'm really out of ideas  Undecided  perhaps it is better to look for a new motherboard ( maybe the gigabyte 990fxa-ud3 )
Get a better CPU (AMD is known to suck Tongue)
Or, indeed, the motherboard...
full member
Activity: 173
Merit: 100
I had this same problem with a single card and an AMD dual core. It is an issue with some stupid "network manager" built into nVidia drivers. Uninstall driver = no CPU usage. Reinstall driver = constant 40% driver usage. Switched boards to an old Intel dual core and now I do not have this problem.
sr. member
Activity: 289
Merit: 251
Try Win 7 but 64 bits and look if there is a bios update for the mobo. Disable in the bios everything not needed , com ports ..... etc.


Here I am
 -I've installed win 7 64bit
- disabled integrated audio and serial port into bios (there's no much other to do)
- disabled all useless windows service
- installed latest motherboard chipset driver, latest nvidia driver, .net framework 4.0, latest cudaminer (32bit) , msi afterburner
- still todo: update motherboard bios

The good news is that windows finally see all 4GB ddr even if I've 5 cards.
The bad news is that the issue is still here, maybe there's just slight better performance but nothing more


With 5 cards (3 on 16x slot and 2 on riser to 1x slot) very low hashrate and sluggishness due to the heavy cpu load


With 4 cards (3 on 16x slot and 1 on riser to 1x slot)  it starts with a slightly low hashrate (850kh/s total with the card on riser only at 80kh/s ) and slowly goes down to 420kh/s.  Notice the same heavy cpu load on the "system interrupts" process


With 3 cards (the 3 on 16x slot) the hashrate is ok and there is almost no load on the cpu




After a full day of testing I'm really out of ideas  Undecided  perhaps it is better to look for a new motherboard ( maybe the gigabyte 990fxa-ud3 )

member
Activity: 64
Merit: 11
new to using cudaminer ..

as an example, if i have 3 cards running at 300khash, does the total of 900khash get reported to a mining pool?
or does a pool/worker read it as 300khash per card, as the share is accepted by the pool?

A pool only sees the shares you submit, and can guesstimate your hashrate from that. It's not very accurate and you shouldn't pay much attention to it unless it's VERY far off.
Jump to: