[ANN] cudaMiner & ccMiner CUDA based mining applications [Windows/Linux/MacOSX] - page 718.

restless

legendary

Activity: 1151

Merit: 1001

Quote from: cbuchner1 on March 09, 2014, 04:43:54 AM

I put the original scanhash function from blakecoin cpuminer (block.c) into cudaminer
and even then the shares were rejected with the same reason.

This leads me to believe that there may be subtle differences in the stratum implementation
that I need to track down.

Christian

Sorry, I was wrong; I had messed up my conf file. Actually, changing just the .cl file was not enough :(

I think all the differences are in .cl code.
I tried sgminer4.1 +6850 (version compiled 19-02), there is blake.cl in there, 9K big, author Thomas Pornin - all shares rejected with same error as above.
Then downloaded the custom cgminer for blake, the file is blake256.cl - 6K big.
Just putting blake256.cl into the kernel folder and setting it as the algo - bam, sgminer started sending shares which were accepted, so all the differences are in the .cl file.

Code of blake256.cl

Code:

// BLAKE-256 hash algorithm in OpenCL, 8 rounds, second block for blakecoin

// If the implementation advertises the cl_khr_byte_addressable_store
// extension, explicitly switch it off for this program.
#ifdef cl_khr_byte_addressable_store
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable
#endif

// Working word type, selected at compile time.
// NOTE: despite its name, uint32_t is a 4-lane vector (uint4) when VECTORS4
// is defined and a 2-lane vector (uint2) when VECTORS2 is defined; only in
// the default build is it a plain scalar uint.
#ifdef VECTORS4
typedef uint4 uint32_t;
#elif defined(VECTORS2)
typedef uint2 uint32_t;
#else
typedef uint uint32_t;
#endif

// Byte-sized unsigned type; used below for the sigma and steps index tables.
typedef unsigned char uint8_t;

// Reverse the byte order of a 32-bit unsigned value (b3 b2 b1 b0 -> b0 b1 b2 b3).
// The argument is evaluated four times, so it must be free of side effects.
#define SWAP32_V(n) \
   ( ((n) >> 24)              /* byte 3 -> byte 0 */ \
   | (((n) >> 8) & 0xff00)    /* byte 2 -> byte 1 */ \
   | (((n) & 0xff00) << 8)    /* byte 1 -> byte 2 */ \
   | ((n) << 24) )            /* byte 0 -> byte 3 */

// Hash state handed to blake256_compress_block().
typedef struct
{
  // eight chaining words; the compress function copies them into v[0..7]
  uint32_t h[8];
  // counter value that the compress function XORs into v[12] and v[13]
  uint t;
} state256;

// Round count for this reduced-round variant.
// NOTE(review): nothing in the code visible here references this macro;
// blake256_compress_block() hard-codes 7 full rounds plus a partial 8th.
#define NB_ROUNDS32 8

// Per-round index permutation, addressed as sigma[round][position].
// The G macro reads the entries in adjacent pairs (position i and i+1) to
// pick which message word and which u256 constant are combined.
// Rows 10..15 repeat rows 0..5; the compress function below only uses
// rows 0..7.
constant uint8_t sigma[16][16] =
{
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
  {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
  {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
  {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
  { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
  {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
  {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};

// Sixteen 32-bit constants. The compress function loads the first eight into
// v[8..15], and the G macro XORs sigma-selected entries with message words.
constant uint u256[16] =
{
  0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,
  0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
  0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,
  0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917
};

// Argument table for the eight G invocations of one round.
// Each row is { a, b, c, d, i }: the four indices into v[] that G updates,
// followed by the sigma position i (G reads positions i and i+1).
constant uint8_t steps[8][5] =
{
  /* column step */
  { 0, 4, 8, 12, 0 },
  { 1, 5, 9, 13, 2 },
  { 2, 6, 10, 14, 4 },
  { 3, 7, 11, 15, 6 },
  /* diagonal step */
  { 0, 5, 10, 15, 8 },
  { 1, 6, 11, 12, 10 },
  { 2, 7, 8, 13, 12 },
  { 3, 4, 9, 14, 14 }
};

// Word helpers; each result is cast back to the working word type uint32_t.
//
// ROT32(x,n): rotate x right by n bits, i.e. (x << (32-n)) | (x >> n),
//             written as a left rotate by (32 - n) through the OpenCL
//             rotate() built-in.
// ADD32(x,y): x + y.
// XOR32(x,y): x ^ y.
//
// Every macro argument is parenthesized so that compound expressions such as
// ROT32(a ^ b, k + 1) expand with the intended precedence.
#define ROT32(x,n) (rotate((uint32_t)(x), (uint32_t)32 - (n)))
#define ADD32(x,y) ((uint32_t)((x) + (y)))
#define XOR32(x,y) ((uint32_t)((x) ^ (y)))

// One G mixing step on the working vector.
//   a, b, c, d : indices into v[] of the four words to mix
//   i          : position in sigma[r]; positions i and i+1 choose the message
//                word / u256 constant pairs for the two halves of the step
// The macro is not self-contained: it uses v[], m[] and the round index r
// from the scope in which it is expanded.
// The two halves differ in which sigma entry indexes m[] and which indexes
// u256[], and in the rotate amounts (16 and 12, then 8 and 7). Statement
// order matters because every line consumes the result of the previous one.
#define G(a,b,c,d,i) \
do {\
   v[a] += XOR32(m[sigma[r][i]], u256[sigma[r][i+1]]) + v[b];\
   v[d] = ROT32(XOR32(v[d],v[a]),16);\
   v[c] += v[d];\
   v[b] = ROT32(XOR32(v[b],v[c]),12);\
   v[a] += XOR32(m[sigma[r][i+1]], u256[sigma[r][i]]) + v[b]; \
   v[d] = ROT32(XOR32(v[d],v[a]), 8);\
   v[c] += v[d];\
   v[b] = ROT32(XOR32(v[b],v[c]), 7);\
  } while (0)

// Compress one 16-word message block.
//   S : state; h[0..7] and t are read, and only h[7] is written back
//   m : the 16 message words of the block
// Only the last chaining word is needed by the caller, so the final round
// is cut short and the other seven h[] words are left untouched.
void blake256_compress_block( private state256 *S, private uint32_t *m)
{
  private uint32_t v[16];

  // v[0..7] <- chaining words, v[8..15] <- first eight u256 constants
  #pragma unroll 8
  for (int k = 0; k < 8; ++k)
  {
    v[k]     = S->h[k];
    v[k + 8] = u256[k];
  }

  // mix the counter into the working vector
  v[12] ^= S->t;
  v[13] ^= S->t;

  // rounds 0..6: four column steps followed by four diagonal steps
  #pragma unroll 7
  for (int r = 0; r < 7; r++)
  {
    #pragma unroll 8
    for (int step = 0; step < 8; step++)
    {
      G( steps[step][0], steps[step][1], steps[step][2], steps[step][3], steps[step][4] );
    }
  }

  // round 7: the last step (row 7 of steps) is skipped because it only
  // writes v[3], v[4], v[9] and v[14], none of which feed h[7]
  {
    const int r = 7;
    #pragma unroll 7
    for (int step = 0; step < 7; step++)
    {
      G( steps[step][0], steps[step][1], steps[step][2], steps[step][3], steps[step][4] );
    }
  }

  S->h[7] ^= v[7] ^ v[15];
}

// Index of the output[] slot that holds the running count of stored nonces.
#define FOUND (0xFF)

// Append Xnonce to the result buffer: store it at the slot given by the
// current counter value and advance the counter. Expects a buffer named
// `output` in the expanding scope.
// The counter is shared by every work-item, so on OpenCL 1.1 and later the
// increment is done with atomic_inc(), which returns the pre-increment value
// exactly like the post-increment it replaces. Older compilers keep the plain
// (non-atomic) form.
#if defined(__OPENCL_VERSION__) && (__OPENCL_VERSION__ >= 110)
#define SETFOUND(Xnonce) output[atomic_inc(&output[FOUND])] = (Xnonce)
#else
#define SETFOUND(Xnonce) output[output[FOUND]++] = (Xnonce)
#endif

// Work-group size required by the kernel; the host may override it with -D.
#ifndef WORKSIZE
#define WORKSIZE 64
#endif

// Search kernel. Each work-item builds the second message block for its
// nonce (2 or 4 nonces per work-item with VECTORS2 / VECTORS4), compresses
// it, and records every nonce whose final hash word h[7] is zero.
//
//   output     : result buffer; matching nonces are appended with SETFOUND
//   h0..h7     : chaining value precomputed from the first part of the message
//   in16..in18 : last 12 bytes of the original message, i.e. the three words
//                that precede the nonce in this block
__attribute__((vec_type_hint(uint32_t)))
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(
  volatile __global uint * restrict output,
  // precalc hash from first part of message
  const uint h0,
  const uint h1,
  const uint h2,
  const uint h3,
  const uint h4,
  const uint h5,
  const uint h6,
  const uint h7,
  // last 12 bytes of original message
  const uint in16,
  const uint in17,
  const uint in18
)
{
  private state256 S;

  S.h[0] = h0;
  S.h[1] = h1;
  S.h[2] = h2;
  S.h[3] = h3;
  S.h[4] = h4;
  S.h[5] = h5;
  S.h[6] = h6;
  S.h[7] = h7;

  // Nonce(s) for this work-item: lane 0 is the global id, further lanes are
  // offset by multiples of the global size.
#if defined(VECTORS4)
  uint32_t gx = get_global_id(0);
  const uint gs = get_global_size(0);
  gx.y += gs;
  gx.z += gs*2;
  gx.w += gs*3;
#elif defined(VECTORS2)
  uint32_t gx = get_global_id(0);
  gx.y += get_global_size(0);
#else
  const uint32_t gx = get_global_id(0);
#endif

  // Second message block: three header words, the byte-swapped nonce, then
  // padding and length words.
  // NOTE(review): 640 looks like the message length in bits (80 bytes * 8),
  // and 0x80000000 / 1 look like the BLAKE-256 padding markers - confirm
  // against the BLAKE specification.
  private uint32_t m[16];
  S.t = 640;
  m[0] = in16;
  m[1] = in17;
  m[2] = in18;
  m[3] = SWAP32_V(gx);
  m[4] = 0x80000000;
  #pragma unroll 8
  for (int i = 5;i<13;i++) m[i] = 0;
  m[13] = 1;
  m[14] = 0;
  m[15] = 640;

  // Pass the array itself (it decays to uint32_t *). The previous "&m" had
  // type pointer-to-array, which does not match the parameter type.
  blake256_compress_block( &S, m );

#if defined(VECTORS4)
  bool result = any(S.h[7] == 0);

  if (result) {
    if (S.h[7].x == 0) {
      SETFOUND(gx.x);
    }
    if (S.h[7].y == 0) {
      SETFOUND(gx.y);
    }
    if (S.h[7].z == 0) {
      SETFOUND(gx.z);
    }
    if (S.h[7].w == 0) {
      SETFOUND(gx.w);
    }
  }
#elif defined(VECTORS2)
  bool result = any(S.h[7] == 0);

  if (result) {
    if (S.h[7].x == 0) {
      SETFOUND(gx.x);
    }
    if (S.h[7].y == 0) {
      SETFOUND(gx.y);
    }
  }
#else
  // Author's note: a looser test such as (S.h[7] <= 0x000000FF) would also
  // report low-difficulty shares (values 0..255); h[7] might need a byte swap
  // first - unverified.
  if (S.h[7] == 0) {
    SETFOUND(gx);
  }
#endif
}

cbuchner1

hero member

Activity: 756

Merit: 502

Quote from: djm34 on March 08, 2014, 05:23:22 AM

I just ran the blake algo on the blakecoin pool
The gtx780ti seems to run at 971Mhash/s, however all shares are rejected.
Debug gives this reject reason: H-not-zero

I put the original scanhash function from blakecoin cpuminer (block.c) into cudaminer
and even then the shares were rejected with the same reason.

This leads me to believe that there may be subtle differences in the stratum implementation
that I need to track down.

Christian

loonb

full member

Activity: 193

Merit: 100

Guys, where do we stand today on speed when comparing similar-level GPUs from ATI and Nvidia?

kernels10

sr. member

Activity: 408

Merit: 250

ded

Quote from: Omega0255 on March 09, 2014, 12:55:22 AM

Quote from: kernels10 on March 09, 2014, 12:28:54 AM

are we going to see any speed increases with CUDA 6.0? Grin

Getting ahead of yourself there aren't you? Roll Eyes

can't help it Grin

didn't see the cuda update in Arch and accidentally updated to 6.0 Tongue

Quote from: ivanlabrie on March 09, 2014, 12:57:26 AM

Quote from: kernels10 on March 09, 2014, 12:28:54 AM

are we going to see any speed increases with CUDA 6.0? Grin

I don't think so...with current hardware at least not. Cry

Christian said so a few posts back.

oh well

Christian has already done quite a bit of work getting us to where we are now. Big thanks to him.

cbuchner1

hero member

Activity: 756

Merit: 502

Quote from: loonb on March 09, 2014, 03:18:50 AM

Christian, tell me please, is there any chance of adapting cudaminer to the quark algos? Or qubit and dark?
Who knows, maybe it will be a great benefit?
Thx

these coins use a mix of many algos, which is too much work for me to implement in CUDA.

Currently I focus on single algorithms, like for example Blake256.

Christian

loonb

full member

Activity: 193

Merit: 100

Christian, tell me please, is there any chance of adapting cudaminer to the quark algos? Or qubit and dark?
Who knows, maybe it will be a great benefit?
Thx

ivanlabrie

hero member

Activity: 812

Merit: 1000

Quote from: kernels10 on March 09, 2014, 12:28:54 AM

are we going to see any speed increases with CUDA 6.0? Grin

I don't think so...with current hardware at least not. Cry

Christian said so a few posts back.

Omega0255

full member

Activity: 168

Merit: 100

Quote from: kernels10 on March 09, 2014, 12:28:54 AM

are we going to see any speed increases with CUDA 6.0? Grin

Getting ahead of yourself there aren't you? Roll Eyes

kernels10

sr. member

Activity: 408

Merit: 250

ded

are we going to see any speed increases with CUDA 6.0? Grin

bacon_wrangler

newbie

Activity: 26

Merit: 0

Any suggestions on a vertcoin pool for someone only doing 90Kh? I don't recognize any of the names on the vertcoin.com pool page but dedicatedpool, and that's a 2% fee. I picked a P2P node with low latency at random, I don't know that going P2P is going to be the right path for me. Tongue

EDIT: I let cudaminer autotune, and it picked K4x32 for my 650ti boost, but I'm running about a 2% rejection rate at 64°C and getting occasional "result did not validate on CPU" errors.

antonio8

legendary

Activity: 1400

Merit: 1000

Quote from: SR20DEN on March 08, 2014, 07:25:33 PM

Quote

I take no credit in this. Found it days ago and saved it in a file. I do believe credit goes to cbuchner1

--algo=keccak -i 0 -l T1024x24 -C 2 -L 1024 -m 1

I am getting 161k to 163k on my 750ti overclocked.

Thank you for this! I still am only 153mh per card but that is at least twice as fast as I was running before.

I use EVGA Precision and have 1 card overclocked 110MHz on gpu clock and 325MHz on mem clock (this one is a non factory overclocked card). I have seen people say they have the memory overclocked by as much as 600.

SR20DEN

newbie

Activity: 19

Merit: 0

Quote

I take no credit in this. Found it days ago and saved it in a file. I do believe credit goes to cbuchner1

--algo=keccak -i 0 -l T1024x24 -C 2 -L 1024 -m 1

I am getting 161k to 163k on my 750ti overclocked.

Thank you for this! I still am only 153mh per card but that is at least twice as fast as I was running before.

guitarplinker

legendary

Activity: 1694

Merit: 1024

Quote from: djm34 on March 08, 2014, 10:21:37 AM

Quote from: guitarplinker on March 08, 2014, 10:02:16 AM

Is it possible to set up two pools in one config with CUDAMiner? I'm looking to rent out my rig at Betarigs, and to do that I'd need to use two pool addresses and logins in my config.

How does this renting business work?
Does the person who rents a rig access the machine (through ssh or vpn) and control it, or do you just have to point the machine wherever he asks?

Basically you add the BetaRig's IP, username and password to your CGMiner config before your other IP, username and password (for your main mining pool). Then, when someone rents out your rig, it'll switch to the BetaRig's credentials automatically, and when the rental period is up, it'll switch back to your original mining pool.

Basically it doesn't send your machine work until someone rents your PC, and then stops sending work when the rental has finished.

antonio8

legendary

Activity: 1400

Merit: 1000

Quote from: ManIkWeet on March 08, 2014, 06:36:05 PM

Quote from: bromstar on March 08, 2014, 04:23:27 PM

Quote from: ManiacMiner on March 05, 2014, 04:55:43 PM

Quote from: Benchman on March 05, 2014, 12:27:25 PM

What is the maximum of GTX750 Ti in Maxcoin (keccak SHA3).

Can hit 200 mh/s max overclocked?

My 750Ti can +/- 165 mh with 2014-02-28

What settings are u using to get this hashrate with a 750 ti? I only get ~70mh/s with -l K1000x24 -H 0 -C 1

You are using settings that seem more appropriate for something like a 780, have you tried lowering the x24 to x5?

I take no credit in this. Found it days ago and saved it in a file. I do believe credit goes to cbuchner1

--algo=keccak -i 0 -l T1024x24 -C 2 -L 1024 -m 1

I am getting 161k to 163k on my 750ti overclocked.

ManIkWeet

full member

Activity: 182

Merit: 100

Quote from: bromstar on March 08, 2014, 04:23:27 PM

Quote from: ManiacMiner on March 05, 2014, 04:55:43 PM

Quote from: Benchman on March 05, 2014, 12:27:25 PM

What is the maximum of GTX750 Ti in Maxcoin (keccak SHA3).

Can hit 200 mh/s max overclocked?

My 750Ti can +/- 165 mh with 2014-02-28

What settings are u using to get this hashrate with a 750 ti? I only get ~70mh/s with -l K1000x24 -H 0 -C 1

You are using settings that seem more appropriate for something like a 780, have you tried lowering the x24 to x5?

bromstar

member

Activity: 66

Merit: 10

Quote from: ManiacMiner on March 05, 2014, 04:55:43 PM

Quote from: Benchman on March 05, 2014, 12:27:25 PM

What is the maximum of GTX750 Ti in Maxcoin (keccak SHA3).

Can hit 200 mh/s max overclocked?

My 750Ti can +/- 165 mh with 2014-02-28

What settings are u using to get this hashrate with a 750 ti? I only get ~70mh/s with -l K1000x24 -H 0 -C 1

ManIkWeet

full member

Activity: 182

Merit: 100

Quote from: Decagrog on March 08, 2014, 02:41:39 PM

After a full day of test I'm really out of ideas Undecided

Perhaps it is better to look for a new motherboard (maybe the gigabyte 990fxa-ud3).

Get a better CPU (AMD is known to suck :P)
Or, indeed, the motherboard...

69charger

full member

Activity: 173

Merit: 100

I had this same problem with a single card and an AMD dual core. It is an issue with some stupid "network manager" built into nVidia drivers. Uninstall driver = no CPU usage. Reinstall driver = constant 40% driver usage. Switched boards to an old Intel dual core and now I do not have this problem.

Decagrog

sr. member

Activity: 289

Merit: 251

Quote from: liomojo1 on March 08, 2014, 08:26:20 AM

Try Win 7 but 64 bits and look if there is a bios update for the mobo. Disable in the bios everything not needed , com ports ..... etc.

Here I am
-I've installed win 7 64bit
- disabled integrated audio and serial port into bios (there's no much other to do)
- disabled all useless windows service
- installed the latest motherboard chipset driver, the latest nvidia driver, .net framework 4.0, the latest cudaminer (32bit), msi afterburner
- still todo: update motherboard bios

The good news is that windows finally see all 4GB ddr even if I've 5 cards.
The bad news is that the issue is still here, maybe there's just slight better performance but nothing more

With 5 cards (3 in 16x slots and 2 on risers in 1x slots): very low hashrate and sluggishness due to the heavy CPU load.

With 4 cards (3 in 16x slots and 1 on a riser in a 1x slot): it starts with a slightly low hashrate (850kh/s total, with the card on the riser at only 80kh/s) and slowly goes down to 420kh/s. I notice the same heavy CPU load from the "system interrupts" process.

With 3 cards (the 3 on 16x slot) the hashrate is ok and there is almost no load on the cpu

After a full day of test I'm really out of ideas Undecided

Perhaps it is better to look for a new motherboard (maybe the gigabyte 990fxa-ud3).

justme27

member

Activity: 64

Merit: 11

Quote from: filthybiscuits on March 08, 2014, 02:04:02 PM

new to using cudaminer ..

As an example, if I have 3 cards each running at 300khash, does the total of 900khash get reported to the mining pool?
Or does the pool/worker see it as 300khash per card, as each share is accepted by the pool?

A pool only sees the shares you submit, and can guesstimate your hashrate from that. It's not very accurate and you shouldn't pay much attention to it unless it's VERY far off.

Topic: [ANN] cudaMiner & ccMiner CUDA based mining applications [Windows/Linux/MacOSX] - page 718. (Read 3426976 times)