and even then the shares were rejected with the same reason.
This leads me to believe that there may be subtle differences in the stratum implementation
that I need to track down.
Christian
I tried sgminer4.1 +6850 (version compiled 19-02), there is blake.cl in there, 9K big, author Thomas Pornin - all shares rejected with same error as above.
Then downloaded the custom cgminer for blake, the file is blake256.cl - 6K big.
Just putting blake256.cl into kernel folder and setting it as algo - bam, sgminer started sending shares which were accepted, so all the diff are in cl
Code of blake256.cl
#ifdef cl_khr_byte_addressable_store
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable
#endif
#ifdef VECTORS4
typedef uint4 uint32_t;
#elif defined(VECTORS2)
typedef uint2 uint32_t;
#else
typedef uint uint32_t;
#endif
typedef unsigned char uint8_t;
#define SWAP32_V(n) \
(((n) << 24) | (((n) & 0xff00) << 8) | \
(((n) >> 8) & 0xff00) | ((n) >> 24))
typedef struct
{
uint32_t h[8];
uint t;
} state256;
#define NB_ROUNDS32 8
constant uint8_t sigma[16][16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
constant uint u256[16] =
{
0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,
0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,
0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917
};
constant uint8_t steps[8][5] =
{
/* column step */
{ 0, 4, 8, 12, 0 },
{ 1, 5, 9, 13, 2 },
{ 2, 6, 10, 14, 4 },
{ 3, 7, 11, 15, 6 },
/* diagonal step */
{ 0, 5, 10, 15, 8 },
{ 1, 6, 11, 12, 10 },
{ 2, 7, 8, 13, 12 },
{ 3, 4, 9, 14, 14 }
};
//#define ROT32(x,n) (((x)<<(32-n))|( (x)>>(n)))
#define ROT32(x,n) (rotate((uint32_t)x, (uint32_t)32-n))
#define ADD32(x,y) ((uint32_t)((x) + (y)))
#define XOR32(x,y) ((uint32_t)((x) ^ (y)))
#define G(a,b,c,d,i) \
do {\
v[a] += XOR32(m[sigma[r][i]], u256[sigma[r][i+1]]) + v[b];\
v[d] = ROT32(XOR32(v[d],v[a]),16);\
v[c] += v[d];\
v[b] = ROT32(XOR32(v[b],v[c]),12);\
v[a] += XOR32(m[sigma[r][i+1]], u256[sigma[r][i]]) + v[b]; \
v[d] = ROT32(XOR32(v[d],v[a]), 8);\
v[c] += v[d];\
v[b] = ROT32(XOR32(v[b],v[c]), 7);\
} while (0)
// compress a block
void blake256_compress_block( private state256 *S, private uint32_t *m)
{
private uint32_t v[16];
#pragma unroll 8
for( int i = 0; i < 8; ++i ) { v[i] = S->h[i]; v[i+8] = u256[i]; };
v[12] ^= S->t;
v[13] ^= S->t;
#pragma unroll 7
for(int r = 0; r < 7; r++ )
{
#pragma unroll 8
for(int j = 0; j < 8; j++)
G( steps[j][0], steps[j][1], steps[j][2], steps[j][3], steps[j][4] );
/*
// column step
G( 0, 4, 8, 12, 0 );
G( 1, 5, 9, 13, 2 );
G( 2, 6, 10, 14, 4 );
G( 3, 7, 11, 15, 6 );
// diagonal step
G( 0, 5, 10, 15, 8 );
G( 1, 6, 11, 12, 10 );
G( 2, 7, 8, 13, 12 );
G( 3, 4, 9, 14, 14 );
*/
}
// not need last round last step
int r = 7;
#pragma unroll 7
for(int j = 0; j < 7; j++)
G( steps[j][0], steps[j][1], steps[j][2], steps[j][3], steps[j][4] );
S->h[7] ^= v[7] ^ v[15];
}
#define FOUND (0xFF)
#define SETFOUND(Xnonce) output[output[FOUND]++] = Xnonce
#ifndef WORKSIZE
#define WORKSIZE 64
#endif
__attribute__((vec_type_hint(uint32_t)))
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(
volatile __global uint * restrict output,
// precalc hash from fisrt part of message
const uint h0,
const uint h1,
const uint h2,
const uint h3,
const uint h4,
const uint h5,
const uint h6,
const uint h7,
// last 12 bytes of original message
const uint in16,
const uint in17,
const uint in18
)
{
private state256 S;
S.h[0] = h0;
S.h[1] = h1;
S.h[2] = h2;
S.h[3] = h3;
S.h[4] = h4;
S.h[5] = h5;
S.h[6] = h6;
S.h[7] = h7;
#if defined(VECTORS4)
uint32_t gx = get_global_id(0);
const uint gs = get_global_size(0);
gx.y += gs;
gx.z += gs*2;
gx.w += gs*3;
#elif defined(VECTORS2)
uint32_t gx = get_global_id(0);
gx.y += get_global_size(0);
#else
const uint32_t gx = get_global_id(0);
#endif
private uint32_t m[16];
S.t = 640;
m[0] = in16;
m[1] = in17;
m[2] = in18;
m[3] = SWAP32_V(gx);
m[4] = 0x80000000;
#pragma unroll 8
for (int i = 5;i<13;i++) m[i] = 0;
m[13] = 1;
m[14] = 0;
m[15] = 640;
blake256_compress_block( &S, &m );
#if defined(VECTORS4)
bool result = any(S.h[7] == 0);
if (result) {
if (S.h[7].x == 0)
SETFOUND(gx.x);
if (S.h[7].y == 0)
SETFOUND(gx.y);
if (S.h[7].z == 0)
SETFOUND(gx.z);
if (S.h[7].w == 0)
SETFOUND(gx.w);
}
#elif defined(VECTORS2)
bool result = any(S.h[7] == 0);
if (result) {
if (S.h[7].x == 0)
SETFOUND(gx.x);
if (S.h[7].y == 0)
SETFOUND(gx.y);
}
#else
// if (S.h[7] <= 0x000000FF) // from 0 to 255 low difficulty shares and above, maybe need to swap h[7] before, do not remember
if (S.h[7] == 0)
SETFOUND(gx);
#endif
}