bump, I'm wondering if anyone with an AMD card has tried AMD's bitwise rotation function, it's supposed to be much faster than coding for it in OCL:
//#pragma OPENCL EXTENSION cl_amd_media_ops : enable
//#define rot(x,y) amd_bitalign(x, x, (32-y))
edit: Appears someone uploaded an AMD optimized version of reaper.cl to pastebin, here it is:
typedef uint uint32_t;
typedef ulong uint64_t;
typedef uchar uint8_t;
typedef uint uint32;
typedef ulong uint64;
#define U8TO32(p) \
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) ))
#define U8TO64(p) \
(((uint64_t)U8TO32(p) << 32) | (uint64_t)U8TO32((p) + 4))
#define U32TO8(p, v) \
(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
(p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) );
#define U64TO8(p, v) \
U32TO8((p), (uint32_t)((v) >> 32)); \
U32TO8((p) + 4, (uint32_t)((v) ));
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
/*typedef struct {
uint64_t h[8];
uint8_t buf[128];
} state;*/
__constant uint8_t sigma[256] =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 ,
14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 ,
11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 ,
7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 ,
9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 ,
2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 ,
12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 ,
13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 ,
6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 ,
10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0 ,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 ,
14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 ,
11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 ,
7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 ,
9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 ,
2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9
};
__constant uint64_t cst[16] =
{
0x243F6A8885A308D3UL,0x13198A2E03707344UL,0xA4093822299F31D0UL,0x082EFA98EC4E6C89UL,
0x452821E638D01377UL,0xBE5466CF34E90C6CUL,0xC0AC29B7C97C50DDUL,0x3F84D5B5B5470917UL,
0x9216D5D98979FB1BUL,0xD1310BA698DFB5ACUL,0x2FFD72DBD01ADFB7UL,0xB8E1AFED6A267E96UL,
0xBA7C9045F12C7F99UL,0x24A19947B3916CF7UL,0x0801F2E2858EFC16UL,0x636920D871574E69UL
};
__constant uint K[64] =
{
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
//uint rotl(uint x, uint y)
//{
// return (x<>(32-y));
//}
#define rotl(x, y) amd_bitalign(x, x, (uint)(32 - y))
//#define Ch(x, y, z) (z ^ (x & (y ^ z)))
#define Ma(x, y, z) ((y & z) | (x & (y | z)))
#define Ch(x, y, z) bitselect(z,y,x)
// Ma can also be implemented in terms of bitselect
//#define Ma(y, z, x) bitselect(z^x,y,x)
#define Tr(x,a,b,c) (rotl(x,a)^rotl(x,b)^rotl(x,c))
#define R(x) (work[x] = (rotl(work[x-2],15)^rotl(work[x-2],13)^((work[x-2])>>10)) + work[x-7] + (rotl(work[x-15],25)^rotl(work[x-15],14)^((work[x-15])>>3)) + work[x-16])
#define sharound(a,b,c,d,e,f,g,h,x,K) h+=Tr(e,7,21,26)+Ch(e,f,g)+K+x; d+=h; h+=Tr(a,10,19,30)+Ma(a,b,c);
#define sharound_s(a,b,c,d,e,f,g,h,x) h+=Tr(e,7,21,26)+Ch(e,f,g)+x; d+=h; h+=Tr(a,10,19,30)+Ma(a,b,c);
uint EndianSwap(uint n)
{
return ((n&0xFF)<<24) | ((n&0xFF00)<<8) | ((n&0xFF0000)>>8) | ((n&0xFF000000)>>24);
}
void Sha256_round(uint* s, unsigned char* data)
{
uint work[64];
uint* udata = (uint*)data;
#pragma unroll
for(uint i=0; i<16; ++i)
{
work[i] = EndianSwap(udata[i]);
}
uint A = s[0];
uint B = s[1];
uint C = s[2];
uint D = s[3];
uint E = s[4];
uint F = s[5];
uint G = s[6];
uint H = s[7];
sharound(A,B,C,D,E,F,G,H,work[0],K[0]);
sharound(H,A,B,C,D,E,F,G,work[1],K[1]);
sharound(G,H,A,B,C,D,E,F,work[2],K[2]);
sharound(F,G,H,A,B,C,D,E,work[3],K[3]);
sharound(E,F,G,H,A,B,C,D,work[4],K[4]);
sharound(D,E,F,G,H,A,B,C,work[5],K[5]);
sharound(C,D,E,F,G,H,A,B,work[6],K[6]);
sharound(B,C,D,E,F,G,H,A,work[7],K[7]);
sharound(A,B,C,D,E,F,G,H,work[8],K[8]);
sharound(H,A,B,C,D,E,F,G,work[9],K[9]);
sharound(G,H,A,B,C,D,E,F,work[10],K[10]);
sharound(F,G,H,A,B,C,D,E,work[11],K[11]);
sharound(E,F,G,H,A,B,C,D,work[12],K[12]);
sharound(D,E,F,G,H,A,B,C,work[13],K[13]);
sharound(C,D,E,F,G,H,A,B,work[14],K[14]);
sharound(B,C,D,E,F,G,H,A,work[15],K[15]);
sharound(A,B,C,D,E,F,G,H,R(16),K[16]);
sharound(H,A,B,C,D,E,F,G,R(17),K[17]);
sharound(G,H,A,B,C,D,E,F,R(18),K[18]);
sharound(F,G,H,A,B,C,D,E,R(19),K[19]);
sharound(E,F,G,H,A,B,C,D,R(20),K[20]);
sharound(D,E,F,G,H,A,B,C,R(21),K[21]);
sharound(C,D,E,F,G,H,A,B,R(22),K[22]);
sharound(B,C,D,E,F,G,H,A,R(23),K[23]);
sharound(A,B,C,D,E,F,G,H,R(24),K[24]);
sharound(H,A,B,C,D,E,F,G,R(25),K[25]);
sharound(G,H,A,B,C,D,E,F,R(26),K[26]);
sharound(F,G,H,A,B,C,D,E,R(27),K[27]);
sharound(E,F,G,H,A,B,C,D,R(28),K[28]);
sharound(D,E,F,G,H,A,B,C,R(29),K[29]);
sharound(C,D,E,F,G,H,A,B,R(30),K[30]);
sharound(B,C,D,E,F,G,H,A,R(31),K[31]);
sharound(A,B,C,D,E,F,G,H,R(32),K[32]);
sharound(H,A,B,C,D,E,F,G,R(33),K[33]);
sharound(G,H,A,B,C,D,E,F,R(34),K[34]);
sharound(F,G,H,A,B,C,D,E,R(35),K[35]);
sharound(E,F,G,H,A,B,C,D,R(36),K[36]);
sharound(D,E,F,G,H,A,B,C,R(37),K[37]);
sharound(C,D,E,F,G,H,A,B,R(38),K[38]);
sharound(B,C,D,E,F,G,H,A,R(39),K[39]);
sharound(A,B,C,D,E,F,G,H,R(40),K[40]);
sharound(H,A,B,C,D,E,F,G,R(41),K[41]);
sharound(G,H,A,B,C,D,E,F,R(42),K[42]);
sharound(F,G,H,A,B,C,D,E,R(43),K[43]);
sharound(E,F,G,H,A,B,C,D,R(44),K[44]);
sharound(D,E,F,G,H,A,B,C,R(45),K[45]);
sharound(C,D,E,F,G,H,A,B,R(46),K[46]);
sharound(B,C,D,E,F,G,H,A,R(47),K[47]);
sharound(A,B,C,D,E,F,G,H,R(48),K[48]);
sharound(H,A,B,C,D,E,F,G,R(49),K[49]);
sharound(G,H,A,B,C,D,E,F,R(50),K[50]);
sharound(F,G,H,A,B,C,D,E,R(51),K[51]);
sharound(E,F,G,H,A,B,C,D,R(52),K[52]);
sharound(D,E,F,G,H,A,B,C,R(53),K[53]);
sharound(C,D,E,F,G,H,A,B,R(54),K[54]);
sharound(B,C,D,E,F,G,H,A,R(55),K[55]);
sharound(A,B,C,D,E,F,G,H,R(56),K[56]);
sharound(H,A,B,C,D,E,F,G,R(57),K[57]);
sharound(G,H,A,B,C,D,E,F,R(58),K[58]);
sharound(F,G,H,A,B,C,D,E,R(59),K[59]);
sharound(E,F,G,H,A,B,C,D,R(60),K[60]);
sharound(D,E,F,G,H,A,B,C,R(61),K[61]);
sharound(C,D,E,F,G,H,A,B,R(62),K[62]);
sharound(B,C,D,E,F,G,H,A,R(63),K[63]);
s[0] += A;
s[1] += B;
s[2] += C;
s[3] += D;
s[4] += E;
s[5] += F;
s[6] += G;
s[7] += H;
}
__constant uint P[64] =
{
0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19c0174,
0x649b69c1, 0xf9be478a, 0x0fe1edc6, 0x240ca60c, 0x4fe9346f, 0x4d1c84ab, 0x61b94f1e, 0xf6f993db,
0xe8465162, 0xad13066f, 0xb0214c0d, 0x695a0283, 0xa0323379, 0x2bd376e9, 0xe1d0537c, 0x03a244a0,
0xfc13a4a5, 0xfafda43e, 0x56bea8bb, 0x445ec9b6, 0x39907315, 0x8c0d4e9f, 0xc832dccc, 0xdaffb65b,
0x1fed4f61, 0x2f646808, 0x1ff32294, 0x2634ccd7, 0xb0ebdefa, 0xd6fc592b, 0xa63c5c8f, 0xbe9fbab9,
0x0158082c, 0x68969712, 0x51e1d7e1, 0x5cf12d0d, 0xc4be2155, 0x7d7c8a34, 0x611f2c60, 0x036324af,
0xa4f08d87, 0x9e3e8435, 0x2c6dae30, 0x11921afc, 0xb76d720e, 0x245f3661, 0xc3a65ecb, 0x43b9e908
};
void Sha256_round_padding(uint* s)
{
uint A = s[0];
uint B = s[1];
uint C = s[2];
uint D = s[3];
uint E = s[4];
uint F = s[5];
uint G = s[6];
uint H = s[7];
sharound_s(A,B,C,D,E,F,G,H,P[0]);
sharound_s(H,A,B,C,D,E,F,G,P[1]);
sharound_s(G,H,A,B,C,D,E,F,P[2]);
sharound_s(F,G,H,A,B,C,D,E,P[3]);
sharound_s(E,F,G,H,A,B,C,D,P[4]);
sharound_s(D,E,F,G,H,A,B,C,P[5]);
sharound_s(C,D,E,F,G,H,A,B,P[6]);
sharound_s(B,C,D,E,F,G,H,A,P[7]);
sharound_s(A,B,C,D,E,F,G,H,P[8]);
sharound_s(H,A,B,C,D,E,F,G,P[9]);
sharound_s(G,H,A,B,C,D,E,F,P[10]);
sharound_s(F,G,H,A,B,C,D,E,P[11]);
sharound_s(E,F,G,H,A,B,C,D,P[12]);
sharound_s(D,E,F,G,H,A,B,C,P[13]);
sharound_s(C,D,E,F,G,H,A,B,P[14]);
sharound_s(B,C,D,E,F,G,H,A,P[15]);
sharound_s(A,B,C,D,E,F,G,H,P[16]);
sharound_s(H,A,B,C,D,E,F,G,P[17]);
sharound_s(G,H,A,B,C,D,E,F,P[18]);
sharound_s(F,G,H,A,B,C,D,E,P[19]);
sharound_s(E,F,G,H,A,B,C,D,P[20]);
sharound_s(D,E,F,G,H,A,B,C,P[21]);
sharound_s(C,D,E,F,G,H,A,B,P[22]);
sharound_s(B,C,D,E,F,G,H,A,P[23]);
sharound_s(A,B,C,D,E,F,G,H,P[24]);
sharound_s(H,A,B,C,D,E,F,G,P[25]);
sharound_s(G,H,A,B,C,D,E,F,P[26]);
sharound_s(F,G,H,A,B,C,D,E,P[27]);
sharound_s(E,F,G,H,A,B,C,D,P[28]);
sharound_s(D,E,F,G,H,A,B,C,P[29]);
sharound_s(C,D,E,F,G,H,A,B,P[30]);
sharound_s(B,C,D,E,F,G,H,A,P[31]);
sharound_s(A,B,C,D,E,F,G,H,P[32]);
sharound_s(H,A,B,C,D,E,F,G,P[33]);
sharound_s(G,H,A,B,C,D,E,F,P[34]);
sharound_s(F,G,H,A,B,C,D,E,P[35]);
sharound_s(E,F,G,H,A,B,C,D,P[36]);
sharound_s(D,E,F,G,H,A,B,C,P[37]);
sharound_s(C,D,E,F,G,H,A,B,P[38]);
sharound_s(B,C,D,E,F,G,H,A,P[39]);
sharound_s(A,B,C,D,E,F,G,H,P[40]);
sharound_s(H,A,B,C,D,E,F,G,P[41]);
sharound_s(G,H,A,B,C,D,E,F,P[42]);
sharound_s(F,G,H,A,B,C,D,E,P[43]);
sharound_s(E,F,G,H,A,B,C,D,P[44]);
sharound_s(D,E,F,G,H,A,B,C,P[45]);
sharound_s(C,D,E,F,G,H,A,B,P[46]);
sharound_s(B,C,D,E,F,G,H,A,P[47]);
sharound_s(A,B,C,D,E,F,G,H,P[48]);
sharound_s(H,A,B,C,D,E,F,G,P[49]);
sharound_s(G,H,A,B,C,D,E,F,P[50]);
sharound_s(F,G,H,A,B,C,D,E,P[51]);
sharound_s(E,F,G,H,A,B,C,D,P[52]);
sharound_s(D,E,F,G,H,A,B,C,P[53]);
sharound_s(C,D,E,F,G,H,A,B,P[54]);
sharound_s(B,C,D,E,F,G,H,A,P[55]);
sharound_s(A,B,C,D,E,F,G,H,P[56]);
sharound_s(H,A,B,C,D,E,F,G,P[57]);
sharound_s(G,H,A,B,C,D,E,F,P[58]);
sharound_s(F,G,H,A,B,C,D,E,P[59]);
sharound_s(E,F,G,H,A,B,C,D,P[60]);
s[7] += H;
}
#define ROT(x,n) (((x)<<(64-n))|( (x)>>(n)))
#define G(m,a,b,c,d,e,i) \
v[a] += (m[sigma[i+e]] ^ cst[sigma[i+e+1]]) + v[b]; \
v[d] = ROT( v[d] ^ v[a],32); \
v[c] += v[d]; \
v[b] = ROT( v[b] ^ v[c],25); \
v[a] += (m[sigma[i+e+1]] ^ cst[sigma[i+e]])+v[b]; \
v[d] = ROT( v[d] ^ v[a],16); \
v[c] += v[d]; \
v[b] = ROT( v[b] ^ v[c],11);
//assumes input is 512 bytes
__kernel void search(__global uint8_t* in_param, __global uint* out_param, __global uint8_t* pad)
{
uchar in[512];
#pragma unroll
for(uint i=0; i<128; ++i)
in[i] = in_param[i];
uint nonce = get_global_id(0);
*(uint*)(in+108) = nonce;
uint64_t h[8];
h[0]=0x6A09E667F3BCC908UL;
h[1]=0xBB67AE8584CAA73BUL;
h[2]=0x3C6EF372FE94F82BUL;
h[3]=0xA54FF53A5F1D36F1UL;
h[4]=0x510E527FADE682D1UL;
h[5]=0x9B05688C2B3E6C1FUL;
h[6]=0x1F83D9ABFB41BD6BUL;
h[7]=0x5BE0CD19137E2179UL;
uint64_t v[16];
#pragma unroll
for(uint i=0; i< 8;++i) v[i] = h[i];
v[ 8] = 0x243F6A8885A308D3UL;
v[ 9] = 0x13198A2E03707344UL;
v[10] = 0xA4093822299F31D0UL;
v[11] = 0x082EFA98EC4E6C89UL;
v[12] = 0x452821E638D01777UL;
v[13] = 0xBE5466CF34E9086CUL;
v[14] = 0xC0AC29B7C97C50DDUL;
v[15] = 0x3F84D5B5B5470917UL;
{
uint64_t m[16];
#pragma unroll
for(uint i=0; i<16;++i) m[i] = U8TO64(in + i*8);
uint i=0;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
i+=16;
G( m, 0, 4, 8,12, 0, i); G( m, 1, 5, 9,13, 2, i); G( m, 2, 6,10,14, 4, i); G( m, 3, 7,11,15, 6, i);
G( m, 3, 4, 9,14,14, i); G( m, 2, 7, 8,13,12, i); G( m, 0, 5,10,15, 8, i); G( m, 1, 6,11,12,10, i);
}
#pragma unroll
for(uint i=0; i<16;++i) h[i&7] ^= v[i];
#pragma unroll
for(uint i=0; i< 8;++i) v[i] = h[i];
v[8] = 0x243F6A8885A308D3UL;
v[9] = 0x13198A2E03707344UL;
v[10] = 0xA4093822299F31D0UL;
v[11] = 0x082EFA98EC4E6C89UL;
v[12] = 0x452821E638D01377UL;
v[13] = 0xBE5466CF34E90C6CUL;
v[14] = 0xC0AC29B7C97C50DDUL;
v[15] = 0x3F84D5B5B5470917UL;
{
uint64_t m2[16] = {1UL << 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0x400};
uint i=0;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
i+=16;
G(m2, 0, 4, 8,12, 0, i); G(m2, 1, 5, 9,13, 2, i); G(m2, 2, 6,10,14, 4, i); G(m2, 3, 7,11,15, 6, i);
G(m2, 3, 4, 9,14,14, i); G(m2, 2, 7, 8,13,12, i); G(m2, 0, 5,10,15, 8, i); G(m2, 1, 6,11,12,10, i);
}
#pragma unroll
for(uint i=0; i<16;++i) h[i&7] ^= v[i];
uint8_t* work2 = in+128;
U64TO8( work2 + 0, h[0]);
U64TO8( work2 + 8, h[1]);
U64TO8( work2 +16, h[2]);
U64TO8( work2 +24, h[3]);
U64TO8( work2 +32, h[4]);
U64TO8( work2 +40, h[5]);
U64TO8( work2 +48, h[6]);
U64TO8( work2 +56, h[7]);
uint8_t* work3 = work2+64;
//a = x-1, b = x, c = x&63
#define WORKINIT(a,b,c) work3[a] ^= work2[c]; \
if(work3[a]&0x80) work3[b]=in[(b+work3[a])&0x7F]; \
else work3[b]=work2[(b+work3[a])&0x3F];
work3[0] = work2[15];
WORKINIT(0,1,1);
WORKINIT(1,2,2);
WORKINIT(2,3,3);
#pragma unroll
for(int x=4;x<64;++x)
{
WORKINIT(x-1,x,x);
++x;
WORKINIT(x-1,x,x);
++x;
WORKINIT(x-1,x,x);
++x;
WORKINIT(x-1,x,x);
}
#pragma unroll
for(int x=64;x<320;++x)
{
WORKINIT(x-1,x,x&63);
++x;
WORKINIT(x-1,x,x&63);
++x;
WORKINIT(x-1,x,x&63);
++x;
WORKINIT(x-1,x,x&63);
}
#define READ_PAD32_R(offset) ((uint)pad[offset] | (((uint)pad[offset+1])<<8) | (((uint)pad[offset+2])<<16) | (((uint)pad[offset+3])<<24))
#define READ_W32(offset) ((uint)work3[offset] + (((uint)work3[(offset)+1])<<8) + (((uint)work3[(offset)+2]&0x3F)<<16))
ushort* shortptr = (ushort*)(work3+310);
uint64 qCount = shortptr[0];
qCount |= ((uint64)shortptr[3])<<48;
uint* uintptr = (uint*)(work3+312);
qCount |= ((uint64)*uintptr)<<16;
uint nExtra=(pad[(qCount+work3[300])&0x3FFFFF]>>3)+512;
#pragma unroll
for(uint x=1;x {
uint res = 0;
qCount += READ_PAD32_R((qCount&0x3FFFFF));
work3[qCount%320] += (qCount&0x87878700) ? 1 : 0;
qCount-= pad[(qCount+work3[qCount%160])&0x3FFFFF];
if(qCount&0x80000000) { qCount+= pad[qCount&0xFFFF]; }
else { res = qCount&0x20FAFB; qCount+= READ_PAD32_R(res); }
res = (qCount+work3[qCount%160]) & 0x3FFFFF;
qCount+= READ_PAD32_R(res);
if(qCount&0xF0000000) ++work3[qCount%320];
res = READ_W32(qCount&0xFF);
qCount+= READ_PAD32_R(res);
work3[x%320]=work2[x&63]^(qCount&0xFF);
res = ((qCount>>32)+work3[x%200]) & 0x3FFFFF;
qCount+= READ_PAD32_R(res);
#define OFFS (qCount&3)
uint* ram = (uint *)(work3+((qCount%316)-OFFS));
uint val = amd_bytealign((uint32)(qCount>>24), (uint32)(qCount>>24), (uint32)(4-OFFS));
ram[0] ^= val&(0xFFFFFFFFL<<(OFFS<<3));
ram[1] ^= val&(0xFFFFFFFFL>>(32-(OFFS<<3)));
x += ((qCount&7)==3);
qCount-= pad[x*x];
if((qCount&0x07)==0x01) ++x;
}
uint s[8]= {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
Sha256_round(s, in);
Sha256_round(s, in+64);
Sha256_round(s, in+128);
Sha256_round(s, in+192);
Sha256_round(s, in+256);
Sha256_round(s, in+320);
Sha256_round(s, in+384);
Sha256_round(s, in+448);
Sha256_round_padding(s);
if ((s[7] & 0x80FFFF) == 0)
{
out_param[nonce&0xFF] = get_global_id(0);
}
}