Phatk2 Mod (Already seeing improvement!) - page 2.

gat3way

sr. member

Activity: 256

Merit: 250

prefetch is a noop on GPUs. It is useful in CPU kernels only to prefetch data in CPU cache (same as what _mm_prefetch() does).

Quote

if defined VECTORS4
(v.s0==g.s0) ? uint nonce = (W[3].s0); #endif : ();
(v.s1==g.s1) ? uint nonce = (W[3].s1); #endif : ();
(v.s2==g.s2) ? uint nonce = (W[3].s2); #endif : ();
(v.s3==g.s3) ? uint nonce = (W[3].s3); #endif : ();
...
#endif

This is also not possible, it's an illegal construction that would fail the compilation. (v.s0==g.s0) is evaluated at run-time and the results are unknown to the preprocessor. If you need to terminate execution before write, you can just do that:

if (!nonce) return;

I am not sure it would make much of a difference though.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: TimPrice on February 11, 2012, 02:18:18 PM

May I ask you d3m0n1q_733rz, what do you do for a living? I just know a little of NASM and CISCA and this post seems so fancy already, it intrigued me into try to help, but I can't understand almost anything Embarrassed

I'm presently disabled. And I started programming in assembly as well. I have a degree in Network Systems Administration and I would like to find something along those lines in work, but so far not able to do so.
OpenCL isn't TOO difficult to learn, but I have trouble with the syntax of some commands like prefetch and the like. I'm thinking about tossing a prefetch or two into the code to see if it'll increase the speed by much. In particular, just before sharoundC to prepare K and ConstW if it's not already. And then another prefetch to call the parts of H when it's needed for Vals. I don't know if this could shave off a few cycles or not, but I plan to find out. Wink

TimPrice

newbie

Activity: 12

Merit: 0

May I ask you d3m0n1q_733rz, what do you do for a living? I just know a little of NASM and CISCA and this post seems so fancy already, it intrigued me into try to help, but I can't understand almost anything Embarrassed

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Very slightly faster for my GPU. This had some tweaking done in the order of addition of P1 and P3. The result is increase in GPRs, but a decent drop in cycles/ALUs. Also, an increase in hashing speed!

Code:

// This file is in the public domain

#ifdef VECTORS8
typedef uint8 u;
#elif defined VECTORS4
typedef uint4 u;
#elif defined VECTORS
typedef uint2 u;
#else
typedef uint u;
#endif

__constant uint K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

__constant uint ConstW[128] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000U, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000280U,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,

0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000U, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100U,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
};

__constant uint H[8] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};

#ifdef BITALIGN
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#define rot(x, y) amd_bitalign(x, x, (uint)(32 - y))
#else
#define rot(x, y) rotate(x, (uint)y)
#endif

// Some AMD devices have the BFI_INT opcode, which behaves exactly like the
// SHA-256 Ch function, but provides it in exactly one instruction. If
// detected, use it for Ch. Otherwise, use bitselect() for Ch.

#ifdef BFI_INT
// Well, slight problem... It turns out BFI_INT isn't actually exposed to
// OpenCL (or CAL IL for that matter) in any way. However, there is
// a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via
// amd_bytealign, takes the same inputs, and provides the same output.
// We can use that as a placeholder for BFI_INT and have the application
// patch it after compilation.

// This is the BFI_INT function
#define Ch(x, y, z) amd_bytealign(x,y,z)
// Ma can also be implemented in terms of BFI_INT...
#define Ma(z, x, y) amd_bytealign(z^x,y,x)
#else
#define Ch(x, y, z) bitselect(z,y,x)
#define Ma(x, y, z) bitselect(x,y,(z^x))
#endif

//Various intermediate calculations for each SHA round
#define s0(n) (S0(Vals[(0 + 128 - (n)) % 8]))
#define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u))

#define s1(n) (S1(Vals[(4 + 128 - (n)) % 8]))
#define S1(n) (rot(n, 26u)^rot(n, 21u)^rot(n, 7u))

#define ch(n) Ch(Vals[(4 + 128 - (n)) % 8],Vals[(5 + 128 - (n)) % 8],Vals[(6 + 128 - (n)) % 8])
#define maj(n) Ma(Vals[(1 + 128 - (n)) % 8],Vals[(2 + 128 - (n)) % 8],Vals[(0 + 128 - (n)) % 8])

//t1 calc when W is already calculated
#define t1(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] + W[(n)] + s1(n) + ch(n)

//t1 calc which calculates W
#define t1W(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] + W(n) + s1(n) + ch(n)

//Used for constant W Values (the compiler optimizes out zeros)
#define t1C(n) (K[(n) % 64]+ ConstW[(n)]) + Vals[(7 + 128 - (n)) % 8] + s1(n) + ch(n)

//t2 Calc
#define t2(n) maj(n) + s0(n)

#define rotC(x,n) (x<> (32-n))

//W calculation used for SHA round
#define W(n) (W[n] = P4(n) + P3(n) + P2(n) + P1(n))

//Partial W calculations (used for the begining where only some values are nonzero)
#define P1(n) ((rot(W[(n)-2],15u)^rot(W[(n)-2],13u)^((W[(n)-2])>>10U)))
#define P2(n) ((rot(W[(n)-15],25u)^rot(W[(n)-15],14u)^((W[(n)-15])>>3U)))
#define p1(x) ((rot(x,15u)^rot(x,13u)^((x)>>10U)))
#define p2(x) ((rot(x,25u)^rot(x,14u)^((x)>>3U)))
#define P3(n) W[n-7]
#define P4(n) W[n-16]

//Partial Calcs for constant W values
#define P1C(n) ((rotC(ConstW[(n)-2],15)^rotC(ConstW[(n)-2],13)^((ConstW[(n)-2])>>10U)))
#define P2C(n) ((rotC(ConstW[(n)-15],25)^rotC(ConstW[(n)-15],14)^((ConstW[(n)-15])>>3U)))
#define P3C(x) ConstW[x-7]
#define P4C(x) ConstW[x-16]

//SHA round with built in W calc
#define sharoundW(n) Barrier1(n); Vals[(3 + 128 - (n)) % 8] += t1W(n); Vals[(7 + 128 - (n)) % 8] = t1W(n) + t2(n);

//SHA round without W calc
#define sharound(n) Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1(n); Vals[(7 + 128 - (n)) % 8] = t1(n) + t2(n);

//SHA round for constant W values
#define sharoundC(n) Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1C(n); Vals[(7 + 128 - (n)) % 8] = t1C(n) + t2(n);

//The compiler is stupid... I put this in there only to stop the compiler from (de)optimizing the order
#define Barrier1(n) t1 = t1C((n+1))
#define Barrier2(n) t1 = t1C((n))

__kernel
//removed this to allow detection of invalid work size
//__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
void search( const uint state0, const uint state1, const uint state2, const uint state3,
const uint state4, const uint state5, const uint state6, const uint state7,
const uint B1, const uint C1, const uint D1,
const uint F1, const uint G1, const uint H1,
#ifndef GOFFSET
const u base,
#endif
const uint W16, const uint W17,
const uint PreVal4, const uint PreVal0,
const uint PreW31, const uint PreW32,
const uint PreW19, const uint PreW20,
__global uint * output)
{

u W[124];
u Vals[8];

//Dummy Variable to prevent compiler from reordering between rounds
u t1;

W[16] = W16;
W[17] = W17;

#ifdef VECTORS8
#ifdef GOFFSET
W[3] = ((uint)get_global_id(0) << 3) + (u)(0, 1, 2, 3, 4, 5, 6, 7);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
#else
W[3] = base + (uint)(get_local_id(0)) * 8u + (uint)(get_group_id(0)) * (WORKSIZE * 8u);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
W[18] = PreW20 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U, r ^ 0x8010000U, r ^ 0xA050000U, r ^ 0xC090000U, r ^ 0xE0D0000U};
#endif

#elif defined VECTORS4
#ifdef GOFFSET
W[3] = ((uint)get_global_id(0) << 2) + (u)(0, 1, 2, 3);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
#else

//Less dependencies to get both the local id and group id and then add them
W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
W[18] = PreW20 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U};
#endif
#elif defined VECTORS
#ifdef GOFFSET
W[3] = ((uint)get_global_id(0) << 1) + (u)(0, 1,);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
#else
W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
W[18] = PreW20 + (u){r, r ^ 0x2004000U};
#endif
#else
#ifdef GOFFSET
W[3] = ((uint)get_global_id(0));
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
#else
W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
W[18] = PreW20 + r;
#endif
#endif
//the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions

//Vals[0]=state0;
Vals[0] = PreVal0 + W[3];
Vals[1]=B1;
Vals[2]=C1;
Vals[3]=D1;
//Vals[4]=PreVal4;
Vals[4] = PreVal4 + W[3];
Vals[5]=F1;
Vals[6]=G1;
Vals[7]=H1;

sharoundC(4);
W[19] = PreW19 + W[3];
sharoundC(5);
W[20] = P4C(20) + P1(20);
sharoundC(6);
W[21] = P1(21);
sharoundC(7);
W[22] = P3C(22) + P1(22);
sharoundC(8);
W[23] = W[16] + P1(23);
sharoundC(9);
W[24] = W[17] + P1(24);
sharoundC(10);
W[25] = P3(25) + P1(25);
W[26] = P3(26) + P1(26);
sharoundC(11);
W[27] = P3(27) + P1(27);
W[28] = P3(28) + P1(28);
sharoundC(12);
W[29] = P3(29) + P1(29);
sharoundC(13);
W[30] = P3(30) + P2C(30) + P1(30);
W[31] = PreW31 + (P3(31) + P1(31));
sharoundC(14);
W[32] = PreW32 + (P3(32) + P1(32));
sharoundC(15);
sharound(16);
sharound(17);
sharound(18);
sharound(19);
sharound(20);
sharound(21);
sharound(22);
sharound(23);
sharound(24);
sharound(25);
sharound(26);
sharound(27);
sharound(28);
sharound(29);
sharound(30);
sharound(31);
sharound(32);
sharoundW(33);
sharoundW(34);
sharoundW(35);
sharoundW(36);
sharoundW(37);
sharoundW(38);
sharoundW(39);
sharoundW(40);
sharoundW(41);
sharoundW(42);
sharoundW(43);
sharoundW(44);
sharoundW(45);
sharoundW(46);
sharoundW(47);
sharoundW(48);
sharoundW(49);
sharoundW(50);
sharoundW(51);
sharoundW(52);
sharoundW(53);
sharoundW(54);
sharoundW(55);
sharoundW(56);
sharoundW(57);
sharoundW(58);
sharoundW(59);
sharoundW(60);
sharoundW(61);
sharoundW(62);
sharoundW(63);

W[64]=state0+Vals[0];
W[65]=state1+Vals[1];
W[66]=state2+Vals[2];
W[67]=state3+Vals[3];
W[68]=state4+Vals[4];
W[69]=state5+Vals[5];
W[70]=state6+Vals[6];
W[71]=state7+Vals[7];

const u Temp = (0xb0edbdd0U + K[0]) + W[64];
Vals[0]=H[0];
Vals[1]=H[1];
Vals[2]=H[2];
Vals[3]=0xa54ff53aU + Temp;
Vals[4]=H[4];
Vals[5]=H[5];
Vals[6]=H[6];
Vals[7]=0x08909ae5U + Temp;

#define P124(n) P2(n) + P1(n) + P4(n)

W[80] = P2(80) + P4(80);
sharound(65);
W[81] = P1C(81) + P2(81) + P4(81);
sharound(66);
W[82] = P124(82);
sharound(67);
W[83] = P124(83);
sharound(68);
W[84] = P124(84);
sharound(69);
W[85] = P124(85);
sharound(70);
W[86] = P4(86) + P3C(86) + P2(86) + P1(86);
sharound(71);
W[87] = P4(87) + P3(87) + P2C(87) + P1(87);
sharoundC(72);
W[88] = P1(88) + P4C(88) + P3(88);
sharoundC(73);
W[89] = P3(89) + P1(89);
sharoundC(74);
W[90] = P3(90) + P1(90);
sharoundC(75);
W[91] = P3(91) + P1(91);
sharoundC(76);
W[92] = P3(92) + P1(92);
sharoundC(77);
W[93] = P3(93) + P1(93);
W[94] = P3(94) + P2C(94) + P1(94);
sharoundC(78);
W[95] = P4C(95) + P3(95) + P2(95) + P1(95);
sharoundC(79);
sharound(80);
sharound(81);
sharound(82);
sharound(83);
sharound(84);
sharound(85);
sharound(86);
sharound(87);
sharound(88);
sharound(89);
sharound(90);
sharound(91);
sharound(92);
sharound(93);
sharound(94);
sharound(95);
sharoundW(96);
sharoundW(97);
sharoundW(98);
sharoundW(99);
sharoundW(100);
sharoundW(101);
sharoundW(102);
sharoundW(103);
sharoundW(104);
sharoundW(105);
sharoundW(106);
sharoundW(107);
sharoundW(108);
sharoundW(109);
sharoundW(110);
sharoundW(111);
sharoundW(112);
sharoundW(113);
sharoundW(114);
sharoundW(115);
sharoundW(116);
sharoundW(117);
sharoundW(118);
sharoundW(119);
sharoundW(120);
sharoundW(121);
sharoundW(122);

u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123),Vals[1],Vals[2]);
u g = -(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123));

// uint nonce = 0;
#ifdef VECTORS8
uint nonce = (v.s0==g.s0) ? W[3].s0 : 0;
nonce = (v.s1==g.s1) ? W[3].s1 : nonce;
nonce = (v.s2==g.s2) ? W[3].s2 : nonce;
nonce = (v.s3==g.s3) ? W[3].s3 : nonce;
nonce = (v.s4==g.s4) ? W[3].s4 : nonce;
nonce = (v.s5==g.s5) ? W[3].s5 : nonce;
nonce = (v.s6==g.s6) ? W[3].s6 : nonce;
nonce = (v.s7==g.s7) ? W[3].s7 : nonce;
#elif defined VECTORS4
uint nonce = (v.s0==g.s0) ? W[3].s0 : 0;
nonce = (v.s1==g.s1) ? W[3].s1 : nonce;
nonce = (v.s2==g.s2) ? W[3].s2 : nonce;
nonce = (v.s3==g.s3) ? W[3].s3 : nonce;
#elif defined VECTORS
uint nonce = (v.s0==g.s0) ? W[3].s0 : 0;
nonce = (v.s1==g.s1) ? W[3].s1 : nonce;
#else
uint nonce = (v==g) ? W[3] : 0
#endif
if(nonce!=0)
{
//Faster to shift the nonce by 2 due to 4-DWORD addressing and does not add more collisions
output[WORKSIZE] = nonce;
output[get_local_id(0)] = nonce;
}
}

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: bulanula on February 11, 2012, 09:09:56 AM

Quote from: d3m0n1q_733rz on February 11, 2012, 04:27:04 AM

Quote from: niooron on February 11, 2012, 01:51:26 AM

So this new kernel should be used with 2.6 or 2.1 sdk?

It's not a new kernel, it's a mod to the existing one. I'm trying to add Vectors8 support and disabling of global offset for increased output for SDK 2.6 while maintaining the functionality for SDK 2.1. This method has already been used with Diapolo's GCN miner and has seen some improvement (with his kernel) on VLIW-based cards. You can see the test I've done with hashing rate in his thread.
The problem I'm running into is that Phatk2 seems to have a difficult time handling 8 vectors at once due to memory constraints which effectively cut the number of hashes in half. So, I'm hoping to fix this little problem and introduce the ability to disable global offset in order to maximize output. I also hope to fix the problem of outputting multiple found nonce each round in the case that more than one acceptable share is found while hashing. But I'm going to need people's help with this project. It'll be the people's kernel so to speak. ^_^

Please post a link if you can. I know Diapolo's thread but don't know exactly which page etc.

Thanks !

https://bitcointalk.org/index.php?topic=61406.20

DiakGCN results on ATI HD5450

VECTORS4 WORKSIZE=128 with GOFFSET=false 14.45 Mhash/s
VECTORS4 WORKSIZE=128 without GOFFSET=false 14.46 Mhash/s
VECTORS8 WORKSIZE=128 with GOFFSET=false 14.46 Mhash/s
VECTORS8 WORKSIZE=128 without GOFFSET=false 14.47 Mhash/s

VECTORS4 WORKSIZE=64 with GOFFSET=false 14.49 Mhash/s
VECTORS4 WORKSIZE=64 without GOFFSET=false 14.50 Mhash/s
VECTORS8 WORKSIZE=64 with GOFFSET=false 14.55 Mhash/s
VECTORS8 WORKSIZE=64 without GOFFSET=false 14.50 Mhash/s

VECTORS4 WORKSIZE=32 with GOFFSET=false 14.46 Mhash/s
VECTORS4 WORKSIZE=32 without GOFFSET=false 14.47 Mhash/s
VECTORS8 WORKSIZE=32 with GOFFSET=false 14.50 Mhash/s
VECTORS8 WORKSIZE=32 without GOFFSET=false 14.48 Mhash/s

bulanula

hero member

Activity: 518

Merit: 500

Quote from: d3m0n1q_733rz on February 11, 2012, 04:27:04 AM

Quote from: niooron on February 11, 2012, 01:51:26 AM

So this new kernel should be used with 2.6 or 2.1 sdk?

It's not a new kernel, it's a mod to the existing one. I'm trying to add Vectors8 support and disabling of global offset for increased output for SDK 2.6 while maintaining the functionality for SDK 2.1. This method has already been used with Diapolo's GCN miner and has seen some improvement (with his kernel) on VLIW-based cards. You can see the test I've done with hashing rate in his thread.
The problem I'm running into is that Phatk2 seems to have a difficult time handling 8 vectors at once due to memory constraints which effectively cut the number of hashes in half. So, I'm hoping to fix this little problem and introduce the ability to disable global offset in order to maximize output. I also hope to fix the problem of outputting multiple found nonce each round in the case that more than one acceptable share is found while hashing. But I'm going to need people's help with this project. It'll be the people's kernel so to speak. ^_^

Please post a link if you can. I know Diapolo's thread but don't know exactly which page etc.

Thanks !

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Hey, anyone know how to end an if statement in the middle of a series of statements once a condition has been filled without having to check a variable? Here's an example of what I WANT to work.

if defined VECTORS4
(v.s0==g.s0) ? uint nonce = (W[3].s0); #endif : ();
(v.s1==g.s1) ? uint nonce = (W[3].s1); #endif : ();
(v.s2==g.s2) ? uint nonce = (W[3].s2); #endif : ();
(v.s3==g.s3) ? uint nonce = (W[3].s3); #endif : ();
...
#endif

I thought about trying while, but that meant I would be required to use a write to make nonce exist. That's another instruction I didn't need to include. In short, I'm avoiding any writes that are not required and, once nonce is found, end the if statements immediately since no other checks will be useful.

Now, I know the above code doesn't work, but could someone tell me how to write it so that it does? I'll also need to know if the "if (exists(nonce))" statement will work.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: niooron on February 11, 2012, 01:51:26 AM

So this new kernel should be used with 2.6 or 2.1 sdk?

It's not a new kernel, it's a mod to the existing one. I'm trying to add Vectors8 support and disabling of global offset for increased output for SDK 2.6 while maintaining the functionality for SDK 2.1. This method has already been used with Diapolo's GCN miner and has seen some improvement (with his kernel) on VLIW-based cards. You can see the test I've done with hashing rate in his thread.
The problem I'm running into is that Phatk2 seems to have a difficult time handling 8 vectors at once due to memory constraints which effectively cut the number of hashes in half. So, I'm hoping to fix this little problem and introduce the ability to disable global offset in order to maximize output. I also hope to fix the problem of outputting multiple found nonce each round in the case that more than one acceptable share is found while hashing. But I'm going to need people's help with this project. It'll be the people's kernel so to speak. ^_^

ssateneth

legendary

Activity: 1344

Merit: 1004

Quote from: niooron on February 11, 2012, 01:51:26 AM

So this new kernel should be used with 2.6 or 2.1 sdk?

interested in this too. i'd prefer an improvement to my miners using 2.1 sdk.

niooron

full member

Activity: 193

Merit: 100

So this new kernel should be used with 2.6 or 2.1 sdk?

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Actually, I wouldn't mind if DiabloD3 were to help on this project. The more minds, the better the outcome.

rjk

sr. member

Activity: 462

Merit: 250

1ngldh

Quote from: d3m0n1q_733rz on February 11, 2012, 12:23:51 AM

Once this is overcome, it should be the fastest kernel available.

Until DiabloD3 steps out of his lair with more voodoo magic. Grin

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Well, we hope to make it that. But I just rechecked my results since I was working on it in the middle of the night, and it seems it's not that different from the original. Hoping to fix that once I get GOFFSET working properly with VECTORS8. But we keep running into the same register spill problem. Once this is overcome, it should be the fastest kernel available.

ssateneth

legendary

Activity: 1344

Merit: 1004

So I see the thread title got changed to "seeing improvement" meaining its faster than current phatk2. Could you host the files somewhere so I can plug them into phoenix2 and see how much faster? There are so many code changes in this thread I have no clue what to copy paste, so I'd appreciate it if you had it uploaded somewhere

bulanula

hero member

Activity: 518

Merit: 500

So how much increase can we expect on a 5870 ?

Is this really the fastest kernel for a 5870 ?

Thanks !

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: gat3way on February 10, 2012, 09:36:24 AM

I am kinda surprised that predication worked better than select(), usually it's just the opposite. Perhaps if you can send me both ISA dumps I can see what can be done to further improve that.

For the second part:

Quote

u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123),Vals[1],Vals[2]);
u g = -(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123));

Can we simplify these since they both contain (Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123)) ? It would certainly reduce calculations a bit. The only problem I see is Vals[1] and Vals[2] is inside of the parenthesis. Now, I'm not familiar with the comma symbolization here, but if the parenthesis can be put on the inside next to the ch(123), it's as easy as dividing by ((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123)) to remove it and make the math simpler for the GPU.

I don't think it's worth trying.

P.S I don't think ALU ops is a good performance metric. Of course that's important, but there are other factors. GPR usage and number of clauses is also very important, so you have to profile the overall result. I've seen many times situations where you have two kernels, one has a bit less ALU ops, other has just one more clause and the second one behaves much worse. Similarily, the situation with GPR usage. I am currently working on a RAR password cracking kernel and that poses some fucking paradoxes. For example I have several kernels, one keeping everything in __private memory with large GPR usage, another one that shifts some to __local memory and a third one that keeps a small lookup table in __global memory. Paradox is that the first one is the slowest, GPR usage is ~90, performance is disgusting. The one that keeps part of the data in __local memory behaves much better, 36 GPRs used, much better occupancy, but performance still not what I expected. The kernel that uses an intermediate __global memory buffer is currently the fastest one, mostly because of the cached global memory with SDK 2.6. It's twice faster than the second one and times faster than the first one. I would never expect that.

I've already tested it via the kernel analyzer and it seems to be the best way to accomplish the task so far. The analyzer tells cycles, ALUs, GPRs, output, etc.

gat3way

sr. member

Activity: 256

Merit: 250

I am kinda surprised that predication worked better than select(), usually it's just the opposite. Perhaps if you can send me both ISA dumps I can see what can be done to further improve that.

For the second part:

Quote

u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123),Vals[1],Vals[2]);
u g = -(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123));

Can we simplify these since they both contain (Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123)) ? It would certainly reduce calculations a bit. The only problem I see is Vals[1] and Vals[2] is inside of the parenthesis. Now, I'm not familiar with the comma symbolization here, but if the parenthesis can be put on the inside next to the ch(123), it's as easy as dividing by ((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123)) to remove it and make the math simpler for the GPU.

I don't think it's worth trying.

P.S I don't think ALU ops is a good performance metric. Of course that's important, but there are other factors. GPR usage and number of clauses is also very important, so you have to profile the overall result. I've seen many times situations where you have two kernels, one has a bit less ALU ops, other has just one more clause and the second one behaves much worse. Similarily, the situation with GPR usage. I am currently working on a RAR password cracking kernel and that poses some fucking paradoxes. For example I have several kernels, one keeping everything in __private memory with large GPR usage, another one that shifts some to __local memory and a third one that keeps a small lookup table in __global memory. Paradox is that the first one is the slowest, GPR usage is ~90, performance is disgusting. The one that keeps part of the data in __local memory behaves much better, 36 GPRs used, much better occupancy, but performance still not what I expected. The kernel that uses an intermediate __global memory buffer is currently the fastest one, mostly because of the cached global memory with SDK 2.6. It's twice faster than the second one and times faster than the first one. I would never expect that.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: d3m0n1q_733rz on February 10, 2012, 08:49:50 AM

Next part!

u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123),Vals[1],Vals[2]);
u g = -(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123));

Can we simplify these since they both contain (Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123)) ? It would certainly reduce calculations a bit. The only problem I see is Vals[1] and Vals[2] is inside of the parenthesis. Now, I'm not familiar with the comma symbolization here, but if the parenthesis can be put on the inside next to the ch(123), it's as easy as dividing by ((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123)) to remove it and make the math simpler for the GPU.

I suppose not since Ch depends on that first value. >_> And then S1 rotates it. Darn, not so easy.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Next part!

u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123),Vals[1],Vals[2]);
u g = -(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123));

Can we simplify these since they both contain (Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123)) ? It would certainly reduce calculations a bit. The only problem I see is Vals[1] and Vals[2] is inside of the parenthesis. Now, I'm not familiar with the comma symbolization here, but if the parenthesis can be put on the inside next to the ch(123), it's as easy as dividing by ((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123)) to remove it and make the math simpler for the GPU.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Code:

// This file is in the public domain

#ifdef VECTORS8
typedef uint8 u;
#elif defined VECTORS4
typedef uint4 u;
#elif defined VECTORS
typedef uint2 u;
#else
typedef uint u;
#endif

__constant uint K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

__constant uint ConstW[128] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000U, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000280U,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,

0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000U, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100U,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
};

__constant uint H[8] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};

#ifdef BITALIGN
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#define rot(x, y) amd_bitalign(x, x, (uint)(32 - y))
#else
#define rot(x, y) rotate(x, (uint)y)
#endif

// Some AMD devices have the BFI_INT opcode, which behaves exactly like the
// SHA-256 Ch function, but provides it in exactly one instruction. If
// detected, use it for Ch. Otherwise, use bitselect() for Ch.

#ifdef BFI_INT
// Well, slight problem... It turns out BFI_INT isn't actually exposed to
// OpenCL (or CAL IL for that matter) in any way. However, there is
// a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via
// amd_bytealign, takes the same inputs, and provides the same output.
// We can use that as a placeholder for BFI_INT and have the application
// patch it after compilation.

// This is the BFI_INT function
#define Ch(x, y, z) amd_bytealign(x,y,z)
// Ma can also be implemented in terms of BFI_INT...
#define Ma(z, x, y) amd_bytealign(z^x,y,x)
#else
#define Ch(x, y, z) bitselect(z,y,x)
#define Ma(x, y, z) bitselect(x,y,(z^x))
#endif

//Various intermediate calculations for each SHA round
#define s0(n) (S0(Vals[(0 + 128 - (n)) % 8]))
#define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u))

#define s1(n) (S1(Vals[(4 + 128 - (n)) % 8]))
#define S1(n) (rot(n, 26u)^rot(n, 21u)^rot(n, 7u))

#define ch(n) Ch(Vals[(4 + 128 - (n)) % 8],Vals[(5 + 128 - (n)) % 8],Vals[(6 + 128 - (n)) % 8])
#define maj(n) Ma(Vals[(1 + 128 - (n)) % 8],Vals[(2 + 128 - (n)) % 8],Vals[(0 + 128 - (n)) % 8])

//t1 calc when W is already calculated
#define t1(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] + W[(n)] + s1(n) + ch(n)

//t1 calc which calculates W
#define t1W(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] + W(n) + s1(n) + ch(n)

//Used for constant W Values (the compiler optimizes out zeros)
#define t1C(n) (K[(n) % 64]+ ConstW[(n)]) + Vals[(7 + 128 - (n)) % 8] + s1(n) + ch(n)

//t2 Calc
#define t2(n) maj(n) + s0(n)

#define rotC(x,n) (x<> (32-n))

//W calculation used for SHA round
#define W(n) (W[n] = P4(n) + P3(n) + P2(n) + P1(n))

//Partial W calculations (used for the begining where only some values are nonzero)
#define P1(n) ((rot(W[(n)-2],15u)^rot(W[(n)-2],13u)^((W[(n)-2])>>10U)))
#define P2(n) ((rot(W[(n)-15],25u)^rot(W[(n)-15],14u)^((W[(n)-15])>>3U)))
#define p1(x) ((rot(x,15u)^rot(x,13u)^((x)>>10U)))
#define p2(x) ((rot(x,25u)^rot(x,14u)^((x)>>3U)))
#define P3(n) W[n-7]
#define P4(n) W[n-16]

//Partial Calcs for constant W values
#define P1C(n) ((rotC(ConstW[(n)-2],15)^rotC(ConstW[(n)-2],13)^((ConstW[(n)-2])>>10U)))
#define P2C(n) ((rotC(ConstW[(n)-15],25)^rotC(ConstW[(n)-15],14)^((ConstW[(n)-15])>>3U)))
#define P3C(x) ConstW[x-7]
#define P4C(x) ConstW[x-16]

//SHA round with built in W calc
#define sharoundW(n) Barrier1(n); Vals[(3 + 128 - (n)) % 8] += t1W(n); Vals[(7 + 128 - (n)) % 8] = t1W(n) + t2(n);

//SHA round without W calc
#define sharound(n) Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1(n); Vals[(7 + 128 - (n)) % 8] = t1(n) + t2(n);

//SHA round for constant W values
#define sharoundC(n) Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1C(n); Vals[(7 + 128 - (n)) % 8] = t1C(n) + t2(n);

//The compiler is stupid... I put this in there only to stop the compiler from (de)optimizing the order
#define Barrier1(n) t1 = t1C((n+1))
#define Barrier2(n) t1 = t1C((n))

__kernel
//removed this to allow detection of invalid work size
//__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
void search( const uint state0, const uint state1, const uint state2, const uint state3,
const uint state4, const uint state5, const uint state6, const uint state7,
const uint B1, const uint C1, const uint D1,
const uint F1, const uint G1, const uint H1,
#ifndef GOFFSET
const u base,
#endif
const uint W16, const uint W17,
const uint PreVal4, const uint PreVal0,
const uint PreW31, const uint PreW32,
const uint PreW19, const uint PreW20,
__global uint * output)
{

u W[124];
u Vals[8];

//Dummy Variable to prevent compiler from reordering between rounds
u t1;

W[16] = W16;
W[17] = W17;

#ifdef VECTORS8
#ifdef GOFFSET
W[3] = ((uint)get_global_id(0) << 3) + (u)(0, 1, 2, 3, 4, 5, 6, 7);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
#else
W[3] = base + (uint)(get_local_id(0)) * 8u + (uint)(get_group_id(0)) * (WORKSIZE * 8u);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
W[18] = PreW20 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U, r ^ 0x8010000U, r ^ 0xA050000U, r ^ 0xC090000U, r ^ 0xE0D0000U};
#endif

#elif defined VECTORS4
#ifdef GOFFSET
W[3] = ((uint)get_global_id(0) << 2) + (u)(0, 1, 2, 3);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
#else

//Less dependencies to get both the local id and group id and then add them
W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
W[18] = PreW20 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U};
#endif
#elif defined VECTORS
#ifdef GOFFSET
W[3] = ((uint)get_global_id(0) << 1) + (u)(0, 1,);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
#else
W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
W[18] = PreW20 + (u){r, r ^ 0x2004000U};
#endif
#else
#ifdef GOFFSET
W[3] = ((uint)get_global_id(0));
uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
#else
W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
W[18] = PreW20 + r;
#endif
#endif
//the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions

//Vals[0]=state0;
Vals[0] = PreVal0 + W[3];
Vals[1]=B1;
Vals[2]=C1;
Vals[3]=D1;
//Vals[4]=PreVal4;
Vals[4] = PreVal4 + W[3];
Vals[5]=F1;
Vals[6]=G1;
Vals[7]=H1;

sharoundC(4);
W[19] = PreW19 + W[3];
sharoundC(5);
W[20] = P4C(20) + P1(20);
sharoundC(6);
W[21] = P1(21);
sharoundC(7);
W[22] = P3C(22) + P1(22);
sharoundC(8);
W[23] = W[16] + P1(23);
sharoundC(9);
W[24] = W[17] + P1(24);
sharoundC(10);
W[25] = P1(25) + P3(25);
W[26] = P1(26) + P3(26);
sharoundC(11);
W[27] = P1(27) + P3(27);
W[28] = P1(28) + P3(28);
sharoundC(12);
W[29] = P1(29) + P3(29);
sharoundC(13);
W[30] = P1(30) + P2C(30) + P3(30);
W[31] = PreW31 + (P1(31) + P3(31));
sharoundC(14);
W[32] = PreW32 + (P1(32) + P3(32));
sharoundC(15);
sharound(16);
sharound(17);
sharound(18);
sharound(19);
sharound(20);
sharound(21);
sharound(22);
sharound(23);
sharound(24);
sharound(25);
sharound(26);
sharound(27);
sharound(28);
sharound(29);
sharound(30);
sharound(31);
sharound(32);
sharoundW(33);
sharoundW(34);
sharoundW(35);
sharoundW(36);
sharoundW(37);
sharoundW(38);
sharoundW(39);
sharoundW(40);
sharoundW(41);
sharoundW(42);
sharoundW(43);
sharoundW(44);
sharoundW(45);
sharoundW(46);
sharoundW(47);
sharoundW(48);
sharoundW(49);
sharoundW(50);
sharoundW(51);
sharoundW(52);
sharoundW(53);
sharoundW(54);
sharoundW(55);
sharoundW(56);
sharoundW(57);
sharoundW(58);
sharoundW(59);
sharoundW(60);
sharoundW(61);
sharoundW(62);
sharoundW(63);

W[64]=state0+Vals[0];
W[65]=state1+Vals[1];
W[66]=state2+Vals[2];
W[67]=state3+Vals[3];
W[68]=state4+Vals[4];
W[69]=state5+Vals[5];
W[70]=state6+Vals[6];
W[71]=state7+Vals[7];

const u Temp = (0xb0edbdd0U + K[0]) + W[64];
Vals[0]=H[0];
Vals[1]=H[1];
Vals[2]=H[2];
Vals[3]=0xa54ff53aU + Temp;
Vals[4]=H[4];
Vals[5]=H[5];
Vals[6]=H[6];
Vals[7]=0x08909ae5U + Temp;

#define P124(n) P2(n) + P1(n) + P4(n)

W[80] = P2(80) + P4(80);
sharound(65);
W[81] = P1C(81) + P2(81) + P4(81);
sharound(66);
W[82] = P124(82);
sharound(67);
W[83] = P124(83);
sharound(68);
W[84] = P124(84);
sharound(69);
W[85] = P124(85);
sharound(70);
W[86] = P4(86) + P3C(86) + P2(86) + P1(86);
sharound(71);
W[87] = P4(87) + P3(87) + P2C(87) + P1(87);
sharoundC(72);
W[88] = P1(88) + P4C(88) + P3(88);
sharoundC(73);
W[89] = P3(89) + P1(89);
sharoundC(74);
W[90] = P3(90) + P1(90);
sharoundC(75);
W[91] = P3(91) + P1(91);
sharoundC(76);
W[92] = P3(92) + P1(92);
sharoundC(77);
W[93] = P1(93) + P3(93);
W[94] = P3(94) + P2C(94) + P1(94);
sharoundC(78);
W[95] = P4C(95) + P3(95) + P2(95) + P1(95);
sharoundC(79);
sharound(80);
sharound(81);
sharound(82);
sharound(83);
sharound(84);
sharound(85);
sharound(86);
sharound(87);
sharound(88);
sharound(89);
sharound(90);
sharound(91);
sharound(92);
sharound(93);
sharound(94);
sharound(95);
sharoundW(96);
sharoundW(97);
sharoundW(98);
sharoundW(99);
sharoundW(100);
sharoundW(101);
sharoundW(102);
sharoundW(103);
sharoundW(104);
sharoundW(105);
sharoundW(106);
sharoundW(107);
sharoundW(108);
sharoundW(109);
sharoundW(110);
sharoundW(111);
sharoundW(112);
sharoundW(113);
sharoundW(114);
sharoundW(115);
sharoundW(116);
sharoundW(117);
sharoundW(118);
sharoundW(119);
sharoundW(120);
sharoundW(121);
sharoundW(122);

u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123),Vals[1],Vals[2]);
u g = -(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123)+ ch(123));

// uint nonce = 0;
#ifdef VECTORS8
uint nonce = (v.s0==g.s0) ? W[3].s0 : 0;
nonce = (v.s1==g.s1) ? W[3].s1 : nonce;
nonce = (v.s2==g.s2) ? W[3].s2 : nonce;
nonce = (v.s3==g.s3) ? W[3].s3 : nonce;
nonce = (v.s4==g.s4) ? W[3].s4 : nonce;
nonce = (v.s5==g.s5) ? W[3].s5 : nonce;
nonce = (v.s6==g.s6) ? W[3].s6 : nonce;
nonce = (v.s7==g.s7) ? W[3].s7 : nonce;
#elif defined VECTORS4
uint nonce = (v.s0==g.s0) ? W[3].s0 : 0;
nonce = (v.s1==g.s1) ? W[3].s1 : nonce;
nonce = (v.s2==g.s2) ? W[3].s2 : nonce;
nonce = (v.s3==g.s3) ? W[3].s3 : nonce;
#elif defined VECTORS
uint nonce = (v.s0==g.s0) ? W[3].s0 : 0;
nonce = (v.s1==g.s1) ? W[3].s1 : nonce;
#else
uint nonce = (v==g) ? W[3] : 0
#endif
if(nonce>0)
{
//Faster to shift the nonce by 2 due to 4-DWORD addressing and does not add more collisions
output[WORKSIZE] = nonce;
output[get_local_id(0)] = nonce;
}
}

Fastest code so far.
Now, is there anything that can be done for multiple found nonce values? Outputting each one separately?

Topic: Phatk2 Mod (Already seeing improvement!) - page 2. (Read 8099 times)