Phoenix - Efficient, fast, modular miner - page 3.

jedi95

full member

Activity: 219

Merit: 120

Quote from: d3m0n1q_733rz on February 02, 2012, 02:18:28 AM

Would someone mind fixing the nonces for me? I'm modding it for 128-bit and seem to have hit a snag. And, just so you know, uint8 DOES work with a worksize of 32! However, without the nonces finished, I don't know how well. I would try uint16 as it works too with a worksize of 1, but the bitshifting is a bit more involved.
After this, I'm going to have to compute some different constants for a SHA-512/256 kernel. I'll need the fractional parts of the seventeenth through twenty-fourth prime numbers as stated here: http://eprint.iacr.org/2010/548.pdf
From that point, I'll attempt a 64-bit, 128-bit and 256-bit version of the SHA-512 algorithm (in one) and then truncate the results to SHA-256. ^_^
The problem is, I'm no math genius so I'm going to need loads of help with this little project.

The problem with uint8 is that you are going to use way more GPRs. Even if you get better efficiency per thread, the higher GPR use it going to reduce the maximum number of threads that can be processed at once. This problem can be minimized with high RAM clocks, but the higher power use might not be worth it.

Also, truncated SHA512 isn't the same as SHA256.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Code:

// This file is taken and modified from the public-domain poclbm project, and
// I have therefore decided to keep it public-domain.
#ifdef VECTORS8 
	typedef uint8 u;
#else
#ifdef VECTORS4
	typedef uint4 u;
#else 
	#ifdef VECTORS
		typedef uint2 u;
	#else
		typedef uint u;
	#endif
#endif
#endif

__constant uint K[64] = { 
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

__constant uint ConstW[128] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000U, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000280U,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,

0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000U, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100U,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
};

__constant uint H[8] = { 
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};


#ifdef BITALIGN
	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
	#define rot(x, y) amd_bitalign(x, x, (uint)(32 - y))
#else
	#define rot(x, y) rotate(x, (uint)y)
#endif

// This part is not from the stock poclbm kernel. It's part of an optimization
// added in the Phoenix Miner.

// Some AMD devices have the BFI_INT opcode, which behaves exactly like the
// SHA-256 Ch function, but provides it in exactly one instruction. If
// detected, use it for Ch. Otherwise, construct Ch out of simpler logical
// primitives.

#ifdef BFI_INT
	// Well, slight problem... It turns out BFI_INT isn't actually exposed to
	// OpenCL (or CAL IL for that matter) in any way. However, there is 
	// a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via
	// amd_bytealign, takes the same inputs, and provides the same output. 
	// We can use that as a placeholder for BFI_INT and have the application 
	// patch it after compilation.
	
	// This is the BFI_INT function
	#define Ch(x, y, z) amd_bytealign(x,y,z)
	// Ma can also be implemented in terms of BFI_INT...
	#define Ma(z, x, y) amd_bytealign(z^x,y,x)
#else
	#define Ch(x, y, z) bitselect(z,y,x)
	#define Ma(x, y, z) bitselect(x,y,(z^x))
#endif

//Various intermediate calculations for each SHA round
#define s0(n) (S0(Vals[(0 + 128 - (n)) % 8]))
#define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u))

#define s1(n) (S1(Vals[(4 + 128 - (n)) % 8]))
#define S1(n) (rot(n, 26u)^rot(n, 21u)^rot(n, 7u))

#define ch(n) Ch(Vals[(4 + 128 - (n)) % 8],Vals[(5 + 128 - (n)) % 8],Vals[(6 + 128 - (n)) % 8])
#define maj(n) Ma(Vals[(1 + 128 - (n)) % 8],Vals[(2 + 128 - (n)) % 8],Vals[(0 + 128 - (n)) % 8])

//t1 calc when W is already calculated
#define t1(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] +  W[(n)] + s1(n) + ch(n) 

//t1 calc which calculates W
#define t1W(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] +  W(n) + s1(n) + ch(n)

//Used for constant W Values (the compiler optimizes out zeros)
#define t1C(n) (K[(n) % 64]+ ConstW[(n)]) + Vals[(7 + 128 - (n)) % 8] + s1(n) + ch(n)

//t2 Calc
#define t2(n)  maj(n) + s0(n)

#define rotC(x,n) (x<> (32-n))

//W calculation used for SHA round
#define W(n) (W[n] = P4(n) + P3(n) + P2(n) + P1(n))


//Partial W calculations (used for the begining where only some values are nonzero)
#define P1(n) ((rot(W[(n)-2],15u)^rot(W[(n)-2],13u)^((W[(n)-2])>>10U)))
#define P2(n) ((rot(W[(n)-15],25u)^rot(W[(n)-15],14u)^((W[(n)-15])>>3U)))
#define p1(x) ((rot(x,15u)^rot(x,13u)^((x)>>10U)))
#define p2(x) ((rot(x,25u)^rot(x,14u)^((x)>>3U)))
#define P3(n)  W[n-7]
#define P4(n)  W[n-16]

//Partial Calcs for constant W values
#define P1C(n) ((rotC(ConstW[(n)-2],15)^rotC(ConstW[(n)-2],13)^((ConstW[(n)-2])>>10U)))
#define P2C(n) ((rotC(ConstW[(n)-15],25)^rotC(ConstW[(n)-15],14)^((ConstW[(n)-15])>>3U)))
#define P3C(x)  ConstW[x-7]
#define P4C(x)  ConstW[x-16]

//SHA round with built in W calc
#define sharoundW(n) Barrier1(n);  Vals[(3 + 128 - (n)) % 8] += t1W(n); Vals[(7 + 128 - (n)) % 8] = t1W(n) + t2(n);  

//SHA round without W calc
#define sharound(n)  Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1(n); Vals[(7 + 128 - (n)) % 8] = t1(n) + t2(n);

//SHA round for constant W values
#define sharoundC(n)  Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1C(n); Vals[(7 + 128 - (n)) % 8] = t1C(n) + t2(n);

//The compiler is stupid... I put this in there only to stop the compiler from (de)optimizing the order
#define Barrier1(n) t1 = t1C((n+1))
#define Barrier2(n) t1 = t1C((n))
//#define Barrier3(n) Barrier2(n)

__kernel
//removed this to allow detection of invalid work size
//__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
void search(	const uint state0, const uint state1, const uint state2, const uint state3,
						const uint state4, const uint state5, const uint state6, const uint state7,
						const uint B1, const uint C1, const uint D1,
						const uint F1, const uint G1, const uint H1,
						const u base,
						const uint W16, const uint W17,
						const uint PreVal4, const uint PreVal0,
						const uint PreW31, const uint PreW32,
						const uint PreW19, const uint PreW20,
						__global uint * output)
{

	u W[124];
	u Vals[8];

//Dummy Variable to prevent compiler from reordering between rounds
	u t1;
	
	//Vals[0]=state0;
	Vals[1]=B1;
	Vals[2]=C1;
	Vals[3]=D1;
	//Vals[4]=PreVal4;
	Vals[5]=F1;
	Vals[6]=G1;
	Vals[7]=H1;

	W[16] = W16;
	W[17] = W17;

#ifdef VECTORS8
	W[3] = base + (uint)(get_local_id(0)) * 8u + (uint)(get_group_id(0)) * (WORKSIZE * 8u);
	uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
	//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
	W[18] = PreW20 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U, r ^ 0x8010000U, r ^ 0xA050000U, r ^ 0xC090000U, r ^ 0xE0D0000U};
#else


#ifdef VECTORS4
	//Less dependencies to get both the local id and group id and then add them
	W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
	uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
	//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
	W[18] = PreW20 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U};
#else
	#ifdef VECTORS
		W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
		uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
		W[18] = PreW20 + (u){r, r ^ 0x2004000U};
	#else
		W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
		u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
		W[18] = PreW20 + r;
	#endif
#endif
#endif

	//the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions

	Vals[4] = PreVal4 + W[3];
	Vals[0] = PreVal0 + W[3];

	sharoundC(4);
	W[19] = PreW19 + W[3];
	sharoundC(5);
	W[20] = P4C(20) + P1(20);
	sharoundC(6);
	W[21] = P1(21);
	sharoundC(7);
	W[22] = P3C(22) + P1(22);
	sharoundC(8);
	W[23] = W[16] + P1(23);
	sharoundC(9);
	W[24] = W[17] + P1(24);
	sharoundC(10);
	W[25] = P1(25) + P3(25);
	W[26] = P1(26) + P3(26);
	sharoundC(11);
	W[27] = P1(27) + P3(27);
	W[28] = P1(28) + P3(28);
	sharoundC(12);
	W[29] = P1(29) + P3(29);
	sharoundC(13);
	W[30] = P1(30) + P2C(30) + P3(30);
	W[31] = PreW31 + (P1(31) + P3(31));
	sharoundC(14);
	W[32] = PreW32 + (P1(32) + P3(32));
	sharoundC(15);
	sharound(16);
	sharound(17);
	sharound(18);
	sharound(19);
	sharound(20);
	sharound(21);
	sharound(22);
	sharound(23);
	sharound(24);
	sharound(25);
	sharound(26);
	sharound(27);
	sharound(28);
	sharound(29);
	sharound(30);
	sharound(31);
	sharound(32);
	sharoundW(33);
	sharoundW(34);
	sharoundW(35);
	sharoundW(36);
	sharoundW(37);
	sharoundW(38);
	sharoundW(39);
	sharoundW(40);
	sharoundW(41);
	sharoundW(42);
	sharoundW(43);
	sharoundW(44);
	sharoundW(45);
	sharoundW(46);
	sharoundW(47);
	sharoundW(48);
	sharoundW(49);
	sharoundW(50);
	sharoundW(51);
	sharoundW(52);
	sharoundW(53);
	sharoundW(54);
	sharoundW(55);
	sharoundW(56);
	sharoundW(57);
	sharoundW(58);
	sharoundW(59);
	sharoundW(60);
	sharoundW(61);
	sharoundW(62);
	sharoundW(63);

	W[64]=state0+Vals[0];
	W[65]=state1+Vals[1];
	W[66]=state2+Vals[2];
	W[67]=state3+Vals[3];
	W[68]=state4+Vals[4];
	W[69]=state5+Vals[5];
	W[70]=state6+Vals[6];
	W[71]=state7+Vals[7];

//const u Temp = (0xb0edbdd0U + K[0]) +  W[64];

	Vals[0]=H[0];
	Vals[1]=H[1];
	Vals[2]=H[2];
	Vals[3] = 0xa54ff53aU + (0xb0edbdd0U + K[0]) +  W[64];
//Vals[3]=H[3];
	Vals[4]=H[4];
	Vals[5]=H[5];
	Vals[6]=H[6];
	Vals[7] = 0x08909ae5U + (0xb0edbdd0U + K[0]) +  W[64];
//Vals[7]=H[7];

	
#define P124(n) P2(n) + P1(n) + P4(n)

	W[64 + 16] = + P2(64 + 16) + P4(64 + 16);
	sharound(64 + 1);
	W[64 + 17] = P1C(64 + 17) + P2(64 + 17) + P4(64 + 17);
	sharound(64 + 2);
	W[64 + 18] = P124(64 + 18);
	sharound(64 + 3);
	W[64 + 19] = P124(64 + 19);
	sharound(64 + 4);
	W[64 + 20] = P124(64 + 20);
	sharound(64 + 5);
	W[64 + 21] = P124(64 + 21);
	sharound(64 + 6);
	W[64 + 22] = P4(64 + 22) + P3C(64 + 22) + P2(64 + 22) + P1(64 + 22);
	sharound(64 + 7);
	W[64 + 23] = P4(64 + 23) + P3(64 + 23) + P2C(64 + 23) + P1(64 + 23);
	sharoundC(64 + 8);
	W[64 + 24] =   P1(64 + 24) + P4C(64 + 24) + P3(64 + 24);
	sharoundC(64 + 9);
	W[64 + 25] = P3(64 + 25) + P1(64 + 25);
	sharoundC(64 + 10);
	W[64 + 26] = P3(64 + 26) + P1(64 + 26);
	sharoundC(64 + 11);
	W[64 + 27] = P3(64 + 27) + P1(64 + 27);
	sharoundC(64 + 12);
	W[64 + 28] = P3(64 + 28) + P1(64 + 28);
	sharoundC(64 + 13);
	W[64 + 29] = P1(64 + 29) + P3(64 + 29);
	W[64 + 30] = P3(64 + 30) + P2C(64 + 30) + P1(64 + 30);
	sharoundC(64 + 14);
	W[64 + 31] = P4C(64 + 31) + P3(64 + 31) + P2(64 + 31) + P1(64 + 31);
	sharoundC(64 + 15);
	sharound(64 + 16);
	sharound(64 + 17);
	sharound(64 + 18);
	sharound(64 + 19);
	sharound(64 + 20);
	sharound(64 + 21);
	sharound(64 + 22);
	sharound(64 + 23);
	sharound(64 + 24);
	sharound(64 + 25);
	sharound(64 + 26);
	sharound(64 + 27);
	sharound(64 + 28);
	sharound(64 + 29);
	sharound(64 + 30);
	sharound(64 + 31);
	sharoundW(64 + 32);
	sharoundW(64 + 33);
	sharoundW(64 + 34);
	sharoundW(64 + 35);
	sharoundW(64 + 36);
	sharoundW(64 + 37);
	sharoundW(64 + 38);
	sharoundW(64 + 39);
	sharoundW(64 + 40);
	sharoundW(64 + 41);
	sharoundW(64 + 42);
	sharoundW(64 + 43);
	sharoundW(64 + 44);
	sharoundW(64 + 45);
	sharoundW(64 + 46);
	sharoundW(64 + 47);
	sharoundW(64 + 48);
	sharoundW(64 + 49);
	sharoundW(64 + 50);
	sharoundW(64 + 51);
	sharoundW(64 + 52);
	sharoundW(64 + 53);
	sharoundW(64 + 54);
	sharoundW(64 + 55);
	sharoundW(64 + 56);
	sharoundW(64 + 57);
	sharoundW(64 + 58);

	
	u g = -(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64));
	u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]); 
	
	uint nonce = 0;

//#ifdef VECTORS8
//		if (v.x == g.x)
//	{
//		nonce = W[3].x;
//	}
//	if (v.y == g.y)
//	{
//		nonce = W[3].y;
//	}
//	if (v.z == g.z)
//	{
//		nonce = W[3].z;
//	}
//	if (v.w == g.w)
//	{
//		nonce = W[3].w;
//	}
#ifdef VECTORS4
	if (v.x == g.x)
	{
		nonce = W[3].x;
	}
	if (v.y == g.y)
	{
		nonce = W[3].y;
	}
	if (v.z == g.z)
	{
		nonce = W[3].z;
	}
	if (v.w == g.w)
	{
		nonce = W[3].w;
	}
#else
	#ifdef VECTORS
		if (v.x == g.x)
		{
			nonce = W[3].x;
		}
		if (v.y == g.y)
		{
			nonce = W[3].y;
		}
	#else
		if (v == g)
		{
			nonce = W[3];
		}
	#endif
#endif
//#endif
	if(nonce)
	{
		//Faster to shift the nonce by 2 due to 4-DWORD addressing and does not add more collisions
		output[OUTPUT_SIZE] = nonce;
		output[get_local_id(0)] = nonce;
	}
}

Would someone mind fixing the nonces for me? I'm modding it for 128-bit and seem to have hit a snag. And, just so you know, uint8 DOES work with a worksize of 32! However, without the nonces finished, I don't know how well. I would try uint16 as it works too with a worksize of 1, but the bitshifting is a bit more involved.
After this, I'm going to have to compute some different constants for a SHA-512/256 kernel. I'll need the fractional parts of the seventeenth through twenty-fourth prime numbers as stated here: http://eprint.iacr.org/2010/548.pdf
From that point, I'll attempt a 64-bit, 128-bit and 256-bit version of the SHA-512 algorithm (in one) and then truncate the results to SHA-256. ^_^
The problem is, I'm no math genius so I'm going to need loads of help with this little project.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

So umm...I was looking at my code and it seems that it's actually not faster. : /
If there was some way to combine some of the writes (around the Vals), then I think it could be sped up a little. But right now, I don't think I'm smarter than the compiler. I've been working with assembly too long and I find that the rules are backward. So thanks for taking the time.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: Diapolo on January 29, 2012, 05:21:17 AM

The fast-math thing is only for floating point operations, should be pointless.

By the way, if you guys like to play around with new kernels you could have a look here: https://bitcointalksearch.org/topic/diakgcn-kernel-for-cgminer-phoenix-2-79xx-78xx-77xx-gcn-2012-05-25-61406

Dia

Ironically, I was going to be asking when the heck a GCN miner was going to come out. Granted, I don't have the hardware and I doubt many people will at this point. But, I'll see what I can accomplish with the HD5450. And thanks for the info on floating point ops. I'll see if I can "organize" the code if it isn't already and start editing things out. Granted, I can't find any decent free resources for OpenCL operations and accepted code, but I'm looking. It's not like assembly where I can just go to a reference guide and "Oh! That's what I'm looking for!"
Any resources would be helpful if you can provide me with them. I'm still at novice level, but I'm trying. I get confused on how to follow the rotations, xors, etc. I just use logic to edit out already used variables or cut down on the amount of work the GPU has to do and let you handle the bigger math until I figure out a way to remember it. : /

Diapolo

hero member

Activity: 772

Merit: 500

The fast-math thing is only for floating point operations, should be pointless.

By the way, if you guys like to play around with new kernels you could have a look here: https://bitcointalksearch.org/topic/diakgcn-kernel-for-cgminer-phoenix-2-79xx-78xx-77xx-gcn-2012-05-25-61406

Dia

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

jedi95

full member

Activity: 219

Merit: 120

deepceleron

legendary

Activity: 1512

Merit: 1036

Quote from: d3m0n1q_733rz on January 28, 2012, 08:16:05 AM

Okay, I think I have the kinds worked out mostly. Give it a try and tell me what you think. It's phak2 with a few simple logic mods. I would try the AMD Kernel Analyzer tool, but it doesn't seem to work right for me. So, I just go off of hash rate and accepts or rejects/errors. So far, no errors. But I'm squeezing out a few more decimal place hashes.

Here's an online diff of original phatk2 and yours, I'll check it out on my card that I've already done five-sig-fig benchmarking on. The kernel is from Phateus, you might update his kernel thread with your findings.

edit: You've done it - you now have the fastest kernel. 341.19 ⇒ 341.60 on a 5830 using the same parameters and simply using the new kernel. phatk2 at the below settings was the highest possible Mhash/s that could be extracted from the card previously.

Sapphire 5830 -2L @ 1050/380 (benchmarking core speed; peak phatk2 output @ core speed)
System OpenCL 2.5.793.1, driver 11.11, WinXP
kernel parameters: VECTORS AGGRESSION=12 FASTLOOP=False WORKSIZE=256

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Okay, I think I have the kinds worked out mostly. Give it a try and tell me what you think. It's phak2 with a few simple logic mods. I would try the AMD Kernel Analyzer tool, but it doesn't seem to work right for me. So, I just go off of hash rate and accepts or rejects/errors. So far, no errors. But I'm squeezing out a few more decimal place hashes.

Code:

// This file is taken and modified from the public-domain poclbm project, and
// I have therefore decided to keep it public-domain.


#ifdef VECTORS4
	typedef uint4 u;
#else 
	#ifdef VECTORS
		typedef uint2 u;
	#else
		typedef uint u;
	#endif
#endif

__constant uint K[64] = { 
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

__constant uint ConstW[128] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000U, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000280U,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,

0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000U, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100U,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
};

__constant uint H[8] = { 
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};


#ifdef BITALIGN
	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
	
	#define rot(x, y) amd_bitalign(x, x, (uint)(32 - y))
#else
	#define rot(x, y) rotate(x, (uint)y)
#endif

// This part is not from the stock poclbm kernel. It's part of an optimization
// added in the Phoenix Miner.

// Some AMD devices have the BFI_INT opcode, which behaves exactly like the
// SHA-256 Ch function, but provides it in exactly one instruction. If
// detected, use it for Ch. Otherwise, construct Ch out of simpler logical
// primitives.

#ifdef BFI_INT
	// Well, slight problem... It turns out BFI_INT isn't actually exposed to
	// OpenCL (or CAL IL for that matter) in any way. However, there is 
	// a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via
	// amd_bytealign, takes the same inputs, and provides the same output. 
	// We can use that as a placeholder for BFI_INT and have the application 
	// patch it after compilation.
	
	// This is the BFI_INT function
	#define Ch(x, y, z) amd_bytealign(x,y,z)
	// Ma can also be implemented in terms of BFI_INT...
	#define Ma(z, x, y) amd_bytealign(z^x,y,x)
#else
	#define Ch(x, y, z) bitselect(z,y,x)
	#define Ma(x, y, z) bitselect(x,y,(z^x))
#endif

//Various intermediate calculations for each SHA round
#define s0(n) (S0(Vals[(128 - (n)) % 8]))
#define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u))

#define s1(n) (S1(Vals[(132 - (n)) % 8]))
#define S1(n) (rot(n, 26u)^rot(n, 21u)^rot(n, 7u))

#define ch(n) Ch(Vals[(132 - (n)) % 8],Vals[(133 - (n)) % 8],Vals[(134 - (n)) % 8])
#define maj(n) Ma(Vals[(129 - (n)) % 8],Vals[(130 - (n)) % 8],Vals[(128 - (n)) % 8])

//t1 calc when W is already calculated
#define t1(n) K[(n) % 64] + Vals[(135 - (n)) % 8] +  W[(n)] + s1(n) + ch(n) 

//t1 calc which calculates W
#define t1W(n) K[(n) % 64] + Vals[(135 - (n)) % 8] +  W(n) + s1(n) + ch(n)

//Used for constant W Values (the compiler optimizes out zeros)
#define t1C(n) (K[(n) % 64] + ConstW[(n)]) + Vals[(135 - (n)) % 8] + s1(n) + ch(n)

//t2 Calc
#define t2(n)  maj(n) + s0(n)

#define rotC(x,n) (x<> (32-n))

//W calculation used for SHA round
#define W(n) (W[n] = P4(n) + P3(n) + P2(n) + P1(n))


//Partial W calculations (used for the begining where only some values are nonzero)
#define P1(n) ((rot(W[(n)-2],15u)^rot(W[(n)-2],13u)^((W[(n)-2])>>10U)))
#define P2(n) ((rot(W[(n)-15],25u)^rot(W[(n)-15],14u)^((W[(n)-15])>>3U)))
#define p1(x) ((rot(x,15u)^rot(x,13u)^((x)>>10U)))
#define p2(x) ((rot(x,25u)^rot(x,14u)^((x)>>3U)))
#define P3(n)  W[n-7]
#define P4(n)  W[n-16]

//Partial Calcs for constant W values
#define P1C(n) ((rotC(ConstW[(n)-2],15)^rotC(ConstW[(n)-2],13)^((ConstW[(n)-2])>>10U)))
#define P2C(n) ((rotC(ConstW[(n)-15],25)^rotC(ConstW[(n)-15],14)^((ConstW[(n)-15])>>3U)))
#define P3C(x)  ConstW[x-7]
#define P4C(x)  ConstW[x-16]

//SHA round with built in W calc
#define sharoundW(n) Barrier1(n);  Vals[(131 - (n)) % 8] += t1W(n); Vals[(135 - (n)) % 8] = t1W(n) + t2(n);  

//SHA round without W calc
#define sharound(n)  Barrier2(n); Vals[(131 - (n)) % 8] += t1(n); Vals[(135 - (n)) % 8] = t1(n) + t2(n);

//SHA round for constant W values
#define sharoundC(n)  Barrier3(n); Vals[(131 - (n)) % 8] += t1C(n); Vals[(135 - (n)) % 8] = t1C(n) + t2(n);

//The compiler is stupid... I put this in there only to stop the compiler from (de)optimizing the order

#define Barrier1(n) t1 = t1C((n+1))
#define Barrier2(n) t1 = t1C((n))
#define Barrier3(n) Barrier2(n)

__kernel
//removed this to allow detection of invalid work size
//__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
void search(	const uint state0, const uint state1, const uint state2, const uint state3,
						const uint state4, const uint state5, const uint state6, const uint state7,
						const uint B1, const uint C1, const uint D1,
						const uint F1, const uint G1, const uint H1,
						const u base,
						const uint W16, const uint W17,
						const uint PreVal4, const uint PreVal0,
						const uint PreW31, const uint PreW32,
						const uint PreW19, const uint PreW20,
						__global uint * output)
{

	u W[124];
	u Vals[8];

//Dummy Variable to prevent compiler from reordering between rounds
	u t1;
	
	W[16] = W16;
	W[17] = W17;

#ifdef VECTORS4
	//Less dependencies to get both the local id and group id and then add them
	W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
	uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
	//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
	W[18] = PreW20 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U};
#else
	#ifdef VECTORS
		W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
		uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
		W[18] = PreW20 + (u){r, r ^ 0x2004000U};
	#else
		W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
		u r = PreW20 + rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
		W[18] = PreW20 + r;
	#endif
#endif
	//the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions

	Vals[0]=PreVal0 + W[3];
	Vals[1]=B1;
	Vals[2]=C1;
	Vals[3]=D1;
	Vals[4]=PreVal4 + W[3];	
	Vals[5]=F1;
	Vals[6]=G1;
	Vals[7]=H1;

	sharoundC(4);
	W[19] = PreW19 + W[3];
	sharoundC(5);
	W[20] = P1(20) + P4C(20);
	sharoundC(6);
	W[21] = P1(21);
	sharoundC(7);
	W[22] = P1(22) + P3C(22);
	sharoundC(8);
	W[23] = W[16] + P1(23);
	sharoundC(9);
	W[24] = W[17] + P1(24);
	sharoundC(10);
	W[25] = P1(25) + P3(25);
	W[26] = P1(26) + P3(26);
	sharoundC(11);
	W[27] = P1(27) + P3(27);
	W[28] = P1(28) + P3(28);
	sharoundC(12);
	W[29] = P1(29) + P3(29);
	sharoundC(13);
	W[30] = P1(30) + P3(30) + P2C(30);
	W[31] = PreW31 + P1(31) + P3(31);
	sharoundC(14);
	W[32] = PreW32 + P1(32) + P3(32);
	sharoundC(15);
	sharound(16);
	sharound(17);
	sharound(18);
	sharound(19);
	sharound(20);
	sharound(21);
	sharound(22);
	sharound(23);
	sharound(24);
	sharound(25);
	sharound(26);
	sharound(27);
	sharound(28);
	sharound(29);
	sharound(30);
	sharound(31);
	sharound(32);
	sharoundW(33);
	sharoundW(34);
	sharoundW(35);
	sharoundW(36);
	sharoundW(37);
	sharoundW(38);
	sharoundW(39);
	sharoundW(40);
	sharoundW(41);
	sharoundW(42);
	sharoundW(43);
	sharoundW(44);
	sharoundW(45);
	sharoundW(46);
	sharoundW(47);
	sharoundW(48);
	sharoundW(49);
	sharoundW(50);
	sharoundW(51);
	sharoundW(52);
	sharoundW(53);
	sharoundW(54);
	sharoundW(55);
	sharoundW(56);
	sharoundW(57);
	sharoundW(58);
	sharoundW(59);
	sharoundW(60);
	sharoundW(61);
	sharoundW(62);
	sharoundW(63);

	W[64]=state0+Vals[0];
	W[65]=state1+Vals[1];
	W[66]=state2+Vals[2];
	W[67]=state3+Vals[3];
	W[68]=state4+Vals[4];
	W[69]=state5+Vals[5];
	W[70]=state6+Vals[6];
	W[71]=state7+Vals[7];

	const u Temp = (0xb0edbdd0U + K[0]) +  W[64];

//Vals 5 & 6 are not used but gives us a complete chunk instead of a partial

	Vals[0]=H[0];
	Vals[1]=H[1];
	Vals[2]=H[2];
	Vals[3]=Temp + 0xa54ff53aU;
	Vals[4]=H[4];
	Vals[5]=H[5];
	Vals[6]=H[6];
	Vals[7]=Temp + 0x08909ae5U;



//#define P124(n) P1(n) + P2(n) + P4(n)

	W[80] = P2(80) + P4(80);
	sharound(65);
	W[81] = P1C(81) + P2(81) + P4(81);
	sharound(66);
	W[82] = P1(82) + P2(82) + P4(82);
	sharound(67);
	W[83] = P1(83) + P2(83) + P4(83);
	sharound(68);
	W[84] = P1(84) + P2(84) + P4(84);
	sharound(69);
	W[85] = P1(85) + P2(85) + P4(85);
	sharound(70);
	W[86] = P1(86) + P2(86) + P4(86) + P3C(86);
	sharound(71);
	W[87] = P1(87) + P2C(87) + P3(87) + P4(87);
	sharoundC(72);
	W[88] = P1(88) + P3(88) + P4C(88);
	sharoundC(73);
	W[89] = P1(89) + P3(89);
	sharoundC(74);
	W[90] = P1(90) + P3(90);
	sharoundC(75);
	W[91] = P1(91) + P3(91);
	sharoundC(76);
	W[92] = P1(92) + P3(92);
	sharoundC(77);
	W[93] = P1(93) + P3(93);
	W[94] = P1(94) + P3(94) + P2C(94);
	sharoundC(78);
	W[95] = P1(95) + P2(95) + P3(95) + P4C(95);
	sharoundC(79);
	sharound(80);
	sharound(81);
	sharound(82);
	sharound(83);
	sharound(84);
	sharound(85);
	sharound(86);
	sharound(87);
	sharound(88);
	sharound(89);
	sharound(90);
	sharound(91);
	sharound(92);
	sharound(93);
	sharound(94);
	sharound(95);
	sharoundW(96);
	sharoundW(97);
	sharoundW(98);
	sharoundW(99);
	sharoundW(100);
	sharoundW(101);
	sharoundW(102);
	sharoundW(103);
	sharoundW(104);
	sharoundW(105);
	sharoundW(106);
	sharoundW(107);
	sharoundW(108);
	sharoundW(109);
	sharoundW(110);
	sharoundW(111);
	sharoundW(112);
	sharoundW(113);
	sharoundW(114);
	sharoundW(115);
	sharoundW(116);
	sharoundW(117);
	sharoundW(118);
	sharoundW(119);
	sharoundW(120);
	sharoundW(121);
	sharoundW(122);

	const u precomp = (Vals[0] + Vals[4]) + (K[59] + W(123)) + s1(123) + ch(123);

	u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((precomp),Vals[1],Vals[2]); 
	u g = -(K[60] + H[7]) - S1(precomp);

	uint nonce = 0;
#ifdef VECTORS4
	if (v.x == g.x)
	{
		nonce = W[3].x;
	}
	if (v.y == g.y)
	{
		nonce = W[3].y;
	}
	if (v.z == g.z)
	{
		nonce = W[3].z;
	}
	if (v.w == g.w)
	{
		nonce = W[3].w;
	}
#else
	#ifdef VECTORS
		if (v.x == g.x)
		{
			nonce = W[3].x;
		}
		if (v.y == g.y)
		{
			nonce = W[3].y;
		}
	#else
		if (v == g)
		{
			nonce = W[3];
		}
	#endif
#endif
	if(nonce)
	{
		//Faster to shift the nonce by 2 due to 4-DWORD addressing and does not add more collisions
		output[OUTPUT_SIZE] = nonce;
		output[get_local_id(0)] = nonce;
	}
}

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

I'm working on some optimizations to the phatk2 kernel, but I hit a small hitch. Working out the kinks now. Grr...

TurdHurdur

full member

Activity: 216

Merit: 100

Quote from: jedi95 on January 26, 2012, 06:34:38 PM

Looks like I missed updating the "work from previous block" check when I added X-Work-Identifier. Basically the error occurs under P2Pool because self.lastblock is actually the P2Pool identifier from the previous 'block', and not the bitcoin previous block hash.

I have fixed this in 1.7.5 on GitHub, but I am currently away from my main computer to compile a Windows binary. (will be up later tonight) Let me know if this resolves the issue.

It's working now.

jedi95

full member

Activity: 219

Merit: 120

Quote from: TurdHurdur on January 26, 2012, 06:04:16 PM

While attempting to use p2pool I get this error:

Code:

/usr/local/src/phoenix-1.7.4/WorkQueue.py:80: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  if self.lastBlock is not None and (wu.data[4:36] == self.lastBlock):

Any idea why/solution?

Looks like I missed updating the "work from previous block" check when I added X-Work-Identifier. Basically the error occurs under P2Pool because self.lastblock is actually the P2Pool identifier from the previous 'block', and not the bitcoin previous block hash.

I have fixed this in 1.7.5 on GitHub, but I am currently away from my main computer to compile a Windows binary. (will be up later tonight) Let me know if this resolves the issue.

TurdHurdur

full member

Activity: 216

Merit: 100

While attempting to use p2pool I get this error:

Code:

/usr/local/src/phoenix-1.7.4/WorkQueue.py:80: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  if self.lastBlock is not None and (wu.data[4:36] == self.lastBlock):

Any idea why/solution?

allinvain

legendary

Activity: 3080

Merit: 1083

Quote from: pooler on January 24, 2012, 07:01:37 AM

At least with versions 1.7.2 and 1.7.4, Phoenix doesn't quit when I hit Ctrl-C. I am using Gentoo Linux and GNU screen.
Version 1.6.4, however, doesn't have this problem. I haven't tried versions between 1.6.4 and 1.7.2.

It's funny you mention that cause I actually liked it when it quit if I sent it Ctrl+C. For the longest time that was like sending it a BREAK signal. Now the new way to send it a break signal is Ctrl-\

pooler

hero member

Activity: 849

Merit: 507

At least with versions 1.7.2 and 1.7.4, Phoenix doesn't quit when I hit Ctrl-C. I am using Gentoo Linux and GNU screen.
Version 1.6.4, however, doesn't have this problem. I haven't tried versions between 1.6.4 and 1.7.2.

jedi95

full member

Activity: 219

Merit: 120

Quote from: Diapolo on January 23, 2012, 01:30:57 AM

Quote from: deepceleron on January 21, 2012, 09:11:05 PM

Just for curiosity I thought I'd see if there's any hashrate to be gained with compiling pyOpenCL in Visual Studio 2010 with Stream SDK 2.6 RC3:

Python 2.7.2
Base-12.1.1.win32-py2.7
numpy-MKL-1.6.1.win32-py2.7-2011-10-29
scipy-0.10.0.win32-py2.7
zope.interface-3.8.0.win32-py2.7
Twisted-11.1.0.win32-py2.7

I compiled boost_1_48_0 multithreaded in msvc-10.0, I now have boost_python-vc100-mt-1_48.dll. Compiled pyopencl-0.92 after doing the manifest tweaks and env variables to get it to work.

Results? Exactly the same 224.00 MHash/s as Phoenix 1.7.4 exe gives me. Yay. Three hours I won't get back... At least my python isn't slower than the exe's python any more.

I did some further tests with 1.7.3 and the latest pyOpenCL, too and have to say I never got a miner idle message nor other problems. So it would be nice if we were able to chose, which version pyOpenCL version we want to download jedi

.

Thanks,
Dia

The bugs that caused the miner to idle were all related to the RPC implementation. Using twisted.web for the RPC backend always caused problems so that's why we re-wrote it to use httplib instead for 1.7.0. The extra delay getting work won't be enough to cause the miner to idle thanks to the work queue. (which is around 10 seconds of stored work on a fast miner like a 5870)

For those who want to use other versions of PyOpenCL I suggest you run Phoenix from source rather than use the compiled binaries. Supporting more than one official binary is not something I want to deal with. If you want a binary with the newer versions you can always compile one yourself.

Diapolo

hero member

Activity: 772

Merit: 500

Quote from: deepceleron on January 21, 2012, 09:11:05 PM

Just for curiosity I thought I'd see if there's any hashrate to be gained with compiling pyOpenCL in Visual Studio 2010 with Stream SDK 2.6 RC3:

Python 2.7.2
Base-12.1.1.win32-py2.7
numpy-MKL-1.6.1.win32-py2.7-2011-10-29
scipy-0.10.0.win32-py2.7
zope.interface-3.8.0.win32-py2.7
Twisted-11.1.0.win32-py2.7

I compiled boost_1_48_0 multithreaded in msvc-10.0, I now have boost_python-vc100-mt-1_48.dll. Compiled pyopencl-0.92 after doing the manifest tweaks and env variables to get it to work.

Results? Exactly the same 224.00 MHash/s as Phoenix 1.7.4 exe gives me. Yay. Three hours I won't get back... At least my python isn't slower than the exe's python any more.

I did some further tests with 1.7.3 and the latest pyOpenCL, too and have to say I never got a miner idle message nor other problems. So it would be nice if we were able to chose, which version pyOpenCL version we want to download jedi

.

Thanks,
Dia

Bananington

sr. member

Activity: 1470

Merit: 428

I use this miner. It's pretty great with Diapolo's modified Kernel for SDK 2.6.

Math Man

full member

Activity: 150

Merit: 100

I'm running 1.7.2. What does the following mean regarding a rejected share?

Code:

TypeError in RPC sendResult callback

jedi95

full member

Activity: 219

Merit: 120

Quote from: allinvain on January 21, 2012, 05:36:40 PM

Quote from: jedi95 on January 21, 2012, 04:21:40 PM

Version 1.7.4 has been released.

Changes:
- Added X-Work-Identifier support to RPC for better compatibility with P2Pool
- Tweaked kernel WORKSIZE validation

Download

Latest version: 1.7.4
Windows binaries
Source code/Linux release (requires Python, Twisted, and PyOpenCL)

GitHub:
https://github.com/jedi95/Phoenix-Miner

Ummm, I think you forgot to either change the version number or did not upload the 1.7.4 source yet cause when I dl the latest git tarball it still says 1.7.3..just thought I'd let ya know :p

Thanks, fixed.

Topic: Phoenix - Efficient, fast, modular miner - page 3. (Read 760945 times)