Phoenix - Efficient, fast, modular miner - page 2.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Last phatk2 kernel update for those with GCN (HD79xx) cards. This is a mod that I've tossed together that allows the use of 8 out of the 16 available vectors in these cards' SIMD. I don't know how well it'll work for you, but I need someone with a 79xx series card to test it out for me. I'll port it over to Phoenix 2 later, but this is what I have now. Play around with the settings if you would to "dial in" the right combination. If anyone wants to donate bitcoin toward a GCN card so I can test the kernel mods myself as I go, I would appreciate it. I'm trying to expand the miners to the full capabilities of the cards. So, this means that I might be attempting a new kernel as well though I can't promise anything that big. But I have some ideas on what I want to achieve and how.

Code:

// This file is taken and modified from the public-domain poclbm project, and
// I have therefore decided to keep it public-domain.
// Pick the per-work-item word type `u` from the vector-width build flag.
// The first flag that is defined wins, checked widest first:
//   VECTORS8 -> uint8, VECTORS4 -> uint4, VECTORS -> uint2.
// With none of them defined the kernel runs scalar and `u` is plain uint.
#if defined(VECTORS8)
typedef uint8 u;
#elif defined(VECTORS4)
typedef uint4 u;
#elif defined(VECTORS)
typedef uint2 u;
#else
typedef uint u;
#endif

// Per-round additive constants, read as K[(n) % 64] by the t1/t1W/t1C macros
// so the same table serves both hash passes (rounds 0..63 and 64..127).
// The values are the 64 standard SHA-256 round constants.
__constant uint K[64] = { 
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

// Message-schedule words that are compile-time constants, indexed by the same
// round number n used by the round macros: entries 0..63 belong to the first
// hash pass, entries 64..127 to the second. Read by t1C and by the P1C..P4C
// partial-schedule macros.
// Only four entries are non-zero:
//   [4]  = 0x80000000   [15] = 0x00000280
//   [72] = 0x80000000   [79] = 0x00000100
// NOTE(review): these look like the SHA-256 padding marker and the message
// bit lengths (0x280 = 640, 0x100 = 256) of the two hashed blocks - confirm
// against the host-side midstate code.
__constant uint ConstW[128] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000U, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000280U,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,

0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000U, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100U,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
};

// Initial working values loaded into Vals[] at the start of the second hash
// pass (H[7] is also folded into the final comparison value).
// The values are the standard SHA-256 initial hash words.
__constant uint H[8] = { 
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};


// rot(x, y): rotate x left by y bits.
// Default build: the OpenCL built-in rotate().
// BITALIGN build: enable cl_amd_media_ops and express the same rotation as
// amd_bitalign(x, x, 32 - y).
#ifndef BITALIGN
#define rot(x, y) rotate(x, (uint)y)
#else
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#define rot(x, y) amd_bitalign(x, x, (uint)(32 - y))
#endif

// This part is not from the stock poclbm kernel. It's part of an optimization
// added in the Phoenix Miner.

// Some AMD devices have the BFI_INT opcode, which behaves exactly like the
// SHA-256 Ch function, but provides it in exactly one instruction. If
// detected, use it for Ch. Otherwise, construct Ch out of simpler logical
// primitives.

#ifndef BFI_INT
	// Portable build: Ch and Ma are both expressed with the bitselect built-in.
	#define Ch(x, y, z) bitselect(z,y,x)
	#define Ma(x, y, z) bitselect(x,y,(z^x))
#else
	// BFI_INT build. BFI_INT itself is not exposed to OpenCL (or CAL IL), so
	// the kernel emits BYTE_ALIGN_INT through amd_bytealign instead: it takes
	// the same inputs and provides the same output, and serves purely as a
	// placeholder that the host application patches into BFI_INT after
	// compilation.

	// Ch written as the (to-be-patched) BFI_INT instruction
	#define Ch(x, y, z) amd_bytealign(x,y,z)
	// Ma can be implemented in terms of BFI_INT as well
	#define Ma(z, x, y) amd_bytealign(z^x,y,x)
#endif

//Various intermediate calculations for each SHA round.
//Vals[] holds the eight working variables. They are never shifted; instead
//every macro selects its slot with (k + 128 - n) % 8, so the role of each
//slot moves by one position per round n.
#define s0(n) (S0(Vals[(0 + 128 - (n)) % 8]))
#define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u))

#define s1(n) (S1(Vals[(4 + 128 - (n)) % 8]))
#define S1(n) (rot(n, 26u)^rot(n, 21u)^rot(n, 7u))

#define ch(n) Ch(Vals[(4 + 128 - (n)) % 8],Vals[(5 + 128 - (n)) % 8],Vals[(6 + 128 - (n)) % 8])
#define maj(n) Ma(Vals[(1 + 128 - (n)) % 8],Vals[(2 + 128 - (n)) % 8],Vals[(0 + 128 - (n)) % 8])

//t1 calc when W[n] is already calculated
#define t1(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] +  W[(n)] + s1(n) + ch(n) 

//t1 calc which also calculates (and stores) W[n] through the W(n) macro
#define t1W(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] +  W(n) + s1(n) + ch(n)

//Used for constant W values (the compiler optimizes out zeros)
#define t1C(n) (K[(n) % 64]+ ConstW[(n)]) + Vals[(7 + 128 - (n)) % 8] + s1(n) + ch(n)

//t2 calc
#define t2(n)  maj(n) + s0(n)

//Rotate-left by n bits built from plain shifts, for use on the scalar
//ConstW entries (see P1C/P2C). x must be a 32-bit unsigned value and
//0 < n < 32. Both arguments are evaluated twice.
#define rotC(x,n) (((x) << (n)) | ((x) >> (32 - (n))))

//W calculation used for SHA round: computes the schedule word and stores it
//into W[n] as a side effect, yielding the stored value.
#define W(n) (W[n] = P4(n) + P3(n) + P2(n) + P1(n))


//Partial W calculations (used for the beginning where only some values are nonzero)
//P1/P2 take a schedule index n; p1/p2 apply the same mixing to a value x.
#define P1(n) ((rot(W[(n)-2],15u)^rot(W[(n)-2],13u)^((W[(n)-2])>>10U)))
#define P2(n) ((rot(W[(n)-15],25u)^rot(W[(n)-15],14u)^((W[(n)-15])>>3U)))
#define p1(x) ((rot(x,15u)^rot(x,13u)^((x)>>10U)))
#define p2(x) ((rot(x,25u)^rot(x,14u)^((x)>>3U)))
#define P3(n)  W[n-7]
#define P4(n)  W[n-16]

//Partial calcs for constant W values (same terms, read from ConstW)
#define P1C(n) ((rotC(ConstW[(n)-2],15)^rotC(ConstW[(n)-2],13)^((ConstW[(n)-2])>>10U)))
#define P2C(n) ((rotC(ConstW[(n)-15],25)^rotC(ConstW[(n)-15],14)^((ConstW[(n)-15])>>3U)))
#define P3C(x)  ConstW[x-7]
#define P4C(x)  ConstW[x-16]

//SHA round with built in W calc
#define sharoundW(n) Barrier1(n);  Vals[(3 + 128 - (n)) % 8] += t1W(n); Vals[(7 + 128 - (n)) % 8] = t1W(n) + t2(n);  

//SHA round without W calc
#define sharound(n)  Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1(n); Vals[(7 + 128 - (n)) % 8] = t1(n) + t2(n);

//SHA round for constant W values
#define sharoundC(n)  Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1C(n); Vals[(7 + 128 - (n)) % 8] = t1C(n) + t2(n);

//Scheduling barriers: a dummy assignment to the local variable t1 placed at
//the start of every round, present only to stop the compiler from
//(de)optimizing the instruction order across rounds. The value is never used.
#define Barrier1(n) t1 = t1C((n+1))
#define Barrier2(n) t1 = t1C((n))
//#define Barrier3(n) Barrier2(n)

// Nonce search kernel: two chained SHA-256 passes over a partially
// precomputed block, testing 1, 2, 4 or 8 nonces per work-item depending on
// the VECTORS / VECTORS4 / VECTORS8 build flag.
//
// Build-time macros required: WORKSIZE (work-group size used to derive the
// nonce) and OUTPUT_SIZE (index of the "found" slot in output).
//
// Parameters:
//   state0..state7  added to the first-pass working values to form W[64..71],
//                   the message words of the second pass.
//   B1,C1,D1,F1,G1,H1  initial Vals[1..3] and Vals[5..7] for the first pass.
//   base            nonce base (one value per vector lane).
//   W16, W17        stored directly into W[16] and W[17].
//   PreVal4, PreVal0  added to the nonce to form Vals[4] and Vals[0].
//   PreW31, PreW32  precomputed parts of W[31] and W[32].
//   PreW19          precomputed part of W[19].
//   PreW20          precomputed part of W[18] (note the name/index mismatch;
//                   presumably a leftover of the host-side argument naming -
//                   verify against the host code before renaming).
//   output          result buffer; written only when a candidate is found.
//
// The first pass starts at round 4 and the second at round 1; the earlier
// rounds are presumably precomputed by the host (first pass) or folded into
// the Vals[3]/Vals[7] initialisation (second pass) - TODO confirm.
// A nonce equal to 0 is never reported, because 0 doubles as "nothing found".
__kernel
//removed this to allow detection of invalid work size
//__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
void search(	const uint state0, const uint state1, const uint state2, const uint state3,
						const uint state4, const uint state5, const uint state6, const uint state7,
						const uint B1, const uint C1, const uint D1,
						const uint F1, const uint G1, const uint H1,
						const u base,
						const uint W16, const uint W17,
						const uint PreVal4, const uint PreVal0,
						const uint PreW31, const uint PreW32,
						const uint PreW19, const uint PreW20,
						__global uint * output)
{

	// Message schedule for both passes: indices 0..63 first pass, 64..123 second.
	u W[124];
	// The eight working variables (slot roles rotate per round, see the macros).
	u Vals[8];

//Dummy Variable to prevent compiler from reordering between rounds
	u t1;
	
	//Vals[0] and Vals[4] depend on the nonce and are set further down.
	//Vals[0]=state0;
	Vals[1]=B1;
	Vals[2]=C1;
	Vals[3]=D1;
	//Vals[4]=PreVal4;
	Vals[5]=F1;
	Vals[6]=G1;
	Vals[7]=H1;

	W[16] = W16;
	W[17] = W17;

	// W[3] is the nonce (one per lane). r is the P2-style mix of lane 0's
	// nonce: rot(x,25) ^ rot(x,14) ^ (x >> 3). Because that function is linear
	// over XOR, and lane k's nonce is lane 0's nonce with k XORed into its low
	// bits (this assumes the low bits of lane 0's nonce are zero, i.e. base is
	// aligned to the vector width - TODO confirm on the host side), the mix
	// for lane k is r ^ ((k << 25) ^ (k << 14)).
#ifdef VECTORS8
//	#ifdef GOFFSET
//	W[3] = ((uint)get_global_id(0) << 3) + (u)(0, 1, 2, 3, 4, 5, 6, 7);
//	uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
//	#else
	W[3] = base + (uint)(get_local_id(0)) * 8u + (uint)(get_group_id(0)) * (WORKSIZE * 8u);
	uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
	//Only the 3 LSBs differ between the eight nonces, so the per-lane values
	//are obtained by flipping bits in W18 instead of recomputing from W3.
	//Lane deltas are (k << 25) ^ (k << 14) for k = 1..7.
	W[18] = PreW20 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U, r ^ 0x8010000U, r ^ 0xA014000U, r ^ 0xC018000U, r ^ 0xE01C000U};

#elif defined VECTORS4
	//Less dependencies to get both the local id and group id and then add them
	W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
	uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
	//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
	W[18] = PreW20 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U};
#elif defined VECTORS
	W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
	uint r = rot(W[3].s0,25u)^rot(W[3].s0,14u)^((W[3].s0)>>3U);
	W[18] = PreW20 + (u){r, r ^ 0x2004000U};
#else
	W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
	u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
	W[18] = PreW20 + r;
#endif

	//the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions

	Vals[4] = PreVal4 + W[3];
	Vals[0] = PreVal0 + W[3];

	// ---- First pass: rounds 4..63, schedule words interleaved with rounds ----
	sharoundC(4);
	W[19] = PreW19 + W[3];
	sharoundC(5);
	W[20] = P4C(20) + P1(20);
	sharoundC(6);
	W[21] = P1(21);
	sharoundC(7);
	W[22] = P3C(22) + P1(22);
	sharoundC(8);
	W[23] = W[16] + P1(23);
	sharoundC(9);
	W[24] = W[17] + P1(24);
	sharoundC(10);
	W[25] = P1(25) + P3(25);
	W[26] = P1(26) + P3(26);
	sharoundC(11);
	W[27] = P1(27) + P3(27);
	W[28] = P1(28) + P3(28);
	sharoundC(12);
	W[29] = P1(29) + P3(29);
	sharoundC(13);
	W[30] = P1(30) + P2C(30) + P3(30);
	W[31] = PreW31 + (P1(31) + P3(31));
	sharoundC(14);
	W[32] = PreW32 + (P1(32) + P3(32));
	sharoundC(15);
	sharound(16);
	sharound(17);
	sharound(18);
	sharound(19);
	sharound(20);
	sharound(21);
	sharound(22);
	sharound(23);
	sharound(24);
	sharound(25);
	sharound(26);
	sharound(27);
	sharound(28);
	sharound(29);
	sharound(30);
	sharound(31);
	sharound(32);
	sharoundW(33);
	sharoundW(34);
	sharoundW(35);
	sharoundW(36);
	sharoundW(37);
	sharoundW(38);
	sharoundW(39);
	sharoundW(40);
	sharoundW(41);
	sharoundW(42);
	sharoundW(43);
	sharoundW(44);
	sharoundW(45);
	sharoundW(46);
	sharoundW(47);
	sharoundW(48);
	sharoundW(49);
	sharoundW(50);
	sharoundW(51);
	sharoundW(52);
	sharoundW(53);
	sharoundW(54);
	sharoundW(55);
	sharoundW(56);
	sharoundW(57);
	sharoundW(58);
	sharoundW(59);
	sharoundW(60);
	sharoundW(61);
	sharoundW(62);
	sharoundW(63);

	// First-pass result becomes the first eight message words of the second pass.
	W[64]=state0+Vals[0];
	W[65]=state1+Vals[1];
	W[66]=state2+Vals[2];
	W[67]=state3+Vals[3];
	W[68]=state4+Vals[4];
	W[69]=state5+Vals[5];
	W[70]=state6+Vals[6];
	W[71]=state7+Vals[7];

//const u Temp = (0xb0edbdd0U + K[0]) +  W[64];

	// ---- Second pass: start from H[], with Vals[3] and Vals[7] replaced by
	// values that already include W[64] and K[0] (the first round of this
	// pass appears to be folded in here, so the round list starts at 64 + 1).
	Vals[0]=H[0];
	Vals[1]=H[1];
	Vals[2]=H[2];
	Vals[3] = 0xa54ff53aU + (0xb0edbdd0U + K[0]) +  W[64];
//Vals[3]=H[3];
	Vals[4]=H[4];
	Vals[5]=H[5];
	Vals[6]=H[6];
	Vals[7] = 0x08909ae5U + (0xb0edbdd0U + K[0]) +  W[64];
//Vals[7]=H[7];

	
#define P124(n) P2(n) + P1(n) + P4(n)

	W[64 + 16] = + P2(64 + 16) + P4(64 + 16);
	sharound(64 + 1);
	W[64 + 17] = P1C(64 + 17) + P2(64 + 17) + P4(64 + 17);
	sharound(64 + 2);
	W[64 + 18] = P124(64 + 18);
	sharound(64 + 3);
	W[64 + 19] = P124(64 + 19);
	sharound(64 + 4);
	W[64 + 20] = P124(64 + 20);
	sharound(64 + 5);
	W[64 + 21] = P124(64 + 21);
	sharound(64 + 6);
	W[64 + 22] = P4(64 + 22) + P3C(64 + 22) + P2(64 + 22) + P1(64 + 22);
	sharound(64 + 7);
	W[64 + 23] = P4(64 + 23) + P3(64 + 23) + P2C(64 + 23) + P1(64 + 23);
	sharoundC(64 + 8);
	W[64 + 24] =   P1(64 + 24) + P4C(64 + 24) + P3(64 + 24);
	sharoundC(64 + 9);
	W[64 + 25] = P3(64 + 25) + P1(64 + 25);
	sharoundC(64 + 10);
	W[64 + 26] = P3(64 + 26) + P1(64 + 26);
	sharoundC(64 + 11);
	W[64 + 27] = P3(64 + 27) + P1(64 + 27);
	sharoundC(64 + 12);
	W[64 + 28] = P3(64 + 28) + P1(64 + 28);
	sharoundC(64 + 13);
	W[64 + 29] = P1(64 + 29) + P3(64 + 29);
	W[64 + 30] = P3(64 + 30) + P2C(64 + 30) + P1(64 + 30);
	sharoundC(64 + 14);
	W[64 + 31] = P4C(64 + 31) + P3(64 + 31) + P2(64 + 31) + P1(64 + 31);
	sharoundC(64 + 15);
	sharound(64 + 16);
	sharound(64 + 17);
	sharound(64 + 18);
	sharound(64 + 19);
	sharound(64 + 20);
	sharound(64 + 21);
	sharound(64 + 22);
	sharound(64 + 23);
	sharound(64 + 24);
	sharound(64 + 25);
	sharound(64 + 26);
	sharound(64 + 27);
	sharound(64 + 28);
	sharound(64 + 29);
	sharound(64 + 30);
	sharound(64 + 31);
	sharoundW(64 + 32);
	sharoundW(64 + 33);
	sharoundW(64 + 34);
	sharoundW(64 + 35);
	sharoundW(64 + 36);
	sharoundW(64 + 37);
	sharoundW(64 + 38);
	sharoundW(64 + 39);
	sharoundW(64 + 40);
	sharoundW(64 + 41);
	sharoundW(64 + 42);
	sharoundW(64 + 43);
	sharoundW(64 + 44);
	sharoundW(64 + 45);
	sharoundW(64 + 46);
	sharoundW(64 + 47);
	sharoundW(64 + 48);
	sharoundW(64 + 49);
	sharoundW(64 + 50);
	sharoundW(64 + 51);
	sharoundW(64 + 52);
	sharoundW(64 + 53);
	sharoundW(64 + 54);
	sharoundW(64 + 55);
	sharoundW(64 + 56);
	sharoundW(64 + 57);
	sharoundW(64 + 58);

	// The remaining rounds are not executed. Instead two values g and v are
	// derived from the state after round 64+58 and compared per lane; equality
	// marks a candidate nonce (presumably equivalent to the final hash word
	// being zero - TODO confirm against the original phatk2 derivation).
	// W(59+64) stores W[123] as a side effect, which P1(124) then reads.
	u g = -(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64));
	u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]); 
	
	// 0 means "no candidate"; if several lanes match, the highest lane wins.
	uint nonce = 0;

#ifdef VECTORS8
	if (v.s0 == g.s0)
	{
		nonce = W[3].s0;
	}
	if (v.s1 == g.s1)
	{
		nonce = W[3].s1;
	}
	if (v.s2 == g.s2)
	{
		nonce = W[3].s2;
	}
	if (v.s3 == g.s3)
	{
		nonce = W[3].s3;
	}
	if (v.s4 == g.s4)
	{
		nonce = W[3].s4;
	}
	if (v.s5 == g.s5)
	{
		nonce = W[3].s5;
	}
	if (v.s6 == g.s6)
	{
		nonce = W[3].s6;
	}
	if (v.s7 == g.s7)
	{
		nonce = W[3].s7;
	}
#elif defined VECTORS4
	if (v.s0 == g.s0)
	{
		nonce = W[3].s0;
	}
	if (v.s1 == g.s1)
	{
		nonce = W[3].s1;
	}
	if (v.s2 == g.s2)
	{
		nonce = W[3].s2;
	}
	if (v.s3 == g.s3)
	{
		nonce = W[3].s3;
	}
#elif defined VECTORS
	if (v.s0 == g.s0)
	{
		nonce = W[3].s0;
	}
	if (v.s1 == g.s1)
	{
		nonce = W[3].s1;
	}
#else
	if (v == g)
	{
		nonce = W[3];
	}
#endif
	if(nonce)
	{
		//Report the candidate twice: in the slot at index OUTPUT_SIZE and in
		//the slot indexed by this work-item's local id.
		output[OUTPUT_SIZE] = nonce;
		output[get_local_id(0)] = nonce;
	}
}

Credit goes to Diapolo for all of their help on the kernel. Some of their ideas and coding style are placed into this. GOFFSET is a work in progress here, but totally their idea.

Also, as this has created a new page,

Phoenix 2 beta has been released:
https://bitcointalksearch.org/topic/phoenix-2-beta-discussion-62765

jedi95

full member

Activity: 219

Merit: 120

Phoenix 2 beta has been released:
https://bitcointalksearch.org/topic/phoenix-2-beta-discussion-62765

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: Diapolo on February 03, 2012, 01:19:15 PM

Quote from: d3m0n1q_733rz on February 03, 2012, 09:37:04 AM

Hey, small little thing I noticed about my mod to add VECTORS8 to the phatk2 kernel. I just tried it on my CPU that would normally get around 775 KHash/sec and it shot up to around 4.06 MHash/sec. There might be something here to modding CPU mining for underprivileged miners like myself. I would be really grateful if someone could toss some nonce calculations into my mod for me. Grin

Please?
I wanted to add that I'm aware that this is not a CPU miner. But if the CPU handles 8 vectors (256-bit) faster than 4 or even 2 (64-bit), we may have found a good starting point for a new approach.

Now that's an interesting observation, hm perhaps I should add uint8 support, too ... just for the sake of it Cheesy

.

Edit: Check this out!

AMD A8-3850 (CPU device) - default settings (AGGRESSION=5 WORKSIZE=1024) - DiaKGCN kernel with VECTORS8 support added:
no vectors: 2,6 MH/s
VECTORS2: 1,1 MH/s
VECTORS4: 7,4 MH/s
VECTORS8: 2,9 MH/s

I'll let it run for a while to check if valid nonces are found.

Dia

Once AVX2 comes out, we'll be seeing VECTORS8 higher with CPUs. Anyhow, I've attempted to use Dia's GOFFSET=false code with phatk2, but I can't seem to match the two up. Combining with VECTORS8 (once I can get the nonces figured out) and a worksize of 64 will probably end up with the fastest code for VLIW5 processors sporting 256-byte capability. At least this held true for Dia's new kernel. I enjoy trying to combine the best of all kernels into one to see what the result is, but translation of variables is a pain sometimes.
If anyone wants to take a stab at it, the results could be amazing.

greatwolf

full member

Activity: 230

Merit: 100

Quote from: Math Man on January 22, 2012, 03:02:09 PM

I'm running 1.7.2. What does the following mean regarding a rejected share?

Code:

TypeError in RPC sendResult callback

I'm also getting this rejected share error with phoenix 1.7.4. What's the cause for this? Any possible fix or workarounds?

dishwara

legendary

Activity: 1855

Merit: 1016

ya, i have to tell correctly even in GUI AOCLBF

TurdHurdur

full member

Activity: 216

Merit: 100

Quote from: dishwara on February 04, 2012, 10:31:42 AM

I don't know accurate values as i am using AOCLBF gui for phoenix.

My settings are vectors, HD5870, worksize 256, aggerrsion=12 BFI_INT.
Actually if i increase memory speed above 400 my system crashes.

It'd be AGGRESSION=12 not "aggerrsion=12".

dishwara

legendary

Activity: 1855

Merit: 1016

I don't know accurate values as i am using AOCLBF gui for phoenix.

My settings are vectors, HD5870, worksize 256, aggerrsion=12 BFI_INT.
Actually if i increase memory speed above 400 my system crashes.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: dishwara on February 04, 2012, 05:32:10 AM

phatk2 gives lesser than phatk for some unknown reason.

I get 380 with phatk2 & 414 with phatk on 5870 with 945/300

What are your settings? I recommend using VECTORS and WORKSIZE=128 for that card at that memory speed. If you increase the memory speed, you should get better results with VECTORS4 and WORKSIZE=64...in theory.

dishwara

legendary

Activity: 1855

Merit: 1016

phatk2 gives lesser than phatk for some unknown reason.

I get 380 with phatk2 & 414 with phatk on 5870 with 945/300

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: Diapolo on February 03, 2012, 01:19:15 PM

Quote from: d3m0n1q_733rz on February 03, 2012, 09:37:04 AM

Hey, small little thing I noticed about my mod to add VECTORS8 to the phatk2 kernel. I just tried it on my CPU that would normally get around 775 KHash/sec and it shot up to around 4.06 MHash/sec. There might be something here to modding CPU mining for underprivileged miners like myself. I would be really grateful if someone could toss some nonce calculations into my mod for me. Grin

Please?
I wanted to add that I'm aware that this is not a CPU miner. But if the CPU handles 8 vectors (256-bit) faster than 4 or even 2 (64-bit), we may have found a good starting point for a new approach.

Now that's an interesting observation, hm perhaps I should add uint8 support, too ... just for the sake of it Cheesy

.

Edit: Check this out!

AMD A8-3850 (CPU device) - default settings (AGGRESSION=5 WORKSIZE=1024) - DiaKGCN kernel with VECTORS8 support added:
no vectors: 2,6 MH/s
VECTORS2: 1,1 MH/s
VECTORS4: 7,4 MH/s
VECTORS8: 2,9 MH/s

I'll let it run for a while to check if valid nonces are found.

Dia

I probably should have mentioned I'm using a Core2 Quad. I'm surprised at what you have for the worksize. I really don't know what good some of the CPU information is going to do, but I'm sort of hoping it'll put a little more oomph into CPU mining if it can be incorporated into CPU miners. Right now, my CPU is mining faster than my GPU! I'm seeing 15.6 with Ufasoft and only about 14.96 with phatk2.

Diapolo

hero member

Activity: 772

Merit: 500

Quote from: d3m0n1q_733rz on February 03, 2012, 09:37:04 AM

Hey, small little thing I noticed about my mod to add VECTORS8 to the phatk2 kernel. I just tried it on my CPU that would normally get around 775 KHash/sec and it shot up to around 4.06 MHash/sec. There might be something here to modding CPU mining for underprivileged miners like myself. I would be really grateful if someone could toss some nonce calculations into my mod for me. Grin

Please?
I wanted to add that I'm aware that this is not a CPU miner. But if the CPU handles 8 vectors (256-bit) faster than 4 or even 2 (64-bit), we may have found a good starting point for a new approach.

Now that's an interesting observation, hm perhaps I should add uint8 support, too ... just for the sake of it Cheesy

.

Edit: Check this out!

AMD A8-3850 (CPU device) - default settings (AGGRESSION=5 WORKSIZE=1024) - DiaKGCN kernel with VECTORS8 support added:
no vectors: 2,6 MH/s
VECTORS2: 1,1 MH/s
VECTORS4: 7,4 MH/s
VECTORS8: 2,9 MH/s

I'll let it run for a while to check if valid nonces are found.

Dia

pogi23

member

Activity: 266

Merit: 10

Quote from: pogi23 on February 03, 2012, 02:13:58 AM

Has anyone fried a 5770 using AGGRESSION=7?

The skinny: I was using poclbm, SDK2.1, and 5770 (stock clocks) when I thought I'd try out phoenix. I saw the 'Recommended' settings for High-End cards and thought they were merely guidelines. It would run for 5 minutes then it froze twice then it wouldn't even POST. Bummer, because it had a twin.

It is likely that there is(was) some problem with that card, such as the heatsink not being mounted correctly and the GPU die overheating within seconds (or you had it ridiculously overclocked or overvolted). My 5770 runs fine at aggression 12 overclocked to 980MHz, cranking out 225+ MHash/s at 75% fan and 75C in a case.

I think it must have been really bad luck as I was running it minutes before switching to phoenix without making any changes to the card. Or it may have been on the way out but I couldn't see why since I kept it at stock 850 at a cool 59C. Oh well, thanks for listening--I'm just grieving over the loss of a loved one.

deepceleron

legendary

Activity: 1512

Merit: 1036

Quote from: d3m0n1q_733rz on February 03, 2012, 04:36:55 AM

Out of curiosity, have you already unlocked the extra shaders on your card by flashing it to a 5870?

You are thinking of the 6xxx series. The 5850 is its own die and product that is architecturally different than 5830 and 5870 (which share a die and reference PCB). The 5770 is a similar VLIW5 with 800 shaders. None can be "unlocked".

Quote from: pogi23 on February 03, 2012, 02:13:58 AM

Has anyone fried a 5770 using AGGRESSION=7?

The skinny: I was using poclbm, SDK2.1, and 5770 (stock clocks) when I thought I'd try out phoenix. I saw the 'Recommended' settings for High-End cards and thought they were merely guidelines. It would run for 5 minutes then it froze twice then it wouldn't even POST. Bummer, because it had a twin.

It is likely that there is(was) some problem with that card, such as the heatsink not being mounted correctly and the GPU die overheating within seconds (or you had it ridiculously overclocked or overvolted). My 5770 runs fine at aggression 12 overclocked to 980MHz, cranking out 225+ MHash/s at 75% fan and 75C in a case.

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Hey, small little thing I noticed about my mod to add VECTORS8 to the phatk2 kernel. I just tried it on my CPU that would normally get around 775 KHash/sec and it shot up to around 4.06 MHash/sec. There might be something here to modding CPU mining for underprivileged miners like myself. I would be really grateful if someone could toss some nonce calculations into my mod for me. Grin

Please?
I wanted to add that I'm aware that this is not a CPU miner. But if the CPU handles 8 vectors (256-bit) faster than 4 or even 2 (64-bit), we may have found a good starting point for a new approach.

FreeBit

member

Activity: 106

Merit: 10

Quote from: jedi95 on February 02, 2012, 04:51:04 PM

In general this is what you will need to do:

1. Install drivers + APP SDK
There are many guides for this around, so I won't go into the details here. In general SDK versions 2.1, 2.4, 2.5 are good, and 2.2, 2.3, 2.6 are bad.

2. Install prerequisites:
sudo apt-get install python-twisted python-numpy python-pyopencl

....

Thank you. I did these steps. but ...

Quote

./phoenix.py -u http://xxx:[email protected]:8332 -k phatk2 AGGRESSION=6 WORKSIZE=128 VECTORS DEVICE=0

Quote

FATAL: Error inserting nvidia_current (/lib/modules/3.0.0-15-generic/updates/dkms/nvidia_current.ko): No such device
No device specified or device not found, use DEVICE=ID to specify one of the following

AMD Sempron(tm) 140 Processor

[0 Khash/sec] [0 Accepted] [0 Rejected]bb@hayeh:~/phoenix$

Quote

FATAL: Error inserting nvidia_current (/lib/modules/3.0.0-15-generic/updates/dkms/nvidia_current.ko): No such device
[03/02/2012 15:19:44] FATAL kernel error: Failed to apply BFI_INT patch to kernel! Is BFI_INT supported on this hardware?

I am reading about this, but I don't know and don't understand, what AMD APP, catalyst, aticonfig, blablabla are and what they are doing ...

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: jedi95 on February 02, 2012, 04:51:04 PM

Quote from: FreeBit on February 02, 2012, 01:52:36 PM

I want to use this miner.

How do I install it on an Ubuntu 11.10 32bit system with one 5850?
I could not find any installation advice, readme or howto.

Thank you ...

In general this is what you will need to do:

1. Install drivers + APP SDK
There are many guides for this around, so I won't go into the details here. In general SDK versions 2.1, 2.4, 2.5 are good, and 2.2, 2.3, 2.6 are bad.

2. Install prerequisites:
sudo apt-get install python-twisted python-numpy python-pyopencl

3. Download Phoenix
You can do this one of 2 ways:
A. Clone the Git repo using:
git clone [email protected]:jedi95/Phoenix-Miner.git /home/username/phoenix
Obviously you will want to modify the directory to suit your needs.
B. Download
https://github.com/jedi95/Phoenix-Miner/tarball/master
Extract to the directory of your choice

4. Start Phoenix:
./phoenix.py -u http://username:[email protected]:8332 -k phatk2 AGGRESSION=6 WORKSIZE=128 VECTORS DEVICE=0
You will need modify the URL to connect to the pool of your choice. The above settings should be optimal for a 5850 running at normal memory clocks. (substitute WORKSIZE=256 if underclocking the memory)

On that last part, I recommend using VECTORS4 and WORKSIZE=64.
Out of curiosity, have you already unlocked the extra shaders on your card by flashing it to a 5870? Using VECTORS4 will allow for the full 128-bit transfers which will increase throughput to its theoretical limit of 122-128 Gb/s. Granted that limit is easier to reach using floating points, but let's not get into that.
Play around with it and find out what works best. But definitely unlock those shaders!

pogi23

member

Activity: 266

Merit: 10

Has anyone fried a 5770 using AGGRESSION=7?

The skinny: I was using poclbm, SDK2.1, and 5770 (stock clocks) when I thought I'd try out phoenix. I saw the 'Recommended' settings for High-End cards and thought they were merely guidelines. It would run for 5 minutes then it froze twice then it wouldn't even POST. Bummer, because it had a twin.

Just how much of a difference is there between AGGRESSION 5 and 7?

jedi95

full member

Activity: 219

Merit: 120

Quote from: FreeBit on February 02, 2012, 01:52:36 PM

I want to use this miner.

How do I install it on an Ubuntu 11.10 32bit system with one 5850?
I could not find any installation advice, readme or howto.

Thank you ...

In general this is what you will need to do:

1. Install drivers + APP SDK
There are many guides for this around, so I won't go into the details here. In general SDK versions 2.1, 2.4, 2.5 are good, and 2.2, 2.3, 2.6 are bad.

2. Install prerequisites:
sudo apt-get install python-twisted python-numpy python-pyopencl

3. Download Phoenix
You can do this one of 2 ways:
A. Clone the Git repo using:
git clone [email protected]:jedi95/Phoenix-Miner.git /home/username/phoenix
Obviously you will want to modify the directory to suit your needs.
B. Download
https://github.com/jedi95/Phoenix-Miner/tarball/master
Extract to the directory of your choice

4. Start Phoenix:
./phoenix.py -u http://username:[email protected]:8332 -k phatk2 AGGRESSION=6 WORKSIZE=128 VECTORS DEVICE=0
You will need modify the URL to connect to the pool of your choice. The above settings should be optimal for a 5850 running at normal memory clocks. (substitute WORKSIZE=256 if underclocking the memory)

FreeBit

member

Activity: 106

Merit: 10

I want to use this miner.

How do I install it on an Ubuntu 11.10 32bit system with one 5850?
I could not find any installation advice, readme or howto.

Thank you ...

d3m0n1q_733rz

sr. member

Activity: 378

Merit: 250

Quote from: jedi95 on February 02, 2012, 11:52:37 AM

Quote from: d3m0n1q_733rz on February 02, 2012, 02:18:28 AM

Would someone mind fixing the nonces for me? I'm modding it for 128-bit and seem to have hit a snag. And, just so you know, uint8 DOES work with a worksize of 32! However, without the nonces finished, I don't know how well. I would try uint16 as it works too with a worksize of 1, but the bitshifting is a bit more involved.
After this, I'm going to have to compute some different constants for a SHA-512/256 kernel. I'll need the fractional parts of the seventeenth through twenty-fourth prime numbers as stated here: http://eprint.iacr.org/2010/548.pdf
From that point, I'll attempt a 64-bit, 128-bit and 256-bit version of the SHA-512 algorithm (in one) and then truncate the results to SHA-256. ^_^
The problem is, I'm no math genius so I'm going to need loads of help with this little project.

The problem with uint8 is that you are going to use way more GPRs. Even if you get better efficiency per thread, the higher GPR use is going to reduce the maximum number of threads that can be processed at once. This problem can be minimized with high RAM clocks, but the higher power use might not be worth it.

Also, truncated SHA512 isn't the same as SHA256.

Yeah, I found out that the SHA-512/256 algorithm doesn't output an actual SHA-256 hash.
As for the uint8, I was trying to figure out if the increased efficiency will make up for the lower hash speed IF it doesn't increase after the nonce values are fixed. If it does increase efficiency significantly, as it is processing twice as many vectors [with nonce values set], then the increased RAM clocks might not be that big of an issue. Though, I did find out that the GPRs are designed to handle 128-bit moves more efficiently than 64-bit or 256-bit. So this means that 4 vectors are optimum for the Evergreen (HD5xxx) GPUs. However, I'm not certain on the Northern Islands (HD6xxx) GPUs and beyond. Their memory interface is 256-bit which may mean that 8 vectors could be optimum. Unfortunately, I don't have the hardware to test the theory out. And that's all it is. Just a theory.

Topic: Phoenix - Efficient, fast, modular miner - page 2. (Read 760945 times)