As Diablo has already pointed out, GLSL (version 1.2 and earlier) has no support for 32-bit integers or bitwise operators. GLSL 1.2 corresponds to OpenGL 2.1, which is what you'll currently get with FLOSS drivers (i.e. Mesa/Gallium). It is in theory possible to do equivalent calculations using float pairs (16 bits in each) and conditional-arithmetic equivalents for XOR, bit shifts, rotation etc. (e.g. division by a power of two is the same as a right shift, rotation may be implemented by moving the fractional bits back in after a shift, and so on). Look, it *might* work, but there will be absolutely no gain at all. You'll be lucky if you get a few Mhash/s from it.
GLSL versions 1.3 and above have support for 32-bit unsigned integers as well as bitwise operators. I'm in the process of writing a GLSL 1.3 shader, and it should be completed in a few days. The downside with this is that it requires (at least partial support of) OpenGL 3.0. There is no complete FLOSS OpenGL 3.0 implementation. AFAIK, the proprietary ATI/AMD driver has OpenGL 3.0 support for R600 and later. On the other hand, in practice, only the extension GL_EXT_gpu_shader4 is necessary, not the complete OpenGL 3.0 (at least I think that's right). If any of the FLOSS drivers implements that extension, then it would/should be possible to run my (yet to be written) shader on those drivers as well. In any case, the proprietary ATI/AMD driver should run it, which means it will become possible to mine on R600 and R700 hardware, which does not support OpenCL.
Unfortunately it will take a few days before I get access to hardware to test this out. I do have a HD3850, but nowhere to plug it in.
See below (sorry, couldn't attach it) for a ridiculous GLSL 1.2 shader that (partially) calculates SHA256 hashes. (NOTE: it's incomplete, and may even be incorrect since it's untested -- it also crashes my system which has an old Intel IGP with partial GLSL support)
#version 120
/*
32-bit integers are represented by a vec2. In GLSL 1.20, integers are only
guaranteed 16 bits of precision (in portable code), and they are likely to be
implemented with floats anyway. Instead we use float pairs holding 16 bits
each, with .x as the high half and .y as the low half (although a float can
represent 24 bits exactly). A vec4 is used instead of two vec2 where possible,
packing two 32-bit values as .xy and .zw.
*/
/* Inputs supplied by the host. Each vec4 carries two 32-bit words (.xy and
   .zw), each word being a pair of floats as described at the top of the file. */
uniform vec4 data[8]; /* Second part of data: 8 vec4 = 16 words */
uniform vec4 hash1[4]; /* Second part of hash1; not referenced by the code written so far */
uniform vec4 midstate[4]; /* Copied into the working state and added back to it in main() */
uniform vec4 target[4]; /* For the target comparison, which is still a TODO in main() */
uniform vec2 nonce_base; /* One 32-bit word; main() adds the per-fragment offset to it */
/* Note: N is the width of the buffer and should only be between 1 and 2048 or
so. Preferably less -- around 128 or 256. main() multiplies it by the .y
component of the offset below. */
uniform float N;
/* Note: offset is two independent floats, with values between 0 and N.
   main() floors both and forms nonce = nonce_base + offset.y * N + offset.x. */
varying vec2 varying_nonce_offset;
/* The eight SHA-256 initial hash words (0x6a09e667, 0xbb67ae85, ...), two per
   vec4, each split into a (high 16 bits, low 16 bits) pair of floats.
   Intended as the starting state for the second hash (see the TODO in main()). */
const vec4 stdstate[4] = vec4[](
vec4 (float (0x6a09), float (0xe667), float (0xbb67), float (0xae85)),
vec4 (float (0x3c6e), float (0xf372), float (0xa54f), float (0xf53a)),
vec4 (float (0x510e), float (0x527f), float (0x9b05), float (0x688c)),
vec4 (float (0x1f83), float (0xd9ab), float (0x5be0), float (0xcd19)));
/* The 64 SHA-256 round constants, two per vec4: k[i].xy is constant 2*i and
   k[i].zw is constant 2*i+1. Each constant is split into a
   (high 16 bits, low 16 bits) pair of floats, e.g. 0x428a2f98 becomes
   (0x428a, 0x2f98). */
const vec4 k[32] = vec4[](
vec4 (float (0x428a), float (0x2f98), float (0x7137), float (0x4491)),
vec4 (float (0xb5c0), float (0xfbcf), float (0xe9b5), float (0xdba5)),
vec4 (float (0x3956), float (0xc25b), float (0x59f1), float (0x11f1)),
vec4 (float (0x923f), float (0x82a4), float (0xab1c), float (0x5ed5)),
vec4 (float (0xd807), float (0xaa98), float (0x1283), float (0x5b01)),
vec4 (float (0x2431), float (0x85be), float (0x550c), float (0x7dc3)),
vec4 (float (0x72be), float (0x5d74), float (0x80de), float (0xb1fe)),
vec4 (float (0x9bdc), float (0x06a7), float (0xc19b), float (0xf174)),
vec4 (float (0xe49b), float (0x69c1), float (0xefbe), float (0x4786)),
vec4 (float (0x0fc1), float (0x9dc6), float (0x240c), float (0xa1cc)),
vec4 (float (0x2de9), float (0x2c6f), float (0x4a74), float (0x84aa)),
vec4 (float (0x5cb0), float (0xa9dc), float (0x76f9), float (0x88da)),
vec4 (float (0x983e), float (0x5152), float (0xa831), float (0xc66d)),
vec4 (float (0xb003), float (0x27c8), float (0xbf59), float (0x7fc7)),
vec4 (float (0xc6e0), float (0x0bf3), float (0xd5a7), float (0x9147)),
vec4 (float (0x06ca), float (0x6351), float (0x1429), float (0x2967)),
vec4 (float (0x27b7), float (0x0a85), float (0x2e1b), float (0x2138)),
vec4 (float (0x4d2c), float (0x6dfc), float (0x5338), float (0x0d13)),
vec4 (float (0x650a), float (0x7354), float (0x766a), float (0x0abb)),
vec4 (float (0x81c2), float (0xc92e), float (0x9272), float (0x2c85)),
vec4 (float (0xa2bf), float (0xe8a1), float (0xa81a), float (0x664b)),
vec4 (float (0xc24b), float (0x8b70), float (0xc76c), float (0x51a3)),
vec4 (float (0xd192), float (0xe819), float (0xd699), float (0x0624)),
vec4 (float (0xf40e), float (0x3585), float (0x106a), float (0xa070)),
vec4 (float (0x19a4), float (0xc116), float (0x1e37), float (0x6c08)),
vec4 (float (0x2748), float (0x774c), float (0x34b0), float (0xbcb5)),
vec4 (float (0x391c), float (0x0cb3), float (0x4ed8), float (0xaa4a)),
vec4 (float (0x5b9c), float (0xca4f), float (0x682e), float (0x6ff3)),
vec4 (float (0x748f), float (0x82ee), float (0x78a5), float (0x636f)),
vec4 (float (0x84c8), float (0x7814), float (0x8cc7), float (0x0208)),
vec4 (float (0x90be), float (0xfffa), float (0xa450), float (0x6ceb)),
vec4 (float (0xbef9), float (0xa3f7), float (0xc671), float (0x78f2)));
/* For a right shift (>>) or a rotation, divide by the appropriate power of 2;
   see sftr and rotr. */
/*
 * 32-bit addition on (hi16, lo16) float pairs.
 *
 * A carry out of the low half is moved into the high half. A carry out of
 * the high half is silently dropped, i.e. the result wraps around; use
 * sum_c instead when that overflow must be detected.
 */
vec2 sum (vec2 a, vec2 b)
{
  const float WORD = float (0x10000);

  vec2 total = a + b;

  /* step() yields 1.0 exactly when the half has reached 2^16. */
  float low_carry = step (WORD, total.y);
  total.y -= low_carry * WORD;
  total.x += low_carry;

  /* Discard whatever spills out of the high half. */
  total.x -= step (WORD, total.x) * WORD;
  return total;
}
/*
 * 32-bit addition on (hi16, lo16) float pairs that also reports overflow.
 *
 * a, b:   the operands, each half an integer in [0, 0xFFFF].
 * carry:  set to 1.0 when the sum exceeded 32 bits, 0.0 otherwise.
 * return: the sum wrapped to 32 bits, as a (hi16, lo16) pair.
 */
vec2 sum_c (vec2 a, vec2 b, out float carry)
{
  vec2 ret;

  /* An `out` parameter is undefined until it is written, so it has to be
     cleared here; otherwise the caller reads garbage whenever the addition
     does not overflow. */
  carry = 0.0;

  ret.x = a.x + b.x;
  ret.y = a.y + b.y;

  /* Propagate the carry from the low half into the high half. */
  if (ret.y >= float (0x10000))
  {
    ret.y -= float (0x10000);
    ret.x += 1.0;
  }

  /* A carry out of the high half is the 32-bit overflow. */
  if (ret.x >= float (0x10000))
  {
    ret.x -= float (0x10000);
    carry = 1.0;
  }
  return ret;
}
/*
 * Multiplies two plain floats and returns the product as a (hi16, lo16)
 * pair. The product is formed in a single float, so it is only exact while
 * a * b stays within what a float can represent exactly.
 */
vec2 prod (float a, float b)
{
  const float WORD = float (0x10000);

  float low = a * b;
  float high = 0.0;

  /* Move everything above bit 15 into the high half. */
  if (low >= WORD)
  {
    high = floor (low / WORD);
    low -= high * WORD;
  }
  return vec2 (high, low);
}
/*
 * Logical right shift of a 32-bit (hi16, lo16) pair.
 *
 * `shift` is the power of two to divide by, not the bit count: pass 2^3 to
 * shift by 3 bits. Bits leaving the high half enter the top of the low half;
 * bits leaving the low half are lost.
 */
vec2 sftr (vec2 a, float shift)
{
  vec2 scaled = a / shift;

  float high = floor (scaled.x);
  /* The fractional part of the high half is exactly the bits that moved
     down; scale them back up to the top of the low half. */
  float low = floor (scaled.y) + fract (scaled.x) * float (0x10000);

  return vec2 (high, low);
}
/*
 * Right rotation of a 32-bit (hi16, lo16) pair.
 *
 * `shift` is the power of two to divide by, not the bit count: pass 2^3 to
 * rotate by 3 bits. To rotate by 16 or more, swap the halves first (a.yx)
 * and rotate by the remainder, as the callers do.
 */
vec2 rotr (vec2 a, float shift)
{
  vec2 scaled = a / shift;

  /* Bits that stay within their own half. */
  vec2 kept = floor (scaled);
  /* Bits shifted out of one half re-enter at the top of the other half. */
  vec2 wrapped = fract (scaled).yx * float (0x10000);

  return kept + wrapped;
}
/*
 * Bitwise XOR of two 16-bit values held in floats.
 *
 * a, b:   integers in [0, 65535].
 * return: a XOR b, also an integer in [0, 65535].
 *
 * The bits are peeled off from the most significant one downwards by
 * comparing against, and then subtracting, the weight of each bit.
 */
float xor16 (float a, float b)
{
  float ret = 0.0;
  float fact = 32768.0; /* 0x8000, the weight of bit 15 */

  /* Exactly 16 bits to visit. The previous `while (fact > 0)` kept halving
     through 0.5, 0.25, ... until the float underflowed to zero, which costs
     well over a hundred useless iterations and is hard for GLSL 1.20
     compilers to unroll. */
  for (int bit = 0; bit < 16; ++bit)
  {
    /* The result bit is set when exactly one operand has this bit set. */
    if ((a >= fact) != (b >= fact))
      ret += fact;
    if (a >= fact)
      a -= fact;
    if (b >= fact)
      b -= fact;
    fact /= 2.0;
  }
  return ret;
}
/*
 * Bitwise XOR of two 32-bit (hi16, lo16) pairs; each half is handled
 * independently by xor16.
 */
vec2 xor (vec2 a, vec2 b)
{
  vec2 result;
  result.x = xor16 (a.x, b.x);
  result.y = xor16 (a.y, b.y);
  return result;
}
/*
 * Bitwise AND of two 16-bit values held in floats.
 *
 * a, b:   integers in [0, 65535].
 * return: a AND b, also an integer in [0, 65535].
 *
 * The bits are peeled off from the most significant one downwards by
 * comparing against, and then subtracting, the weight of each bit.
 */
float and16 (float a, float b)
{
  float ret = 0.0;
  float fact = 32768.0; /* 0x8000, the weight of bit 15 */

  /* Exactly 16 bits to visit; a fixed trip count avoids halving `fact`
     all the way down to float underflow. */
  for (int bit = 0; bit < 16; ++bit)
  {
    /* The result bit is set only when BOTH operands have this bit set.
       (The previous condition was a copy of the XOR test.) */
    if (a >= fact && b >= fact)
      ret += fact;
    if (a >= fact)
      a -= fact;
    if (b >= fact)
      b -= fact;
    fact /= 2.0;
  }
  return ret;
}
/*
 * Bitwise AND of two 32-bit (hi16, lo16) pairs; each half is handled
 * independently by and16.
 */
vec2 and (vec2 a, vec2 b)
{
  vec2 result;
  result.x = and16 (a.x, b.x);
  result.y = and16 (a.y, b.y);
  return result;
}
/*
 * Bitwise complement ("not") of a 32-bit (hi16, lo16) pair.
 *
 * Flipping all 16 bits of a half x gives 0xFFFF - x. Subtracting from
 * 0x10000 instead would give the two's-complement negation of each half and
 * returns the out-of-range value 0x10000 for x == 0.
 */
vec2 cpl (vec2 a)
{
  return vec2 (float (0xFFFF), float (0xFFFF)) - a;
}
/* Powers of two, POW_2_nn == 2^nn. They are passed as the `shift` argument
   of rotr and sftr, which take the divisor rather than the bit count. Only
   the exponents used by the functions below are defined. */
#define POW_2_01 2.0
#define POW_2_02 4.0
#define POW_2_03 8.0
#define POW_2_06 64.0
#define POW_2_07 128.0
#define POW_2_09 512.0
#define POW_2_10 1024.0
#define POW_2_11 2048.0
#define POW_2_13 8192.0
/*
 * One step of the SHA-256 message schedule:
 *   result = m16 + s0(m15) + m07 + s1(m02)
 * where the arguments are the words 16, 15, 7 and 2 positions back.
 *
 *   s0(x) = rotr(x, 7)  ^ rotr(x, 18) ^ (x >> 3)
 *   s1(x) = rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10)
 *
 * Rotations by 16 + n are written as a half swap (.yx) followed by a
 * rotation by n.
 */
vec2 blend (vec2 m16, vec2 m15, vec2 m07, vec2 m02)
{
  vec2 m15_swapped = m15.yx;
  vec2 m02_swapped = m02.yx;

  vec2 s0_tail = xor (rotr (m15_swapped, POW_2_02), sftr (m15, POW_2_03));
  vec2 s0 = xor (rotr (m15, POW_2_07), s0_tail);

  vec2 s1_tail = xor (rotr (m02_swapped, POW_2_03), sftr (m02, POW_2_10));
  vec2 s1 = xor (rotr (m02_swapped, POW_2_01), s1_tail);

  vec2 left = sum (m16, s0);
  vec2 right = sum (m07, s1);
  return sum (left, right);
}
/*
 * SHA-256 "big sigma 0": rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22).
 * The rotation by 22 is a half swap (16) followed by a rotation by 6.
 */
vec2 e0 (vec2 a)
{
  vec2 r02 = rotr (a, POW_2_02);
  vec2 r13 = rotr (a, POW_2_13);
  vec2 r22 = rotr (a.yx, POW_2_06);
  return xor (r02, xor (r13, r22));
}
/*
 * SHA-256 "big sigma 1": rotr(a, 6) ^ rotr(a, 11) ^ rotr(a, 25).
 * The rotation by 25 is a half swap (16) followed by a rotation by 9.
 */
vec2 e1 (vec2 a)
{
  vec2 r06 = rotr (a, POW_2_06);
  vec2 r11 = rotr (a, POW_2_11);
  vec2 r25 = rotr (a.yx, POW_2_09);
  return xor (r06, xor (r11, r25));
}
/*
 * SHA-256 "choose": (a AND b) XOR ((NOT a) AND c). For every bit position,
 * a selects between the corresponding bits of b and c.
 */
vec2 ch (vec2 a, vec2 b, vec2 c)
{
  vec2 from_b = and (a, b);
  vec2 from_c = and (cpl (a), c);
  return xor (from_b, from_c);
}
/*
 * SHA-256 "majority": (a AND b) XOR (a AND c) XOR (b AND c). Each result
 * bit is the value held by at least two of the three inputs.
 */
vec2 maj (vec2 a, vec2 b, vec2 c)
{
  vec2 ab = and (a, b);
  vec2 ac = and (a, c);
  vec2 bc = and (b, c);
  return xor (xor (ab, ac), bc);
}
/*
 * Fragment entry point (work in progress).
 *
 * What the code below actually does so far:
 *   1. derives a per-fragment nonce from nonce_base, N and the interpolated
 *      offset;
 *   2. expands the message schedule for `data` into w[];
 *   3. copies midstate into the working state and adds midstate back onto
 *      it (the compression rounds in between are still commented out);
 *   4. writes the parity of the low nonce half to the red channel.
 * The second hash and the comparison with `target` are still TODO.
 */
void main ()
{
/* Drop the fractional part of the interpolated offset. */
vec2 nonce_offset = floor (varying_nonce_offset);
/* nonce = nonce_base + nonce_offset.y * N + nonce_offset.x, as a 32-bit pair. */
vec2 nonce = sum (nonce_base, sum(prod(nonce_offset.y, N), vec2 (0.0, nonce_offset.x)));
/* Expanded message words: w[i].xy and w[i].zw are two consecutive words. */
vec4 w[24];
vec4 hash0[4];
/* Working state; the macros below name its eight 32-bit words a..h. */
vec4 tmp[4];
#define a (tmp[0].xy)
#define b (tmp[0].zw)
#define c (tmp[1].xy)
#define d (tmp[1].zw)
#define e (tmp[2].xy)
#define f (tmp[2].zw)
#define g (tmp[3].xy)
#define h (tmp[3].zw)
/* Round temporaries; only referenced by the commented-out rounds below. */
vec2 t1, t2;
/* TODO: Using midstate as state, calculate hash "hash0" of data with nonce applied */
/* Message schedule, first 16 expanded words. `nonce` is used where data[3]
   would otherwise appear.
   NOTE(review): nonce is declared as a vec2, so the nonce.zw swizzles below
   are not valid GLSL; presumably a vec4 (or a different word) was intended.
   Also confirm that data[3] is really where the host places the nonce. */
w[0].xy = blend (data[0].xy, data[0].zw, data[4].zw, data[7].xy);
w[0].zw = blend (data[0].zw, data[1].xy, data[5].xy, data[7].zw);
w[1].xy = blend (data[1].xy, data[1].zw, data[5].zw, w[0].xy);
w[1].zw = blend (data[1].zw, data[2].xy, data[6].xy, w[0].zw);
w[2].xy = blend (data[2].xy, data[2].zw, data[6].zw, w[1].xy);
w[2].zw = blend (data[2].zw, nonce.xy, data[7].xy, w[1].zw);
w[3].xy = blend (nonce.xy, nonce.zw, data[7].zw, w[2].xy);
w[3].zw = blend (nonce.zw, data[4].xy, w[0].xy, w[2].zw);
w[4].xy = blend (data[4].xy, data[4].zw, w[0].zw, w[3].xy);
w[4].zw = blend (data[4].zw, data[5].xy, w[1].xy, w[3].zw);
w[5].xy = blend (data[5].xy, data[5].zw, w[1].zw, w[4].xy);
w[5].zw = blend (data[5].zw, data[6].xy, w[2].xy, w[4].zw);
w[6].xy = blend (data[6].xy, data[6].zw, w[2].zw, w[5].xy);
w[6].zw = blend (data[6].zw, data[7].xy, w[3].xy, w[5].zw);
w[7].xy = blend (data[7].xy, data[7].zw, w[3].zw, w[6].xy);
w[7].zw = blend (data[7].zw, w[0].xy, w[4].xy, w[6].zw);
/* Remaining expanded words depend only on earlier entries of w[]. */
for (int i = 8; i < 24; ++i)
{
w[i].xy = blend (w[i-8].xy, w[i-8].zw, w[i-4].zw, w[i-1].xy);
w[i].zw = blend (w[i-8].zw, w[i-7].xy, w[i-3].xy, w[i-1].zw);
}
/* Start the working state (a..h) from midstate. */
tmp = midstate;
/* TODO: Add loop-unrolled of i = 0 to 3, where data is used instead of w. */
/*for (int i = 4; i < 32; i+=4)
{
t1 = sum (sum (sum (sum (h, e1(e)), ch(e,f,g)), k[i+0].xy), w[i-4+0].xy);
t2 = sum (e0(a), maj(a,b,c)); d = sum (d, t1); h = sum (t1, t2);
t1 = sum (sum (sum (sum (g, e1(d)), ch(d,e,f)), k[i+0].zw), w[i-4+0].zw);
t2 = sum (e0(h), maj(h,a,b)); c = sum (c, t1); g = sum (t1, t2);
t1 = sum (sum (sum (sum (f, e1(c)), ch(c,d,e)), k[i+1].xy), w[i-4+1].xy);
t2 = sum (e0(g), maj(g,h,a)); b = sum (b, t1); f = sum (t1, t2);
t1 = sum (sum (sum (sum (e, e1(b)), ch(b,c,d)), k[i+1].zw), w[i-4+1].zw);
t2 = sum (e0(f), maj(f,g,h)); a = sum (a, t1); e = sum (t1, t2);
t1 = sum (sum (sum (sum (d, e1(a)), ch(a,b,c)), k[i+2].xy), w[i-4+2].xy);
t2 = sum (e0(e), maj(e,f,g)); h = sum (h, t1); d = sum (t1, t2);
t1 = sum (sum (sum (sum (c, e1(h)), ch(h,a,b)), k[i+2].zw), w[i-4+2].zw);
t2 = sum (e0(d), maj(d,e,f)); g = sum (g, t1); c = sum (t1, t2);
t1 = sum (sum (sum (sum (b, e1(g)), ch(g,h,a)), k[i+3].xy), w[i-4+3].xy);
t2 = sum (e0(c), maj(c,d,e)); f = sum (f, t1); b = sum (t1, t2);
t1 = sum (sum (sum (sum (a, e1(f)), ch(f,g,h)), k[i+3].zw), w[i-4+3].zw);
t2 = sum (e0(b), maj(b,c,d)); e = sum (e, t1); a = sum (t1, t2);
}*/
/* TODO: More iterations... Copy-paste block and fix k-index and W-value. */
/* Add the starting state back onto the working state. With the rounds
   above disabled, tmp still equals midstate at this point. */
for (int i = 0; i < 4; ++i)
{
hash0[i].xy = sum (midstate[i].xy, tmp[i].xy);
hash0[i].zw = sum (midstate[i].zw, tmp[i].zw);
}
/* Declared for the second hash; not used yet. */
vec4 hash[4];
/* TODO: Using stdstate as state, calculate the hash of (hash0, hash1) */
/* TODO: Compare with target. */
/* Placeholder output: the first assignment is immediately overwritten, so
   red ends up as 0 for an even low nonce half and 1 for an odd one.
   NOTE(review): the other channels of gl_FragColor are never written. */
gl_FragColor.r = nonce.y / 255.0;
if (mod (nonce.y, 2.0) == 0.0)
gl_FragColor.r = 0;
else
gl_FragColor.r = 1;
}