Klaus_t's cuda port is doing 355MHASH on the 750ti(64 bit cuda 8.0). Genoils opencl kernal does around 303MHASH.
But the klaus_t version is getting alot of reject's on the pool.
https://github.com/KlausT/Sia-CUDA-MinerWhat does it do with the funnel shift asm? Or did you already paste that in?
(btw it is NOT
my kernel. i only provided windows build files and added pool support.)
my fork also is still single threaded. i wasn't actually planning to build a proper miner around it. just a bit of kernel play.
I managed to add some more hash by rewriting to uint2 like this:
#undef rotr64
#undef __byte_perm_64
#undef __swap_hilo
__inline__ __device__ uint2 rotr64(const uint2 a, const int offset)
{
uint2 result;
if (offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
}
else {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
}
return result;
}
__device__ __forceinline__
uint2 __byte_perm_64(const uint2 source, const uint32_t grab1, const uint32_t grab2)
{
uint2 r;
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(r.x) : "r"(source.x), "r"(source.y), "r"(grab1));
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(r.y) : "r"(source.x), "r"(source.y), "r"(grab2));
return r;
}
__device__ __forceinline__
uint2 __swap_hilo(const uint2 source)
{
uint2 r;
r.x = source.y;
r.y = source.x;
return r;
}
__global__ void __launch_bounds__(blocksize, 4) nonceGrind_SP(const uint2 * __restrict__ headerIn, uint2 * __restrict__ hashOut, uint2 * __restrict__ nonceOut, const uint2 * __restrict__ v1, uint32_t target)
{
uint2 header[10], h[4], v[16];
uint32_t id = (blockDim.x * blockIdx.x + threadIdx.x)*npt;
#pragma unroll
for (int i = 0; i < 10; i++)
header[i] = headerIn[i];
for (int n = id; n < id + npt; n++)
{
((uint32_t*)header)[8] = n;
v[2] = vectorize(0x5BF2CD1EF9D6B596u) + header[4]; v[14] = __swap_hilo(~vectorize(0x1f83d9abfb41bd6bu) ^ v[2]); v[10] = vectorize(0x3c6ef372fe94f82bu) + v[14]; v[6] = __byte_perm_64(vectorize(0x1f83d9abfb41bd6bu) ^ v[10], 0x6543, 0x2107);
v[2] = v[2] + v[6] + header[5]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = rotr64(v[6] ^ v[10], 63);
v[3] = vectorize(0x130C253729B586Au) + header[6]; v[15] = __swap_hilo(vectorize(0x5be0cd19137e2179u) ^ v[3]); v[11] = vectorize(0xa54ff53a5f1d36f1u) + v[15]; v[7] = __byte_perm_64(vectorize(0x5be0cd19137e2179u) ^ v[11], 0x6543, 0x2107);
v[3] = v[3] + v[7] + header[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = rotr64(v[7] ^ v[11], 63);
v[0] = v1[0] + v1[5] + header[8]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v1[5] ^ v[10], 0x6543, 0x2107);
v[0] = v[0] + v[5] + header[9]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = rotr64(v[5] ^ v[10], 63);
....
etc
You Also need to copy some functions from ccminer.
__device__ __forceinline__ uint64_t devectorize(uint2 x)
{
uint64_t result;
asm("mov.b64 %0,{%1,%2}; \n\t"
: "=l"(result) : "r"(x.x), "r"(x.y));
return result;
}
__device__ __forceinline__ uint2 vectorize(const uint64_t x)
{
uint2 result;
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(result.x), "=r"(result.y) : "l"(x));
return result;
}
static __device__ __forceinline__ uint2 operator^ (uint2 a, uint32_t b) { return make_uint2(a.x^ b, a.y); }
static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); }
static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); }
static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); }
static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); }
static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b)
{
uint2 result;
asm(
"add.cc.u32 %0,%2,%4; \n\t"
"addc.u32 %1,%3,%5; \n\t"
: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
return result;
}
static __device__ __forceinline__ uint2 operator+ (uint2 a, uint32_t b)
{
uint2 result;
asm("add.cc.u32 %0,%2,%4; \n\t"
"addc.u32 %1,%3,%5; \n\t"
: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0));
return result;
}
static __device__ __forceinline__ uint2 operator- (uint2 a, uint32_t b)
{
uint2 result;
asm("sub.cc.u32 %0,%2,%4; \n\t"
"subc.u32 %1,%3,%5; \n\t"
: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0));
return result;
}
static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b)
{
uint2 result;
asm("sub.cc.u32 %0,%2,%4; \n\t"
"subc.u32 %1,%3,%5; \n\t"
: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
return result;
}