Thanks
I am looking at the Ethereum miner now. I have some improvements.
I'm very curious what you come up with. I hope you can challenge me to look at the code once again; I kind of lost interest with the whole TLB thrashing thing going on on Windows.
In dagger.cuh:
__device__ uint4 fnv4(uint4 a, uint4 b)
{
    // Component-wise FNV mixing step: every 32-bit lane of `a` is multiplied
    // by FNV_PRIME and the product is XORed with the matching lane of `b`.
    uint4 mixed;
    mixed.x = b.x ^ (a.x * FNV_PRIME);
    mixed.y = b.y ^ (a.y * FNV_PRIME);
    mixed.z = b.z ^ (a.z * FNV_PRIME);
    mixed.w = b.w ^ (a.w * FNV_PRIME);
    return mixed;
}
Since a.x * 2^24 == a.x << 24,
This can be rewritten to:
__device__ uint4 fnv4(uint4 a, uint4 b)
{
    // Proposed table-lookup variant of fnv4: each lane of the result is the
    // sharedmemprecalc entry selected by the low byte of the matching lane of
    // `a`, XORed with the matching lane of `b`.
    //
    // NOTE(review): this is not equivalent to `a.x * FNV_PRIME ^ b.x`. With
    // FNV_PRIME = 0x01000193 = 2^24 + 0x193, the product modulo 2^32 is
    // (a.x << 24) + a.x * 0x193, which depends on all 32 bits of a.x, so a
    // table indexed by (a.x & 0xff) cannot reproduce it.
    uint4 c; // the posted snippet used `c` without declaring it
    c.x = sharedmemprecalc[a.x & 0xff] ^ b.x;
    c.y = sharedmemprecalc[a.y & 0xff] ^ b.y;
    c.z = sharedmemprecalc[a.z & 0xff] ^ b.z;
    c.w = sharedmemprecalc[a.w & 0xff] ^ b.w;
    return c;
}
The precalc buffer must hold 32-bit entries (256 * 4 bytes), with the values shifted left by 24 bits (kept in shared memory / L1 cache), so each entry looks like:
xx000000
// Lookup table of 256 32-bit entries (256 * 4 bytes), one per possible byte
// value used as an index. The posted declaration used [256 * 4] elements,
// four times more than the loop below fills or any lookup can index.
__shared__ uint32_t sharedmemprecalc[256];
for (int i = 0; i < 256; i++)
{
    // Author's rationale: since FNV_PRIME is a high number, the 24 highest
    // bits of the product are ignored; only the 8 low bits need to be known.
    //
    // The multiplication is done in unsigned arithmetic (193u): as a signed
    // int, (193 * i) << 24 overflows for most values of i.
    //
    // NOTE(review): 193 is a decimal literal here; the low byte of
    // FNV_PRIME (0x01000193) is 0x93 = 147.
    sharedmemprecalc[i] = (193u * i) << 24;
}
Since you only need to read 1 byte and not the whole 4 bytes (32 bits), you might be able to solve it with 1/4th of the memory reads...
__device__ uint4 fnv4(uchar4 a, uint4 b)
{
    // Byte-input variant of the table-lookup proposal: each 8-bit lane of `a`
    // indexes sharedmemprecalc directly and the entry is XORed with the
    // matching 32-bit lane of `b`.
    //
    // NOTE(review): like the (a.x & 0xff) form, this only sees one byte per
    // lane of `a`, whereas a.x * FNV_PRIME (FNV_PRIME = 0x01000193) depends on
    // all 32 bits of a.x, so it is not a drop-in replacement for the multiply.
    uint4 c; // the posted snippet used `c` without declaring it
    c.x = sharedmemprecalc[a.x] ^ b.x;
    c.y = sharedmemprecalc[a.y] ^ b.y;
    c.z = sharedmemprecalc[a.z] ^ b.z;
    c.w = sharedmemprecalc[a.w] ^ b.w;
    return c;
}
But you might have to reorganize/scramble the memory and read 32 bits linearly in one read to fill the uchar4.
I looked into this for a bit. I rewrote it like this:
block size == 128
lower byte of FNV_PRIME is 147, not 193. (0x01000193 & 0xFF = 0x93 == 147)
// Shared-memory lookup table: 256 32-bit entries, one for each possible
// value of a byte-sized index (x & 0xff).
__shared__ uint32_t sharedmemprecalc[256];
In compute_hash_shuffle:
// Fill the 256-entry table cooperatively: each thread writes entry
// threadIdx.x and entry threadIdx.x + 128. The post states a block size of
// 128, so the two stores together cover indices 0..255.
sharedmemprecalc[threadIdx.x] = (147 * threadIdx.x) << 24;
sharedmemprecalc[threadIdx.x + 128] = (147 * (threadIdx.x + 128)) << 24;
// Block-wide barrier after the table writes, before the table is read.
__syncthreads();
And this unmodified:
__device__ uint4 fnv4s(uint4 a, uint4 b)
{
    // Table-lookup mixing step: start from `b` and XOR each lane with the
    // sharedmemprecalc entry selected by the low byte of the matching lane
    // of `a`.
    uint4 mixed = b;
    mixed.x ^= sharedmemprecalc[a.x & 0xff];
    mixed.y ^= sharedmemprecalc[a.y & 0xff];
    mixed.z ^= sharedmemprecalc[a.z & 0xff];
    mixed.w ^= sharedmemprecalc[a.w & 0xff];
    return mixed;
}
It doesn't work. But more importantly, your suggestion of only requiring the lower byte from the DAG entry seems wrong, since that's param b in the fnv function...