inpHash[0] = cuda_swab32(devectorize(msg1[0 + 8]));
That will not work, because devectorize() returns a uint64_t, not a uint32_t.
Right, but it is still broken.
// Initial-value table for BMW-512: sixteen 64-bit words whose bytes run
// consecutively from 0x80 up to 0xFF.
__constant__ uint64_t BMW512_IV[16] =
{
	0x8081828384858687ULL, 0x88898A8B8C8D8E8FULL,
	0x9091929394959697ULL, 0x98999A9B9C9D9E9FULL,
	0xA0A1A2A3A4A5A6A7ULL, 0xA8A9AAABACADAEAFULL,
	0xB0B1B2B3B4B5B6B7ULL, 0xB8B9BABBBCBDBEBFULL,
	0xC0C1C2C3C4C5C6C7ULL, 0xC8C9CACBCCCDCECFULL,
	0xD0D1D2D3D4D5D6D7ULL, 0xD8D9DADBDCDDDEDFULL,
	0xE0E1E2E3E4E5E6E7ULL, 0xE8E9EAEBECEDEEEFULL,
	0xF0F1F2F3F4F5F6F7ULL, 0xF8F9FAFBFCFDFEFFULL
};
// Sixteen 64-bit words of the form 0xAAAAAAAAAAAAAAA<i>, i = 0x0 .. 0xF
// (only the lowest nibble varies).
__constant__ uint64_t BMW512_FINAL[16] =
{
	0xAAAAAAAAAAAAAAA0ULL, 0xAAAAAAAAAAAAAAA1ULL,
	0xAAAAAAAAAAAAAAA2ULL, 0xAAAAAAAAAAAAAAA3ULL,
	0xAAAAAAAAAAAAAAA4ULL, 0xAAAAAAAAAAAAAAA5ULL,
	0xAAAAAAAAAAAAAAA6ULL, 0xAAAAAAAAAAAAAAA7ULL,
	0xAAAAAAAAAAAAAAA8ULL, 0xAAAAAAAAAAAAAAA9ULL,
	0xAAAAAAAAAAAAAAAAULL, 0xAAAAAAAAAAAAAAABULL,
	0xAAAAAAAAAAAAAAACULL, 0xAAAAAAAAAAAAAAADULL,
	0xAAAAAAAAAAAAAAAEULL, 0xAAAAAAAAAAAAAAAFULL
};
/**
 * BMW-512 kernel: each thread reads one record of g_hash, runs it through
 * three Compression512 calls and writes 16 x 32-bit result words back over
 * the start of the same record (the hash is computed in place).
 *
 * Launch layout: 1-D grid, at most 32 threads per block (__launch_bounds__),
 * one record per thread; threads with index >= `threads` do nothing.
 * No shared memory is used.
 *
 * @param threads        number of records to process
 * @param g_hash         record buffer, indexed with a stride of 8 uint64_t
 *                       per record; it is accessed through uint2x4 pointers,
 *                       so it must satisfy that type's alignment
 * @param g_nonceVector  optional table mapping thread index -> record index;
 *                       when NULL, thread i processes record i
 *
 * NOTE(review): Compression512, vectorize, __ldg4 and uint2x4 are defined
 * outside this chunk. The call sequence below only yields a hash in msg1 if
 * Compression512 leaves its result in its FIRST argument - confirm against
 * its definition.
 */
__global__ __launch_bounds__(32, 8)
void quark_bmw512_gpu_hash_128(uint32_t threads, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		const uint32_t hashPosition = (g_nonceVector == NULL) ? thread : g_nonceVector[thread];

		// Index in size_t so that 8 * hashPosition cannot wrap in 32-bit arithmetic.
		uint64_t *inpHash = &g_hash[(size_t)8 * hashPosition];
		uint32_t *outHash = (uint32_t *)inpHash;

		// Only the first two uint2x4 slots of msg0 are loaded below, but the
		// whole 16-word array is handed to Compression512. Zero-initialise it
		// (as msg1 already is) so the remaining words are well defined instead
		// of indeterminate.
		// NOTE(review): if the record is meant to be a full 16-word message,
		// the missing words must be loaded here rather than left as zero.
		uint2 __align__(16) msg0[16] = { 0 };
		uint2 __align__(16) msg1[16] = { 0 };
		uint2 __align__(16) h[16];

		uint2x4* phash = (uint2x4*)inpHash;
		uint2x4* outpt = (uint2x4*)msg0;
		outpt[0] = __ldg4(&phash[0]);
		outpt[1] = __ldg4(&phash[1]);

		// Second block: 0x80 in word 0, 1024 in word 15, zeros in between
		// (presumably the padding marker and the message length in bits).
		msg1[0] = vectorize(0x80UL);
		msg1[15] = vectorize(1024UL);

		#pragma unroll
		for (int i = 0; i < 16; ++i)
			h[i] = vectorize(BMW512_IV[i]);

		Compression512(msg0, h);
		Compression512(msg1, msg0);

		#pragma unroll
		for (int i = 0; i < 16; ++i)
			h[i] = vectorize(BMW512_FINAL[i]);

		Compression512(msg1, h);

		// Emit words 8..15 of msg1 as sixteen 32-bit values, .x before .y.
		#pragma unroll
		for (int i = 0; i < 8; ++i)
		{
			outHash[2 * i]     = msg1[8 + i].x;
			outHash[2 * i + 1] = msg1[8 + i].y;
		}
	}
}