This patch gives a +19% performance gain on NVIDIA cards.
diff --git a/input.cl b/input.cl
index 91b7021..60a3ffe 100644
--- a/input.cl
+++ b/input.cl
@@ -525,12 +525,14 @@ void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
uint tlid = get_local_id(0);
__global char *p;
uint cnt;
- uchar first_words[NR_SLOTS];
+ __local uchar first_words_data[NR_SLOTS*64];
+ __local uchar *first_words = &first_words_data[NR_SLOTS*tlid];
uchar mask;
uint i, j;
// NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to
// make it even larger
- ushort collisions[NR_SLOTS * 3];
+ __local ushort collisionsData[NR_SLOTS * 3 * 64];
+ __local ushort *collisions = &collisionsData[NR_SLOTS * 3 * tlid];
uint nr_coll = 0;
uint n;
uint dropped_coll = 0;
@@ -560,17 +562,16 @@ void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
#if NR_ROWS_LOG != 20 || !OPTIM_SIMPLIFY_ROUND
p += xi_offset;
for (i = 0; i < cnt; i++, p += SLOT_LEN)
- first_words[i] = *(__global uchar *)p;
+ first_words[i] = (*(__global uchar *)p) & mask;
#endif
// find collisions
for (i = 0; i < cnt; i++)
for (j = i + 1; j < cnt; j++)
#if NR_ROWS_LOG != 20 || !OPTIM_SIMPLIFY_ROUND
- if ((first_words[i] & mask) ==
- (first_words[j] & mask))
+ if (first_words[i] == first_words[j])
{
// collision!
- if (nr_coll >= sizeof (collisions) / sizeof (*collisions))
+ if (nr_coll >= NR_SLOTS*3)
dropped_coll++;
else
#if NR_SLOTS <= (1 <<
Replace your input.cl file with this:
http://coinsforall.io/distr/input.cl
It may help on AMD cards too, but that has not been tested.