Replace SBOX with sbox_pipelined in the code:
SBOX(hamsi_s00, hamsi_s08, hamsi_s10, hamsi_s18); \
SBOX(hamsi_s01, hamsi_s09, hamsi_s11, hamsi_s19); \
SBOX(hamsi_s02, hamsi_s0A, hamsi_s12, hamsi_s1A); \
SBOX(hamsi_s03, hamsi_s0B, hamsi_s13, hamsi_s1B); \
SBOX(hamsi_s04, hamsi_s0C, hamsi_s14, hamsi_s1C); \
SBOX(hamsi_s05, hamsi_s0D, hamsi_s15, hamsi_s1D); \
SBOX(hamsi_s06, hamsi_s0E, hamsi_s16, hamsi_s1E); \
SBOX(hamsi_s07, hamsi_s0F, hamsi_s17, hamsi_s1F); \
------>
sbox_pipelined(hamsi_s00, hamsi_s08, hamsi_s10, hamsi_s18,hamsi_s01, hamsi_s09, hamsi_s11, hamsi_s19); \
sbox_pipelined(hamsi_s02, hamsi_s0A, hamsi_s12, hamsi_s1A,hamsi_s03, hamsi_s0B, hamsi_s13, hamsi_s1B); \
sbox_pipelined(hamsi_s04, hamsi_s0C, hamsi_s14, hamsi_s1C,hamsi_s05, hamsi_s0D, hamsi_s15, hamsi_s1D); \
sbox_pipelined(hamsi_s06, hamsi_s0E, hamsi_s16, hamsi_s1E,hamsi_s07, hamsi_s0F, hamsi_s17, hamsi_s1F); \
OK, I tried that, but again it doesn't make a difference.
But it does when you convert the data structure to 64-bit. Put hamsi_s00 in the upper 32 bits of a 64-bit register and hamsi_s01 in the lower 32 bits. Then you will process twice as much data with the same assembly instructions that you had previously (but in their 64-bit form):
uint64_t t;
t = a;
asm("and.b64 %0,%0,%1;" : "+r"(a) : "r"(c));
asm("xor.b64 %0,%0,%1;" : "+r"(a) : "r"(d));
asm("xor.b64 %0,%0,%1;" : "+r"(c) : "r"(b));
asm("xor.b64 %0,%0,%1;" : "+r"(c) : "r"(a));
asm( "or.b64 %0,%0,%1;" : "+r"(d) : "r"(t));
asm("xor.b64 %0,%0,%1;" : "+r"(d) : "r"(b));
asm("xor.b64 %0,%0,%1;" : "+r"(t) : "r"(c));
b=d;
asm( "or.b64 %0,%0,%1;" : "+r"(d) : "r"(t));
asm("xor.b64 %0,%0,%1;" : "+r"(d) : "r"(a));
asm("and.b64 %0,%0,%1;" : "+r"(a) : "r"(b));
asm("xor.b64 %0,%0,%1;" : "+r"(t) : "r"(a));
asm("xor.b64 %0,%0,%1;" : "+r"(b) : "r"(d));
asm("xor.b64 %0,%0,%1;" : "+r"(b) : "r"(t));
a=c;
c=b;
b=d;
asm("not.b64 %0,%1;" : "=r"(d) : "r"(t));....
x13 / cuda_x13_hamsi512.cu /
The macro `#define ROUND_BIG(rc, alpha) {` should be rewritten to operate on 64-bit integers.