I think no need to discuss this topic. We should just pray that noone will solve the problem how to simplify SALSA.
Praying is not very useful and obscurity is definitely not the right way to do things here
Even without sharing the sources yet, do you have any statistics for the number of 32-bit XOR, ADD and ROTATE operations before and after your optimization? Or even better would be the statistics for 128-bit SIMD operations (XOR, ADD, ROTATE, SHUFFLE).
I didn't count the statistics. I tried to simplify the algo using usual NAND/NOR-logic optimization techniques. (Un)fortunatelly I ran out of memory (I had only 16 Gb) so optimization wasn't completed.
If u wish u could try to do it urself using my implementation of [1024, 1, 1]-Scrypt. When I wrote it using SSE2 (calc 4 hashes at once) I got 2500 h/s from each GHz of an Intel CPU.
#define ROTR(x, n) ((x>>n)|(x<<(32-n)))
void Salsa(unsigned __int32 uiData[16])
{
unsigned __int32 uiA[16], i, j;
for (i=0; i<16; i++) uiA[i]=uiData[i];
for (i=0; i<4; i++)
{
uiA[4]^=ROTR((uiA[0]+uiA[12]), 25);
uiA[9]^=ROTR((uiA[5]+uiA[1]), 25);
uiA[14]^=ROTR((uiA[10]+uiA[6]), 25);
uiA[3]^=ROTR((uiA[15]+uiA[11]), 25);
uiA[8]^=ROTR((uiA[4]+uiA[0]), 23);
uiA[13]^=ROTR((uiA[9]+uiA[5]), 23);
uiA[2]^=ROTR((uiA[14]+uiA[10]), 23);
uiA[7]^=ROTR((uiA[3]+uiA[15]), 23);
uiA[12]^=ROTR((uiA[8]+uiA[4]), 19);
uiA[1]^=ROTR((uiA[13]+uiA[9]), 19);
uiA[6]^=ROTR((uiA[2]+uiA[14]), 19);
uiA[11]^=ROTR((uiA[7]+uiA[3]), 19);
uiA[0]^=ROTR((uiA[12]+uiA[8]), 14);
uiA[5]^=ROTR((uiA[1]+uiA[13]), 14);
uiA[10]^=ROTR((uiA[6]+uiA[2]), 14);
uiA[15]^=ROTR((uiA[11]+uiA[7]), 14);
uiA[1]^=ROTR((uiA[0]+uiA[3]), 25);
uiA[6]^=ROTR((uiA[5]+uiA[4]), 25);
uiA[11]^=ROTR((uiA[10]+uiA[9]), 25);
uiA[12]^=ROTR((uiA[15]+uiA[14]), 25);
uiA[2]^=ROTR((uiA[1]+uiA[0]), 23);
uiA[7]^=ROTR((uiA[6]+uiA[5]), 23);
uiA[8]^=ROTR((uiA[11]+uiA[10]), 23);
uiA[13]^=ROTR((uiA[12]+uiA[15]), 23);
uiA[3]^=ROTR((uiA[2]+uiA[1]), 19);
uiA[4]^=ROTR((uiA[7]+uiA[6]), 19);
uiA[9]^=ROTR((uiA[8]+uiA[11]), 19);
uiA[14]^=ROTR((uiA[13]+uiA[12]), 19);
uiA[0]^=ROTR((uiA[3]+uiA[2]), 14);
uiA[5]^=ROTR((uiA[4]+uiA[7]), 14);
uiA[10]^=ROTR((uiA[9]+uiA[8]), 14);
uiA[15]^=ROTR((uiA[14]+uiA[13]), 14);
}
for (i=0; i<16; i++) uiData[i]+=uiA[i];
}
unsigned __int32 uiStorage[32768+32];
..........................
..........................
..........................
for (i=0; i<32768; i+=16)
{
for (j=0; j<16; j++) uiStorage[i+32+j]=uiStorage[i+j]^uiStorage[i+16+j];
Salsa(&uiStorage[i+32]);
}
for (i=0; i<32768; i+=32)
{
for (j=0; j<16; j++) uiStorage[i+j]^=uiStorage[i+16+j];
}
for (i=0; i<32768; i+=16)
{
j=((uiStorage[32784]&1023)<<5)+(i&31);
for (k=0; k<16; k++) uiStorage[32768+(i&31)+k]=uiStorage[32768+k]^uiStorage[32784+k]^uiStorage[j+k];
Salsa(&uiStorage[32768+(i&31)]);
}
..........................
..........................
..........................
I can't post here my semi-optimized solution. It used some precomputing and required a lot of memory. Nothing special, but I decided to keep it in a cold dark place.