I found a C++ source for scrypt_core() and put it in scrypt_mine.cpp:
#if defined(__aarch64__)
/*
 * xor_salsa8 - XOR Bx into B, then scramble B in place.
 *
 * B:  16-word block; read and overwritten with the result.
 * Bx: 16-word block; only read.
 *
 * Steps:
 *   1. B[k] ^= Bx[k] for every word, keeping a working copy of the result.
 *   2. Run four double-rounds (a column pass followed by a row pass, eight
 *      rounds in total) of add / rotate-left / XOR mixing on the working
 *      copy, using rotation amounts 7, 9, 13 and 18. This is the round
 *      structure of the Salsa20 core, as the function name suggests.
 *   3. Add the mixed working copy word-by-word back into B (mod 2^32).
 */
static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
{
/* Rotate the 32-bit value v left by c bits (0 < c < 32). */
#define XS8_ROTL(v, c) (((v) << (c)) | ((v) >> (32 - (c))))
/*
 * One quarter-round over working words w[a], w[b], w[c], w[d].
 * Each step feeds the word updated by the previous step.
 */
#define XS8_QUARTER(a, b, c, d)                          \
	do {                                             \
		w[b] ^= XS8_ROTL(w[a] + w[d], 7);        \
		w[c] ^= XS8_ROTL(w[b] + w[a], 9);        \
		w[d] ^= XS8_ROTL(w[c] + w[b], 13);       \
		w[a] ^= XS8_ROTL(w[d] + w[c], 18);       \
	} while (0)

	uint32_t w[16];
	int k;
	int pass;

	/* Fold Bx into B and take the working copy in the same sweep. */
	for (k = 0; k < 16; k++)
		w[k] = (B[k] ^= Bx[k]);

	/* Four double-rounds; the quarter-rounds within a pass touch
	 * disjoint words, so their relative order does not matter. */
	for (pass = 0; pass < 4; pass++) {
		/* Column pass. */
		XS8_QUARTER( 0,  4,  8, 12);
		XS8_QUARTER( 5,  9, 13,  1);
		XS8_QUARTER(10, 14,  2,  6);
		XS8_QUARTER(15,  3,  7, 11);

		/* Row pass. */
		XS8_QUARTER( 0,  1,  2,  3);
		XS8_QUARTER( 5,  6,  7,  4);
		XS8_QUARTER(10, 11,  8,  9);
		XS8_QUARTER(15, 12, 13, 14);
	}

	/* Feed-forward: add the mixed words onto the XORed input. */
	for (k = 0; k < 16; k++)
		B[k] += w[k];

#undef XS8_QUARTER
#undef XS8_ROTL
}
/*
 * scrypt_core - two-phase mixing of the 32-word state X using scratch V.
 *
 * X: 32-word (128-byte) working state, updated in place.
 * V: scratch buffer. Phase 1 stores 1024 consecutive 32-word snapshots
 *    of X in it, so it must have room for 1024 * 32 uint32_t values.
 *
 * Phase 1: save X into the next slot of V, then mix the two 16-word
 *          halves of X into each other with xor_salsa8().
 * Phase 2: pick a slot of V from the low bits of X[16], XOR that slot
 *          into X, then mix the halves again.
 */
static inline void scrypt_core(uint32_t *X, uint32_t *V)
{
	/* Iteration count; a power of two, so (rounds - 1) works as a mask. */
	const uint32_t rounds = 1024;
	/* Size of the state, and of each slot in V, in 32-bit words. */
	const uint32_t words = 32;
	uint32_t step;
	uint32_t w;

	/* Phase 1: fill V with successive states of X. */
	for (step = 0; step < rounds; step++) {
		memcpy(V + step * words, X, words * sizeof *X);
		xor_salsa8(X, X + 16);
		xor_salsa8(X + 16, X);
	}

	/* Phase 2: data-dependent reads back from V. */
	for (step = 0; step < rounds; step++) {
		const uint32_t *slot = V + words * (X[16] & (rounds - 1));

		for (w = 0; w < words; w++)
			X[w] ^= slot[w];

		xor_salsa8(X, X + 16);
		xor_salsa8(X + 16, X);
	}
}
#endif
The code compiles and is syncing ... it is a 64-bit build with no assembly code.
Now I can have the compiler generate an intermediate .S file and I can optimize it.
Synced up. CPU use dropped to about 30%. Is that about normal on a Linux build?
I am compiling with -O3 to see if that makes a difference ...
top - 05:46:50 up 4:30, 3 users, load average: 0.27, 0.36, 0.65
Tasks: 157 total, 2 running, 155 sleeping, 0 stopped, 0 zombie
%Cpu(s): 0.2 us, 0.6 sy, 7.0 ni, 92.2 id, 0.1 wa, 0.0 hi, 0.0 si, 0.0 st
KiB Mem: 1760360 total, 932260 used, 828100 free, 4112 buffers
KiB Swap: 2047996 total, 128036 used, 1919960 free. 112292 cached Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
2362 odroid 20 0 1443020 627508 9444 S 29.9 35.6 56:20.24 diamond-qt-O3
Swap barely got touched despite all the compiling I've been doing ... it will probably never be touched after a reboot.
Disk usage
odroid@odroid64:~$ df
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/mmcblk0p2 29940700 3873036 26049272 13% /
udev 10240 0 10240 0% /dev
tmpfs 352072 11624 340448 4% /run
tmpfs 880180 80 880100 1% /dev/shm
tmpfs 5120 4 5116 1% /run/lock
tmpfs 880180 0 880180 0% /sys/fs/cgroup
/dev/mmcblk0p1 130796 16394 114402 13% /media/boot
/dev/mmcblk1p2 28528640 2856728 24199656 11% /home
tmpfs 176036 4 176032 1% /run/user/115
tmpfs 176036 16 176020 1% /run/user/1000
tmpfs 176036 0 176036 0% /run/user/0