Thanks - your contributions have made it easier for even a relative novice to make further modifications to the code and come out (slightly) ahead of enemy and trex, even before their fees.
Convert simd to use shared memory instead of a big memory buffer (d_temp4[thr_id]) and then you get a good speedup. this has already been done in the other private miners. I might opensource later on..
..
st.shared.u32 [%r1671], %r1666;
shfl.sync.up.b32 %r1672|%p370, %r162, %r304, %r1653, %r302;
selp.b32 %r1673, %r144, %r1672, %p92;
shfl.sync.up.b32 %r1674|%p371, %r154, %r304, %r1653, %r302;
selp.b32 %r1675, %r136, %r1674, %p92;
mul.lo.s32 %r1676, %r1673, 185;
mul.lo.s32 %r1677, %r1675, 185;
prmt.b32 %r1678, %r1676, %r1677, %r1661;
shfl.sync.idx.b32 %r1679|%p372, %r1678, %r1665, %r301, %r302;
st.shared.u32 [%r1671+4], %r1679;
shfl.sync.up.b32 %r1680|%p373, %r126, %r304, %r1653, %r302;
selp.b32 %r1681, %r108, %r1680, %p92;
shfl.sync.up.b32 %r1682|%p374, %r118, %r304, %r1653, %r302;
selp.b32 %r1683, %r100, %r1682, %p92;
mul.lo.s32 %r1684, %r1681, 185;
mul.lo.s32 %r1685, %r1683, 185;
prmt.b32 %r1686, %r1684, %r1685, %r1661;
shfl.sync.idx.b32 %r1687|%p375, %r1686, %r1665, %r301, %r302;
st.shared.u32 [%r1671+8], %r1687;
shfl.sync.up.b32 %r1688|%p376, %r1613, %r304, %r1653, %r302;
selp.b32 %r1689, %r180, %r1688, %p92;
shfl.sync.up.b32 %r1690|%p377, %r190, %r304, %r1653, %r302;
selp.b32 %r1691, %r172, %r1690, %p92;
mul.lo.s32 %r1692, %r1689, 185;
mul.lo.s32 %r1693, %r1691, 185;
prmt.b32 %r1694, %r1692, %r1693, %r1661;
shfl.sync.idx.b32 %r1695|%p378, %r1694, %r1665, %r301, %r302;
st.shared.u32 [%r1671+12], %r1695;
shfl.sync.up.b32 %r1696|%p379, %r91, %r304, %r1653, %r302;
selp.b32 %r1697, %r73, %r1696, %p92;
shfl.sync.up.b32 %r1698|%p380, %r3433, %r304, %r1653, %r302;
selp.b32 %r1699, %r3432, %r1698, %p92;
mul.lo.s32 %r1700, %r1697, 185;
mul.lo.s32 %r1701, %r1699, 185;
prmt.b32 %r1702, %r1700, %r1701, %r1661;
ld.const.u8 %r1703, [%r1664+8];
shfl.sync.idx.b32 %r1704|%p381, %r1702, %r1703, %r301, %r302;
st.shared.u32 [%r1671+128], %r1704;
...