There were errors. Now it should work:
// Modular squaring of a 256-bit value held as four 64-bit limbs, least
// significant limb first (up[0] is the low word, matching how the columns
// below are accumulated).
//
//   rp[0..3] = up[0..3]^2  (mod 2^256 - 0x1000003d1)
//
// Parameters:
//   rp - output, 4 limbs. May alias `up`: all inputs are read into locals
//        before anything is stored.
//   up - input, 4 limbs.
//
// The result is congruent to the square but is NOT guaranteed to be the
// canonical representative below the modulus (for example up = p - 1 yields
// p + 1).
//
// NOTE(review): UMULLO/UMULHI/UADDO1/UADDC1/UADD1 are defined outside this
// block. The code assumes the usual PTX mapping: UMULLO/UMULHI = low/high
// 64 bits of a 64x64 product, UADDO1 = add and set carry, UADDC1 = add with
// carry-in and set carry, UADD1 = add with carry-in without updating carry.
// Statement order inside every carry chain therefore matters; do not reorder.
// NOTE(review): 0x1000003d1 = 2^32 + 977, so the modulus is presumably the
// secp256k1 field prime p = 2^256 - 2^32 - 977 - confirm against the caller.
__device__ void _ModSqr(uint64_t *rp, const uint64_t *up) {
    // 2^256 is congruent to this constant modulo the prime.
    const uint64_t foldK = 0x1000003d1ULL;

    uint64_t u0 = up[0];
    uint64_t u1 = up[1];
    uint64_t u2 = up[2];
    uint64_t u3 = up[3];
    uint64_t u10, u11;
    // (r0,r1) and (r3,r4) alternate as the low/high words of the column
    // being accumulated; t1/t2 alternate as the third (overflow) word.
    uint64_t r0 = 0x0ULL;
    uint64_t r1 = 0x0ULL;
    uint64_t r3 = 0x0ULL;
    uint64_t r4 = 0x0ULL;
    uint64_t t1 = 0x0ULL;
    uint64_t t2 = 0x0ULL;
    uint64_t c = 0x0ULL;   // receives the carry out of the final fold
    uint64_t s0, s1, s2, s3, s4, s5, s6, s7;   // 512-bit square, low to high
    uint64_t z1, z2, z3, z4, z5, z6, z7, z8;   // partial products of the fold

    // k=0: u0*u0
    UMULLO(s0, u0, u0);
    UMULHI(r1, u0, u0);

    // k=1: 2*u0*u1 + carry-in (r1)
    UMULLO(r3, u0, u1);
    UMULHI(r4, u0, u1);
    UADDO1(r3, r3);
    UADDC1(r4, r4);
    UADD1(t1, 0x0ULL);
    UADDO1(r3, r1);
    UADDC1(r4, 0x0ULL);
    UADD1(t1, 0x0ULL);
    s1 = r3;

    // k=2: 2*u0*u2 + u1*u1 + carry-in (r4, t1)
    UMULLO(r0, u0, u2);
    UMULHI(r1, u0, u2);
    UADDO1(r0, r0);
    UADDC1(r1, r1);
    UADD1(t2, 0x0ULL);
    UMULLO(u10, u1, u1);
    UMULHI(u11, u1, u1);
    UADDO1(r0, u10);
    UADDC1(r1, u11);
    UADD1(t2, 0x0ULL);
    UADDO1(r0, r4);
    UADDC1(r1, t1);
    UADD1(t2, 0x0ULL);
    s2 = r0;
    t1 = 0;

    // k=3: 2*(u0*u3 + u1*u2) + carry-in (r1, t2)
    UMULLO(r3, u0, u3);
    UMULHI(r4, u0, u3);
    UMULLO(u10, u1, u2);
    UMULHI(u11, u1, u2);
    UADDO1(r3, u10);
    UADDC1(r4, u11);
    UADD1(t1, 0x0ULL);
    // Double the three-word sum (r3, r4, t1): the top word is doubled in
    // plain C first, then the carry out of the two lower words is added.
    t1 += t1;
    UADDO1(r3, r3);
    UADDC1(r4, r4);
    UADD1(t1, 0x0ULL);
    UADDO1(r3, r1);
    UADDC1(r4, t2);
    UADD1(t1, 0x0ULL);
    s3 = r3;
    t2 = 0;

    // k=4: 2*u1*u3 + u2*u2 + carry-in (r4, t1)
    UMULLO(r0, u1, u3);
    UMULHI(r1, u1, u3);
    UADDO1(r0, r0);
    UADDC1(r1, r1);
    UADD1(t2, 0x0ULL);
    UMULLO(u10, u2, u2);
    UMULHI(u11, u2, u2);
    UADDO1(r0, u10);
    UADDC1(r1, u11);
    UADD1(t2, 0x0ULL);
    UADDO1(r0, r4);
    UADDC1(r1, t1);
    UADD1(t2, 0x0ULL);
    s4 = r0;
    t1 = 0;

    // k=5: 2*u2*u3 + carry-in (r1, t2)
    UMULLO(r3, u2, u3);
    UMULHI(r4, u2, u3);
    UADDO1(r3, r3);
    UADDC1(r4, r4);
    UADD1(t1, 0x0ULL);
    UADDO1(r3, r1);
    UADDC1(r4, t2);
    UADD1(t1, 0x0ULL);
    s5 = r3;

    // k=6: u3*u3 + carry-in (r4, t1)
    UMULLO(r0, u3, u3);
    UMULHI(r1, u3, u3);
    UADDO1(r0, r4);
    UADD1(r1, t1);
    s6 = r0;

    // k=7
    s7 = r1;

    // Reduction, step 1 (512 -> 320 bits):
    //   (s0..s3) + (s4..s7) * foldK
    // with (z1,z2) = s4*foldK, (z3,z4) = s5*foldK, (z5,z6) = s6*foldK,
    // (z7,z8) = s7*foldK. The result ends up in z1, z3, z5, z7 (low to high)
    // with the overflow word in z8.
    UMULLO(z3, s5, foldK);
    UMULHI(z4, s5, foldK);
    UMULLO(z5, s6, foldK);
    UMULHI(z6, s6, foldK);
    UMULLO(z7, s7, foldK);
    UMULHI(z8, s7, foldK);
    UMULLO(z1, s4, foldK);
    UMULHI(z2, s4, foldK);
    UADDO1(z1, s0);
    UADD1(z2, 0x0ULL);     // z2 is a high half of x*foldK, so +1 cannot wrap
    UADDO1(z2, s1);
    UADDC1(z4, s2);
    UADDC1(z6, s3);
    UADD1(z8, 0x0ULL);
    UADDO1(z3, z2);
    UADDC1(z5, z4);
    UADDC1(z7, z6);
    UADD1(z8, 0x0ULL);

    // Reduction, step 2 (320 -> 256 bits): fold z8 * 2^256 as z8 * foldK.
    // The chain ends with UADDC1 + UADD1(c, 0) so that the carry out of
    // bit 255 is captured in c instead of being discarded.
    UMULLO(u10, z8, foldK);
    UMULHI(u11, z8, foldK);
    UADDO1(z1, u10);
    UADDC1(z3, u11);
    UADDC1(z5, 0x0ULL);
    UADDC1(z7, 0x0ULL);
    UADD1(c, 0x0ULL);

    // Reduction, step 3: if step 2 wrapped past 2^256 (c == 1), the dropped
    // 2^256 is worth foldK modulo the prime, so add it back. z8 is at most
    // about foldK, hence a wrapped value is below ~2^67 and this addition
    // cannot carry out again. Branch-free: u10 is foldK when c == 1, else 0.
    u10 = (0x0ULL - c) & foldK;
    UADDO1(z1, u10);
    UADDC1(z3, 0x0ULL);
    UADDC1(z5, 0x0ULL);
    UADD1(z7, 0x0ULL);

    rp[0] = z1;
    rp[1] = z3;
    rp[2] = z5;
    rp[3] = z7;
}