Have you a _ModSqrMontgomery function?
I would try to compute the inverse this way:
__device__ void _ModInv(uint64_t* a) {
uint64_t x2[4], x3[4], x6[4], x9[4], x11[4], x22[4], x44[4], x88[4], x176[4], x220[4], x223[4], t1[4];
uint8_t j;
/** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in
* { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
* [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
*/
_ModSqr(x2, a);
_ModMult(x2, a);
_ModSqr(x3, x2);
_ModMult(x3, a);
memcpy(x6,x3,32);
_ModSqr(x6);
_ModSqr(x6);
_ModSqr(x6);
_ModMult(x6, x3);
memcpy(x9,x6,32);
_ModSqr(x9);
_ModSqr(x9);
_ModSqr(x9);
_ModMult(x9, x3);
memcpy(x11,x9,32);
_ModSqr(x11);
_ModSqr(x11);
_ModMult(x11, x2);
memcpy(x22,x11,32);
for (j=0; j<11; j++) {
_ModSqr(x22);
}
_ModMult(x22, x11);
memcpy(x44,x22,32);
for (j=0; j<22; j++) {
_ModSqr(x44);
}
_ModMult(x44, x22);
memcpy(x88,x44,32);
for (j=0; j<44; j++) {
_ModSqr(x88);
}
_ModMult(x88, x44);
memcpy(x176,x88,32);
for (j=0; j<88; j++) {
_ModSqr(x176);
}
_ModMult(x176, x88);
memcpy(x220,x176,32);
for (j=0; j<44; j++) {
_ModSqr(x220);
}
_ModMult(x220, x44);
memcpy(x223,x220,32);
_ModSqr(x223);
_ModSqr(x223);
_ModSqr(x223);
_ModMult(x223, x3);
/* The final result is then assembled using a sliding window over the blocks. */
memcpy(t1,x223,32);
for (j=0; j<23; j++) {
_ModSqr(t1);
}
_ModMult(t1, x22);
_ModSqr(t1);
_ModSqr(t1);
_ModSqr(t1);
_ModSqr(t1);
_ModSqr(t1);
_ModMult(t1, a);
_ModSqr(t1);
_ModSqr(t1);
_ModSqr(t1);
_ModMult(t1, t1, x2);
_ModSqr(t1);
_ModSqr(t1);
_ModMult(a, t1);
}