Thanks for keeping my toys profitable this week. This isn't a particularly effective patch — a sub-0.1% improvement on a GTX 750 Ti — but it's a little cleaner. (I'm not sure whether this part is the speedup or some other changes I made that I'll submit separately, but I figure the cleanup is worthwhile anyway.) The only real substance in here is ensuring that the temporary variable for the swaps is scoped more tightly; the rest just shifts them to using type-safe inline functions instead of the existing macros.
diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu
index fa81e83..67786c0 100644
--- a/bitslice_transformations_quad.cu
+++ b/bitslice_transformations_quad.cu
@@ -10,46 +10,53 @@
#define merge8(z, x, y, b)\
z=__byte_perm(x, y, b); \
-#define SWAP8(x,y)\
- x=__byte_perm(x, y, 0x5410); \
- y=__byte_perm(x, y, 0x7632);
-
-#define SWAP4(x,y)\
- t = (y<<4); \
- t = (x ^ t); \
- t = 0xf0f0f0f0UL & t; \
- x = (x ^ t); \
- t= t>>4;\
- y= y ^ t;
-
-#define SWAP4_final(x,y)\
- t = (y<<4); \
- t = (x ^ t); \
- t = 0xf0f0f0f0UL & t; \
- x = (x ^ t); \
-
-
-#define SWAP2(x,y)\
- t = (y<<2); \
- t = (x ^ t); \
- t = 0xccccccccUL & t; \
- x = (x ^ t); \
- t= t>>2;\
- y= y ^ t;
-
-#define SWAP1(x,y)\
- t = (y+y); \
- t = (x ^ t); \
- t = 0xaaaaaaaaUL & t; \
- x = (x ^ t); \
- t= t>>1;\
- y= y ^ t;
+__device__ __forceinline__
+void SWAP8(uint32_t &x, uint32_t &y) {
+ x = __byte_perm(x, y, 0x5410);
+ y = __byte_perm(x, y, 0x7632);
+}
+
+__device__ __forceinline__
+void SWAP4(uint32_t &x, uint32_t &y) {
+ uint32_t t = (y<<4) ^ x;
+ t = 0xf0f0f0f0UL & t;
+ x = (x ^ t);
+ t = t>>4;
+ y = y ^ t;
+}
+
+__device__ __forceinline__
+void SWAP4_final(uint32_t &x, const uint32_t y) {
+ uint32_t t = (y<<4);
+ t = (x ^ t);
+ t = 0xf0f0f0f0UL & t;
+ x = (x ^ t);
+}
+
+__device__ __forceinline__
+void SWAP2(uint32_t &x, uint32_t &y) {
+ uint32_t t = (y<<2);
+ t = (x ^ t);
+ t = 0xccccccccUL & t;
+ x = (x ^ t);
+ t = t>>2;
+ y = y ^ t;
+}
+
+__device__ __forceinline__
+void SWAP1(uint32_t &x, uint32_t &y) {
+ uint32_t t = (y+y);
+ t = (x ^ t);
+ t = 0xaaaaaaaaUL & t;
+ x = (x ^ t);
+ t = t>>1;
+ y = y ^ t;
+}
__device__ __forceinline__
void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
{
uint32_t other[8];
- uint32_t t;
uint32_t perm = (threadIdx.x & 1) ? 0x7362 : 0x5140;
const unsigned int n = threadIdx.x & 3;
@@ -90,7 +97,6 @@ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __rest
__device__ __forceinline__
void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
{
- uint32_t t;
const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 0x3175 : 0x7531;
output[0] = __byte_perm(input[0], input[4], perm);
@@ -158,7 +164,6 @@ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *cons
__device__ __forceinline__
void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
{
- uint32_t t;
const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 0x3175 : 0x7531;
if (threadIdx.x & 3)
And to groestl functions:
diff --git a/groestl_functions_quad.cu b/groestl_functions_quad.cu
index c39e81d..5b1cdb1 100644
--- a/groestl_functions_quad.cu
+++ b/groestl_functions_quad.cu
@@ -54,11 +56,9 @@ __device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32
__device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0,
const uint32_t &y3, const uint32_t &y2, const uint32_t &y1, const uint32_t &y0)
{
- uint32_t t0,t1,t2;
-
- t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1));
- t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0;
- t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1;
+ uint32_t t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1));
+ uint32_t t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0;
+ uint32_t t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1;
t0 = (x2^x3) & (y2^y3);
x3 = (x3 & y3) ^ t0 ^ t1;
@@ -71,26 +71,24 @@ __device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t
__device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_
{
- uint32_t t0,t1,t2,t3,t4,t5,t6,a,b;
-
- t3 = x7;
- t2 = x6;
- t1 = x5;
- t0 = x4;
+ uint32_t t3 = x7;
+ uint32_t t2 = x6;
+ uint32_t t1 = x5;
+ uint32_t t0 = x4;
G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0);
- a = (x4 ^ x0);
+ uint32_t a = (x4 ^ x0);
t0 ^= a;
t2 ^= (x7 ^ x3) ^ (x5 ^ x1);
t1 ^= (x5 ^ x1) ^ a;
t3 ^= (x6 ^ x2) ^ a;
- b = t0 ^ t1;
- t4 = (t2 ^ t3) & b;
+ uint32_t b = t0 ^ t1;
+ uint32_t t4 = (t2 ^ t3) & b;
a = t4 ^ t3 ^ t1;
- t5 = (t3 & t1) ^ a;
- t6 = (t2 & t0) ^ a ^ (t2 ^ t0);
+ uint32_t t5 = (t3 & t1) ^ a;
+ uint32_t t6 = (t2 & t0) ^ a ^ (t2 ^ t0);
t4 = (t5 ^ t6) & b;
t1 = (t6 & t1) ^ t4;
@@ -107,9 +105,8 @@ __device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32
__device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32
{
- uint32_t t0, t1;
- t0 = x0 ^ x1 ^ x2;
- t1 = x5 ^ x6;
+ uint32_t t0 = x0 ^ x1 ^ x2;
+ uint32_t t1 = x5 ^ x6;
x2 = t0 ^ t1 ^ x7;
x6 = t0 ^ x3 ^ x6;
x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7;
@@ -122,19 +119,17 @@ __device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint3
__device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32
{
- uint32_t t0,t2,t3,t5;
-
x1 ^= x4;
- t0 = x1 ^ x6;
+ uint32_t t0 = x1 ^ x6;
x1 ^= x5;
- t2 = x0 ^ x2;
+ uint32_t t2 = x0 ^ x2;
x2 = x3 ^ x5;
t2 ^= x2 ^ x6;
x2 ^= x7;
- t3 = x4 ^ x2 ^ x6;
+ uint32_t t3 = x4 ^ x2 ^ x6;
- t5 = x0 ^ x6;
+ uint32_t t5 = x0 ^ x6;
x4 = x3 ^ x7;
x0 = x3 ^ x5;
@@ -160,14 +155,12 @@ __device__ __forceinline__ void sbox_quad(uint32_t *const r)
__device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4,
{
- uint32_t t0,t1;
-
const uint32_t tpos = threadIdx.x & 0x03;
const uint32_t shift1 = tpos << 1;
const uint32_t shift2 = shift1 + 1 + ((tpos == 3) << 2);
- t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
- t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
+ uint32_t t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
+ uint32_t t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
x0 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
@@ -201,14 +194,12 @@ __device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6
__device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4,
{
- uint32_t t0,t1;
-
const uint32_t tpos = threadIdx.x & 0x03;
const uint32_t shift1 = (1 - (tpos >> 1)) + ((tpos & 0x01) << 2);
const uint32_t shift2 = shift1 + 2 + ((tpos == 1) << 2);
- t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
- t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
+ uint32_t t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
+ uint32_t t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
x0 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
Cheers.