32 #ifndef _threefry_dot_h_
33 #define _threefry_dot_h_
/*
 * Rotation amounts for the 4-word, 64-bit-word variant.
 *
 * Naming: R_64x4_<d>_<j>, with <d> = 0..7 and <j> = 0 or 1.
 * _threefry4x_tpl(W) builds these names by token pasting
 * (R_##W##x4_<d>_<j>) and passes them as the rotate count to RotL_##W;
 * in the round sequence <d> steps 0,1,...,7 and then starts over, and the
 * two <j> values are used by the two mix statements of the same round.
 */
66 enum r123_enum_threefry64x4 {
69 R_64x4_0_0=14, R_64x4_0_1=16,
70 R_64x4_1_0=52, R_64x4_1_1=57,
71 R_64x4_2_0=23, R_64x4_2_1=40,
72 R_64x4_3_0= 5, R_64x4_3_1=37,
73 R_64x4_4_0=25, R_64x4_4_1=33,
74 R_64x4_5_0=46, R_64x4_5_1=12,
75 R_64x4_6_0=58, R_64x4_6_1=22,
76 R_64x4_7_0=32, R_64x4_7_1=32
/*
 * Rotation amounts for the 2-word, 64-bit-word variant.
 * NOTE(review): the enumerators are not visible in this listing.
 * _threefry2x_tpl(W) references R_##W##x2_<d>_0 for <d> = 0..7, so with
 * W = 64 it presumably expects R_64x2_0_0 .. R_64x2_7_0 to be defined
 * here - confirm against the full header.
 */
79 enum r123_enum_threefry64x2 {
/*
 * Rotation amounts for the 4-word, 32-bit-word variant.
 *
 * Naming: R_32x4_<d>_<j>, with <d> = 0..7 and <j> = 0 or 1.
 * _threefry4x_tpl(W) builds these names by token pasting
 * (R_##W##x4_<d>_<j>) and passes them as the rotate count to RotL_##W.
 * Every value is below 32, i.e. a non-trivial rotation of a 32-bit word.
 */
104 enum r123_enum_threefry32x4 {
110 R_32x4_0_0=10, R_32x4_0_1=26,
111 R_32x4_1_0=11, R_32x4_1_1=21,
112 R_32x4_2_0=13, R_32x4_2_1=27,
113 R_32x4_3_0=23, R_32x4_3_1= 5,
114 R_32x4_4_0= 6, R_32x4_4_1=20,
115 R_32x4_5_0=17, R_32x4_5_1=11,
116 R_32x4_6_0=25, R_32x4_6_1=10,
117 R_32x4_7_0=18, R_32x4_7_1=20
/*
 * Rotation amounts for the 2-word, 32-bit-word variant.
 * NOTE(review): the enumerators are not visible in this listing.
 * _threefry2x_tpl(W) references R_##W##x2_<d>_0 for <d> = 0..7, so with
 * W = 32 it presumably expects R_32x2_0_0 .. R_32x2_7_0 to be defined
 * here - confirm against the full header.
 */
130 enum r123_enum_threefry32x2 {
/*
 * NOTE(review): only the opening line of this enum is visible in this
 * listing, and none of the visible code refers to it by name. Confirm its
 * enumerators and purpose against the full header before relying on it.
 */
154 enum r123_enum_threefry_wcnt {
160 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x,
unsigned int N));
161 R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x,
unsigned int N)
163 return (x << (N & 63)) | (x >> ((64-N) & 63));
167 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x,
unsigned int N));
168 R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x,
unsigned int N)
170 return (x << (N & 31)) | (x >> ((32-N) & 31));
/*
 * SKEIN_MK_64(hi32,lo32): builds a 64-bit constant from two halves. hi32 is
 * widened to uint64_t and shifted into the upper 32 bits; lo32 is then
 * added (not OR-ed), so it is expected to fit in 32 bits.
 *
 * SKEIN_KS_PARITY64 / SKEIN_KS_PARITY32: selected by word width through
 * SKEIN_KS_PARITY##W in the _threefry2x_tpl / _threefry4x_tpl macros, where
 * they are the starting value of the last key-schedule word (ks2 in the
 * 2-word body, ks4 in the 4-word body). The 32-bit constant equals the
 * upper half of the 64-bit one.
 */
173 #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
174 #define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
175 #define SKEIN_KS_PARITY32 0x1BD11BDA
/*
 * Default round counts, one macro per variant (all four default to 20).
 * Each definition sits behind #ifndef, so a value defined before this
 * point (for example on the compiler command line) takes precedence.
 * The _threefry2x_tpl / _threefry4x_tpl macros turn these into the enum
 * constants threefryNxW_rounds used by the threefryNxW() wrappers.
 * NOTE(review): the matching #endif lines are not visible in this listing.
 */
179 #ifndef THREEFRY2x32_DEFAULT_ROUNDS
180 #define THREEFRY2x32_DEFAULT_ROUNDS 20
183 #ifndef THREEFRY2x64_DEFAULT_ROUNDS
184 #define THREEFRY2x64_DEFAULT_ROUNDS 20
187 #ifndef THREEFRY4x32_DEFAULT_ROUNDS
188 #define THREEFRY4x32_DEFAULT_ROUNDS 20
191 #ifndef THREEFRY4x64_DEFAULT_ROUNDS
192 #define THREEFRY4x64_DEFAULT_ROUNDS 20
/*
 * _threefry2x_tpl(W): generates the 2-word Threefry API for word width W.
 *
 * For a given W the expansion provides:
 *   - threefry2xW_ctr_t, threefry2xW_key_t, threefry2xW_ukey_t
 *       all typedefs of struct r123array2xW;
 *   - threefry2xWkeyinit(uk)
 *       returns the user key unchanged;
 *   - threefry2xW_R(Nrounds, in, k)
 *       the fully unrolled round function (see below);
 *   - threefry2xW_rounds
 *       enum constant equal to THREEFRY2xW_DEFAULT_ROUNDS;
 *   - threefry2xW(in, k)
 *       calls threefry2xW_R with threefry2xW_rounds.
 *
 * Shape of threefry2xW_R as visible here:
 *   - R123_ASSERT(Nrounds<=32): at most 32 rounds are unrolled.
 *   - ks2 starts as SKEIN_KS_PARITY##W; X0/X1 start as in.v[0]+ks0 and
 *     in.v[1]+ks1.
 *   - Round r (0-based) runs only when Nrounds > r and performs
 *       X0 += X1;  X1 = RotL_W(X1, R_Wx2_<r mod 8>_0);  X1 ^= X0;
 *   - After every group of four rounds two key-schedule words are added to
 *     X0/X1, cycling through (ks1,ks2), (ks2,ks0), (ks0,ks1).
 *   - The result is packed as {X0, X1} into `ret`.
 *
 * NOTE(review): this listing does not show the declaration of X0/X1, the
 * loads of ks0/ks1 from k, any guard or per-injection counter around the
 * key additions, or the return statement; they are presumably on lines
 * missing from the listing - confirm against the full header.
 *
 * NOTE(review): the C++ wrapper template later in this file static-asserts
 * ROUNDS<=72 for every NxW, while this 2-word body asserts Nrounds<=32.
 * A 2x instantiation with 32 < ROUNDS <= 72 would pass the static assert
 * and only trip the runtime R123_ASSERT - confirm whether that is intended.
 */
195 #define _threefry2x_tpl(W) \
196 typedef struct r123array2x##W threefry2x##W##_ctr_t; \
197 typedef struct r123array2x##W threefry2x##W##_key_t; \
198 typedef struct r123array2x##W threefry2x##W##_ukey_t; \
199 R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
200 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
201 R123_CUDA_DEVICE R123_STATIC_INLINE \
202 threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
204 uint##W##_t ks0, ks1, ks2; \
205 R123_ASSERT(Nrounds<=32); \
206 ks2 = SKEIN_KS_PARITY##W; \
208 X0 = in.v[0] + ks0; \
212 X1 = in.v[1] + ks1; \
215 if(Nrounds>0){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
216 if(Nrounds>1){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
217 if(Nrounds>2){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
218 if(Nrounds>3){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
221 X0 += ks1; X1 += ks2; \
224 if(Nrounds>4){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
225 if(Nrounds>5){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
226 if(Nrounds>6){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
227 if(Nrounds>7){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
230 X0 += ks2; X1 += ks0; \
233 if(Nrounds>8){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
234 if(Nrounds>9){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
235 if(Nrounds>10){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
236 if(Nrounds>11){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
239 X0 += ks0; X1 += ks1; \
242 if(Nrounds>12){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
243 if(Nrounds>13){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
244 if(Nrounds>14){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
245 if(Nrounds>15){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
248 X0 += ks1; X1 += ks2; \
251 if(Nrounds>16){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
252 if(Nrounds>17){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
253 if(Nrounds>18){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
254 if(Nrounds>19){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
257 X0 += ks2; X1 += ks0; \
260 if(Nrounds>20){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
261 if(Nrounds>21){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
262 if(Nrounds>22){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
263 if(Nrounds>23){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
266 X0 += ks0; X1 += ks1; \
269 if(Nrounds>24){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_0_0); X1 ^= X0; } \
270 if(Nrounds>25){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_1_0); X1 ^= X0; } \
271 if(Nrounds>26){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_2_0); X1 ^= X0; } \
272 if(Nrounds>27){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_3_0); X1 ^= X0; } \
275 X0 += ks1; X1 += ks2; \
278 if(Nrounds>28){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_4_0); X1 ^= X0; } \
279 if(Nrounds>29){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_5_0); X1 ^= X0; } \
280 if(Nrounds>30){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_6_0); X1 ^= X0; } \
281 if(Nrounds>31){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_7_0); X1 ^= X0; } \
284 X0 += ks2; X1 += ks0; \
287 threefry2x##W##_ctr_t ret={{X0, X1}}; \
291 enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
292 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
293 R123_CUDA_DEVICE R123_STATIC_INLINE \
294 threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
295 return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
/*
 * _threefry4x_tpl(W): generates the 4-word Threefry API for word width W.
 *
 * For a given W the expansion provides:
 *   - threefry4xW_ctr_t, threefry4xW_key_t, threefry4xW_ukey_t
 *       all typedefs of struct r123array4xW;
 *   - threefry4xWkeyinit(uk)
 *       returns the user key unchanged;
 *   - threefry4xW_R(Nrounds, in, k)
 *       the fully unrolled round function (see below);
 *   - threefry4xW_rounds
 *       enum constant equal to THREEFRY4xW_DEFAULT_ROUNDS;
 *   - threefry4xW(in, k)
 *       calls threefry4xW_R with threefry4xW_rounds.
 *
 * Shape of threefry4xW_R as visible here:
 *   - R123_ASSERT(Nrounds<=72): at most 72 rounds are unrolled.
 *   - ks4 starts as SKEIN_KS_PARITY##W; X0..X3 start as in.v[i] + ks<i>.
 *   - Each round is two mix statements of the form
 *       A += B;  B = RotL_W(B, R_Wx4_<d>_<j>);  B ^= A;
 *     Rounds alternate between pairing (X0,X1),(X2,X3) and pairing
 *     (X0,X3),(X2,X1); <d> steps 0..7 and then repeats, <j> is 0 for the
 *     first statement of a round and 1 for the second.
 *   - After every four rounds, four of the five key-schedule words
 *     ks0..ks4 are added to X0..X3. The starting word advances by one
 *     each time (ks1.., ks2.., ks3.., ks4.., ks0.., ...), wrapping modulo 5.
 *     Eighteen such additions are visible, matching the 72-round bound.
 *   - The result is packed as {X0, X1, X2, X3} into `ret`.
 *
 * NOTE(review): unlike the 2-word body, no `if(Nrounds>r)` guards are
 * visible around the rounds in this listing, nor are the loads of
 * ks0..ks3 from k, any per-injection counter, or the return statement.
 * They are presumably on lines missing from the listing - confirm against
 * the full header before editing the round sequence.
 */
299 #define _threefry4x_tpl(W) \
300 typedef struct r123array4x##W threefry4x##W##_ctr_t; \
301 typedef struct r123array4x##W threefry4x##W##_key_t; \
302 typedef struct r123array4x##W threefry4x##W##_ukey_t; \
303 R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
304 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
305 R123_CUDA_DEVICE R123_STATIC_INLINE \
306 threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
307 uint##W##_t X0, X1, X2, X3; \
308 uint##W##_t ks0, ks1, ks2, ks3, ks4; \
309 R123_ASSERT(Nrounds<=72); \
310 ks4 = SKEIN_KS_PARITY##W; \
312 X0 = in.v[0] + ks0; \
316 X1 = in.v[1] + ks1; \
320 X2 = in.v[2] + ks2; \
324 X3 = in.v[3] + ks3; \
328 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
329 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
332 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
333 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
336 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
337 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
340 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
341 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
345 X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
350 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
351 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
354 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
355 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
358 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
359 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
362 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
363 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
367 X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
372 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
373 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
376 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
377 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
380 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
381 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
384 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
385 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
389 X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
394 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
395 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
398 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
399 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
402 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
403 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
406 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
407 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
411 X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
416 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
417 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
420 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
421 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
424 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
425 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
428 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
429 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
433 X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
438 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
439 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
442 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
443 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
446 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
447 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
450 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
451 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
455 X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
460 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
461 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
464 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
465 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
468 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
469 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
472 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
473 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
477 X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
482 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
483 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
486 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
487 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
490 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
491 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
494 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
495 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
499 X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
504 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
505 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
508 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
509 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
512 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
513 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
516 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
517 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
521 X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
526 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
527 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
530 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
531 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
534 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
535 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
538 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
539 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
543 X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
548 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
549 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
552 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
553 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
556 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
557 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
560 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
561 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
565 X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
570 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
571 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
574 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
575 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
578 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
579 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
582 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
583 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
587 X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
592 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
593 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
596 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
597 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
600 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
601 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
604 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
605 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
609 X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
614 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
615 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
618 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
619 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
622 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
623 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
626 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
627 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
631 X0 += ks4; X1 += ks0; X2 += ks1; X3 += ks2; \
636 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
637 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
640 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
641 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
644 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
645 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
648 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
649 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
653 X0 += ks0; X1 += ks1; X2 += ks2; X3 += ks3; \
658 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
659 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
662 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
663 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
666 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
667 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
670 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
671 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
675 X0 += ks1; X1 += ks2; X2 += ks3; X3 += ks4; \
680 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_0_0); X1 ^= X0; \
681 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_0_1); X3 ^= X2; \
684 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_1_0); X3 ^= X0; \
685 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_1_1); X1 ^= X2; \
688 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_2_0); X1 ^= X0; \
689 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_2_1); X3 ^= X2; \
692 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_3_0); X3 ^= X0; \
693 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_3_1); X1 ^= X2; \
697 X0 += ks2; X1 += ks3; X2 += ks4; X3 += ks0; \
702 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_4_0); X1 ^= X0; \
703 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_4_1); X3 ^= X2; \
706 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_5_0); X3 ^= X0; \
707 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_5_1); X1 ^= X2; \
710 X0 += X1; X1 = RotL_##W(X1,R_##W##x4_6_0); X1 ^= X0; \
711 X2 += X3; X3 = RotL_##W(X3,R_##W##x4_6_1); X3 ^= X2; \
714 X0 += X3; X3 = RotL_##W(X3,R_##W##x4_7_0); X3 ^= X0; \
715 X2 += X1; X1 = RotL_##W(X1,R_##W##x4_7_1); X1 ^= X2; \
719 X0 += ks3; X1 += ks4; X2 += ks0; X3 += ks1; \
723 threefry4x##W##_ctr_t ret = {{X0, X1, X2, X3}}; \
728 enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
729 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
730 R123_CUDA_DEVICE R123_STATIC_INLINE \
731 threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
732 return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
/*
 * Function-like macros with the same names as the inline wrappers that
 * the _threefry2x_tpl / _threefry4x_tpl macros generate. Each rewrites a
 * call f(c,k) to f_R(f_rounds, c, k), which is the same expression the
 * wrapper bodies return. From this point on, a call spelled
 * threefryNxW(c,k) is expanded by the preprocessor instead of going
 * through the wrapper function.
 * NOTE(review): the reason for shadowing the wrappers is not stated in the
 * visible code - confirm against the full header.
 */
744 #define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
745 #define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
746 #define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
747 #define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
749 #if defined(__cplusplus)
750 #define _threefryNxWclass_tpl(NxW) \
752 template<unsigned int ROUNDS> \
753 struct Threefry##NxW##_R{ \
754 typedef threefry##NxW##_ctr_t ctr_type; \
755 typedef threefry##NxW##_key_t key_type; \
756 typedef threefry##NxW##_key_t ukey_type; \
757 static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
758 inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
759 R123_STATIC_ASSERT(ROUNDS<=72, "threefry is only unrolled up to 72 rounds\n"); \
760 return threefry##NxW##_R(ROUNDS, ctr, key); \
763 typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \