Random123
threefry.h
Go to the documentation of this file.
1 /*
2 Copyright 2010-2011, D. E. Shaw Research.
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
7 met:
8 
9 * Redistributions of source code must retain the above copyright
10  notice, this list of conditions, and the following disclaimer.
11 
12 * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions, and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16 * Neither the name of D. E. Shaw Research nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 #ifndef _threefry_dot_h_
33 #define _threefry_dot_h_
35 #include "array.h"
36 
38 /* Significant parts of this file were copied
39  from:
40  Skein_FinalRnd/ReferenceImplementation/skein.h
41  Skein_FinalRnd/ReferenceImplementation/skein_block.c
42 
43  in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
44 
45  This file has been modified so that it may no longer perform its originally
46  intended function. If you're looking for Skein or Threefish source code,
47  please consult the original file.
48 
49  The original file had the following header:
50 **************************************************************************
51 **
52 ** Interface declarations and internal definitions for Skein hashing.
53 **
54 ** Source code author: Doug Whiting, 2008.
55 **
56 ** This algorithm and source code is released to the public domain.
57 **
58 ***************************************************************************
59 
60 */
61 
62 /* See comment at the top of philox.h for the macro pre-process
63  strategy. */
64 
65 /* Rotation constants: */
/**
 * Rotation constants for the four-word, 64-bit Threefry variant.
 *
 * Names follow R_64x4_<row>_<col>.  In _threefry4x_tpl, round n uses row
 * (n modulo 8); column 0 and column 1 are the two rotations performed in
 * that round.
 */
enum r123_enum_threefry64x4 {
    /* These are the R_256 constants from the Threefish reference sources
       with names changed to R_64x4... */
    R_64x4_0_0 = 14, R_64x4_0_1 = 16,
    R_64x4_1_0 = 52, R_64x4_1_1 = 57,
    R_64x4_2_0 = 23, R_64x4_2_1 = 40,
    R_64x4_3_0 =  5, R_64x4_3_1 = 37,
    R_64x4_4_0 = 25, R_64x4_4_1 = 33,
    R_64x4_5_0 = 46, R_64x4_5_1 = 12,
    R_64x4_6_0 = 58, R_64x4_6_1 = 22,
    R_64x4_7_0 = 32, R_64x4_7_1 = 32
};
78 
/**
 * Rotation constants for the two-word, 64-bit Threefry variant.
 *
 * In _threefry2x_tpl, round n rotates by R_64x2_<n modulo 8>_0.
 * The surrounding comments are the recorded output of the rotation search
 * that produced the table.
 */
enum r123_enum_threefry64x2 {
    /*
    // Output from skein_rot_search: (srs64_B64-X1000)
    // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
    // Start: Tue Mar 1 10:07:48 2011
    // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
    */
    R_64x2_0_0 = 16,
    R_64x2_1_0 = 42,
    R_64x2_2_0 = 12,
    R_64x2_3_0 = 31,
    R_64x2_4_0 = 16,
    R_64x2_5_0 = 32,
    R_64x2_6_0 = 24,
    R_64x2_7_0 = 21
    /* 4 rounds: minHW = 4 [ 4 4 4 4 ]
    // 5 rounds: minHW = 8 [ 8 8 8 8 ]
    // 6 rounds: minHW = 16 [ 16 16 16 16 ]
    // 7 rounds: minHW = 32 [ 32 32 32 32 ]
    // 8 rounds: minHW = 64 [ 64 64 64 64 ]
    // 9 rounds: minHW = 64 [ 64 64 64 64 ]
    //10 rounds: minHW = 64 [ 64 64 64 64 ]
    //11 rounds: minHW = 64 [ 64 64 64 64 ] */
};
103 
/**
 * Rotation constants for the four-word, 32-bit Threefry variant.
 *
 * Names follow R_32x4_<row>_<col>.  In _threefry4x_tpl, round n uses row
 * (n modulo 8); column 0 and column 1 are the two rotations performed in
 * that round.  The surrounding comments are the recorded output of the
 * rotation search that produced the table.
 */
enum r123_enum_threefry32x4 {
    /* Output from skein_rot_search: (srs-B128-X5000.out)
    // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
    // Start: Mon Aug 24 22:41:36 2009
    // ...
    // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */
    R_32x4_0_0 = 10, R_32x4_0_1 = 26,
    R_32x4_1_0 = 11, R_32x4_1_1 = 21,
    R_32x4_2_0 = 13, R_32x4_2_1 = 27,
    R_32x4_3_0 = 23, R_32x4_3_1 =  5,
    R_32x4_4_0 =  6, R_32x4_4_1 = 20,
    R_32x4_5_0 = 17, R_32x4_5_1 = 11,
    R_32x4_6_0 = 25, R_32x4_6_1 = 10,
    R_32x4_7_0 = 18, R_32x4_7_1 = 20

    /* 4 rounds: minHW = 3 [ 3 3 3 3 ]
    // 5 rounds: minHW = 7 [ 7 7 7 7 ]
    // 6 rounds: minHW = 12 [ 13 12 13 12 ]
    // 7 rounds: minHW = 22 [ 22 23 22 23 ]
    // 8 rounds: minHW = 31 [ 31 31 31 31 ]
    // 9 rounds: minHW = 32 [ 32 32 32 32 ]
    //10 rounds: minHW = 32 [ 32 32 32 32 ]
    //11 rounds: minHW = 32 [ 32 32 32 32 ] */

};
129 
/**
 * Rotation constants for the two-word, 32-bit Threefry variant.
 *
 * In _threefry2x_tpl, round n rotates by R_32x2_<n modulo 8>_0.
 * The surrounding comments are the recorded output of the rotation search
 * that produced the table.
 */
enum r123_enum_threefry32x2 {
    /* Output from skein_rot_search (srs32x2-X5000.out)
    // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
    // Start: Tue Jul 12 11:11:33 2011
    // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */
    R_32x2_0_0 = 13,
    R_32x2_1_0 = 15,
    R_32x2_2_0 = 26,
    R_32x2_3_0 =  6,
    R_32x2_4_0 = 17,
    R_32x2_5_0 = 29,
    R_32x2_6_0 = 16,
    R_32x2_7_0 = 24

    /* 4 rounds: minHW = 4 [ 4 4 4 4 ]
    // 5 rounds: minHW = 6 [ 6 8 6 8 ]
    // 6 rounds: minHW = 9 [ 9 12 9 12 ]
    // 7 rounds: minHW = 16 [ 16 24 16 24 ]
    // 8 rounds: minHW = 32 [ 32 32 32 32 ]
    // 9 rounds: minHW = 32 [ 32 32 32 32 ]
    //10 rounds: minHW = 32 [ 32 32 32 32 ]
    //11 rounds: minHW = 32 [ 32 32 32 32 ] */
};
153 
/**
 * Word counts of the two block shapes handled in this file (2 and 4 words).
 * Within this header the names appear only in comments such as
 * "X[WCNT4-1] += r", i.e. the index of the last word of a 4-word block.
 */
enum r123_enum_threefry_wcnt {
    WCNT2 = 2,
    WCNT4 = 4
};
158 
#if R123_USE_64BIT
/**
 * Rotate the 64-bit value x left by N bit positions.
 *
 * Both shift counts are masked with 63, so neither shift can reach the full
 * word width; when N is 0 (or any multiple of 64) both shifts are by zero
 * and the result is x itself.
 *
 * @param x  value to rotate
 * @param N  rotation distance in bits; only its low six bits matter
 * @return   x rotated left by (N modulo 64) bits
 */
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
{
    return (x << (N & 63)) | (x >> ((64-N) & 63));
}
#endif
166 
167 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
168 R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
169 {
170  return (x << (N & 31)) | (x >> ((32-N) & 31));
171 }
172 
/* Assemble a 64-bit constant from its upper (hi32) and lower (lo32) halves. */
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
/* Key-schedule parity constants.  The _tpl macros below select one by word
   width via SKEIN_KS_PARITY##W and XOR all key words into it to form the
   extra key-schedule word. */
#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
#define SKEIN_KS_PARITY32 0x1BD11BDA
176 
/* Default round counts for the non-_R entry points (threefryNxW and the
   threefryNxW_rounds enumerators).  Each may be overridden by defining the
   macro before this header is included. */
#ifndef THREEFRY2x32_DEFAULT_ROUNDS
#define THREEFRY2x32_DEFAULT_ROUNDS 20
#endif

#ifndef THREEFRY2x64_DEFAULT_ROUNDS
#define THREEFRY2x64_DEFAULT_ROUNDS 20
#endif

#ifndef THREEFRY4x32_DEFAULT_ROUNDS
#define THREEFRY4x32_DEFAULT_ROUNDS 20
#endif

#ifndef THREEFRY4x64_DEFAULT_ROUNDS
#define THREEFRY4x64_DEFAULT_ROUNDS 20
#endif
194 
/*
 * Helper for _threefry2x_tpl: round number n (counting from 0) of the
 * two-word cipher, executed only when more than n rounds were requested.
 * d is the rotation-table row, i.e. the constant used is R_<W>x2_<d>_0.
 * Operates on the locals X0, X1 and Nrounds of the enclosing function.
 * Do not #undef: it is expanded every time _threefry2x_tpl is expanded.
 */
#define _threefry2x_round(W, n, d) \
    if(Nrounds>(n)){ X0 += X1; X1 = RotL_##W(X1,R_##W##x2_##d##_0); X1 ^= X0; }

/*
 * Helper for _threefry2x_tpl: key injection number r, performed right after
 * round n (so only when more than n rounds were requested).  ka and kb are
 * the key-schedule words added to X0 and X1; r is then added to the last
 * word, X[2-1].
 */
#define _threefry2x_inject(n, ka, kb, r) \
    if(Nrounds>(n)){ X0 += ka; X1 += kb; X1 += (r); }

/*
 * _threefry2x_tpl(W) defines, for word width W (32 or 64):
 *   - the typedefs threefry2x<W>_ctr_t, _key_t and _ukey_t (all the same
 *     two-word array struct),
 *   - threefry2x<W>keyinit(uk), which returns the user key unchanged,
 *   - threefry2x<W>_R(Nrounds, in, k), the cipher with a caller-chosen
 *     number of rounds (asserted to be at most 32, the unrolled depth),
 *   - the enumerator threefry2x<W>_rounds and threefry2x<W>(in, k), which
 *     runs the default number of rounds.
 *
 * Key schedule: ks0 and ks1 are the key words and ks2 is the parity
 * constant XORed with both.  The key is added to the counter before round
 * 0; after every fourth round, injection r adds ks[r mod 3] to X0,
 * ks[(r+1) mod 3] to X1, and then r to X1.  Round n uses rotation row
 * (n mod 8).
 */
#define _threefry2x_tpl(W) \
typedef struct r123array2x##W threefry2x##W##_ctr_t; \
typedef struct r123array2x##W threefry2x##W##_key_t; \
typedef struct r123array2x##W threefry2x##W##_ukey_t; \
R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
    uint##W##_t X0,X1; \
    uint##W##_t ks0, ks1, ks2; \
    R123_ASSERT(Nrounds<=32); \
    /* Initial key addition; ks2 accumulates parity ^ ks0 ^ ks1. */ \
    ks2 = SKEIN_KS_PARITY##W; \
    ks0 = k.v[0]; \
    X0 = in.v[0] + ks0; \
    ks2 ^= ks0; \
    \
    ks1 = k.v[1]; \
    X1 = in.v[1] + ks1; \
    ks2 ^= ks1; \
    \
    _threefry2x_round(W,  0, 0) \
    _threefry2x_round(W,  1, 1) \
    _threefry2x_round(W,  2, 2) \
    _threefry2x_round(W,  3, 3) \
    _threefry2x_inject(3, ks1, ks2, 1) \
    \
    _threefry2x_round(W,  4, 4) \
    _threefry2x_round(W,  5, 5) \
    _threefry2x_round(W,  6, 6) \
    _threefry2x_round(W,  7, 7) \
    _threefry2x_inject(7, ks2, ks0, 2) \
    \
    _threefry2x_round(W,  8, 0) \
    _threefry2x_round(W,  9, 1) \
    _threefry2x_round(W, 10, 2) \
    _threefry2x_round(W, 11, 3) \
    _threefry2x_inject(11, ks0, ks1, 3) \
    \
    _threefry2x_round(W, 12, 4) \
    _threefry2x_round(W, 13, 5) \
    _threefry2x_round(W, 14, 6) \
    _threefry2x_round(W, 15, 7) \
    _threefry2x_inject(15, ks1, ks2, 4) \
    \
    _threefry2x_round(W, 16, 0) \
    _threefry2x_round(W, 17, 1) \
    _threefry2x_round(W, 18, 2) \
    _threefry2x_round(W, 19, 3) \
    _threefry2x_inject(19, ks2, ks0, 5) \
    \
    _threefry2x_round(W, 20, 4) \
    _threefry2x_round(W, 21, 5) \
    _threefry2x_round(W, 22, 6) \
    _threefry2x_round(W, 23, 7) \
    _threefry2x_inject(23, ks0, ks1, 6) \
    \
    _threefry2x_round(W, 24, 0) \
    _threefry2x_round(W, 25, 1) \
    _threefry2x_round(W, 26, 2) \
    _threefry2x_round(W, 27, 3) \
    _threefry2x_inject(27, ks1, ks2, 7) \
    \
    _threefry2x_round(W, 28, 4) \
    _threefry2x_round(W, 29, 5) \
    _threefry2x_round(W, 30, 6) \
    _threefry2x_round(W, 31, 7) \
    _threefry2x_inject(31, ks2, ks0, 8) \
    \
    threefry2x##W##_ctr_t ret={{X0, X1}}; \
    return ret; \
} \
 \
enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
    return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
}
297 
298 
/*
 * Helpers for _threefry4x_tpl.  All of them operate on the locals X0..X3
 * and Nrounds of the enclosing function, and must stay defined because they
 * are expanded every time _threefry4x_tpl is expanded.
 *
 * _threefry4x_round_even: round number n (counting from 0) for even n;
 *   mixes the pairs (X0,X1) and (X2,X3) using rotation-table row d.
 * _threefry4x_round_odd:  round number n for odd n; mixes the pairs
 *   (X0,X3) and (X2,X1) using rotation-table row d.
 * _threefry4x_inject:     key injection number r, performed right after
 *   round n; adds ka..kd to X0..X3 and then r to the last word, X[WCNT4-1].
 * Each helper does nothing unless more than n rounds were requested.
 */
#define _threefry4x_round_even(W, n, d) \
    if(Nrounds>(n)){ \
        X0 += X1; X1 = RotL_##W(X1,R_##W##x4_##d##_0); X1 ^= X0; \
        X2 += X3; X3 = RotL_##W(X3,R_##W##x4_##d##_1); X3 ^= X2; \
    }
#define _threefry4x_round_odd(W, n, d) \
    if(Nrounds>(n)){ \
        X0 += X3; X3 = RotL_##W(X3,R_##W##x4_##d##_0); X3 ^= X0; \
        X2 += X1; X1 = RotL_##W(X1,R_##W##x4_##d##_1); X1 ^= X2; \
    }
#define _threefry4x_inject(n, ka, kb, kc, kd, r) \
    if(Nrounds>(n)){ X0 += ka; X1 += kb; X2 += kc; X3 += kd; X3 += (r); }

/*
 * _threefry4x_tpl(W) defines, for word width W (32 or 64):
 *   - the typedefs threefry4x<W>_ctr_t, _key_t and _ukey_t (all the same
 *     four-word array struct),
 *   - threefry4x<W>keyinit(uk), which returns the user key unchanged,
 *   - threefry4x<W>_R(Nrounds, in, k), the cipher with a caller-chosen
 *     number of rounds (asserted to be at most 72, the unrolled depth),
 *   - the enumerator threefry4x<W>_rounds and threefry4x<W>(in, k), which
 *     runs the default number of rounds.
 *
 * Key schedule: ks0..ks3 are the key words and ks4 is the parity constant
 * XORed with all four.  The key is added to the counter before round 0;
 * after every fourth round, injection r adds ks[(r+i) mod 5] to X<i> for
 * i = 0..3, and then r to X3.  Round n uses rotation row (n mod 8).
 */
#define _threefry4x_tpl(W) \
typedef struct r123array4x##W threefry4x##W##_ctr_t; \
typedef struct r123array4x##W threefry4x##W##_key_t; \
typedef struct r123array4x##W threefry4x##W##_ukey_t; \
R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
    uint##W##_t X0, X1, X2, X3; \
    uint##W##_t ks0, ks1, ks2, ks3, ks4; \
    R123_ASSERT(Nrounds<=72); \
    /* Initial key addition; ks4 accumulates parity ^ ks0 ^ ks1 ^ ks2 ^ ks3. */ \
    ks4 = SKEIN_KS_PARITY##W; \
    ks0 = k.v[0]; \
    X0 = in.v[0] + ks0; \
    ks4 ^= ks0; \
    \
    ks1 = k.v[1]; \
    X1 = in.v[1] + ks1; \
    ks4 ^= ks1; \
    \
    ks2 = k.v[2]; \
    X2 = in.v[2] + ks2; \
    ks4 ^= ks2; \
    \
    ks3 = k.v[3]; \
    X3 = in.v[3] + ks3; \
    ks4 ^= ks3; \
    \
    _threefry4x_round_even(W,  0, 0) \
    _threefry4x_round_odd (W,  1, 1) \
    _threefry4x_round_even(W,  2, 2) \
    _threefry4x_round_odd (W,  3, 3) \
    _threefry4x_inject(3, ks1, ks2, ks3, ks4, 1) \
    \
    _threefry4x_round_even(W,  4, 4) \
    _threefry4x_round_odd (W,  5, 5) \
    _threefry4x_round_even(W,  6, 6) \
    _threefry4x_round_odd (W,  7, 7) \
    _threefry4x_inject(7, ks2, ks3, ks4, ks0, 2) \
    \
    _threefry4x_round_even(W,  8, 0) \
    _threefry4x_round_odd (W,  9, 1) \
    _threefry4x_round_even(W, 10, 2) \
    _threefry4x_round_odd (W, 11, 3) \
    _threefry4x_inject(11, ks3, ks4, ks0, ks1, 3) \
    \
    _threefry4x_round_even(W, 12, 4) \
    _threefry4x_round_odd (W, 13, 5) \
    _threefry4x_round_even(W, 14, 6) \
    _threefry4x_round_odd (W, 15, 7) \
    _threefry4x_inject(15, ks4, ks0, ks1, ks2, 4) \
    \
    _threefry4x_round_even(W, 16, 0) \
    _threefry4x_round_odd (W, 17, 1) \
    _threefry4x_round_even(W, 18, 2) \
    _threefry4x_round_odd (W, 19, 3) \
    _threefry4x_inject(19, ks0, ks1, ks2, ks3, 5) \
    \
    _threefry4x_round_even(W, 20, 4) \
    _threefry4x_round_odd (W, 21, 5) \
    _threefry4x_round_even(W, 22, 6) \
    _threefry4x_round_odd (W, 23, 7) \
    _threefry4x_inject(23, ks1, ks2, ks3, ks4, 6) \
    \
    _threefry4x_round_even(W, 24, 0) \
    _threefry4x_round_odd (W, 25, 1) \
    _threefry4x_round_even(W, 26, 2) \
    _threefry4x_round_odd (W, 27, 3) \
    _threefry4x_inject(27, ks2, ks3, ks4, ks0, 7) \
    \
    _threefry4x_round_even(W, 28, 4) \
    _threefry4x_round_odd (W, 29, 5) \
    _threefry4x_round_even(W, 30, 6) \
    _threefry4x_round_odd (W, 31, 7) \
    _threefry4x_inject(31, ks3, ks4, ks0, ks1, 8) \
    \
    _threefry4x_round_even(W, 32, 0) \
    _threefry4x_round_odd (W, 33, 1) \
    _threefry4x_round_even(W, 34, 2) \
    _threefry4x_round_odd (W, 35, 3) \
    _threefry4x_inject(35, ks4, ks0, ks1, ks2, 9) \
    \
    _threefry4x_round_even(W, 36, 4) \
    _threefry4x_round_odd (W, 37, 5) \
    _threefry4x_round_even(W, 38, 6) \
    _threefry4x_round_odd (W, 39, 7) \
    _threefry4x_inject(39, ks0, ks1, ks2, ks3, 10) \
    \
    _threefry4x_round_even(W, 40, 0) \
    _threefry4x_round_odd (W, 41, 1) \
    _threefry4x_round_even(W, 42, 2) \
    _threefry4x_round_odd (W, 43, 3) \
    _threefry4x_inject(43, ks1, ks2, ks3, ks4, 11) \
    \
    _threefry4x_round_even(W, 44, 4) \
    _threefry4x_round_odd (W, 45, 5) \
    _threefry4x_round_even(W, 46, 6) \
    _threefry4x_round_odd (W, 47, 7) \
    _threefry4x_inject(47, ks2, ks3, ks4, ks0, 12) \
    \
    _threefry4x_round_even(W, 48, 0) \
    _threefry4x_round_odd (W, 49, 1) \
    _threefry4x_round_even(W, 50, 2) \
    _threefry4x_round_odd (W, 51, 3) \
    _threefry4x_inject(51, ks3, ks4, ks0, ks1, 13) \
    \
    _threefry4x_round_even(W, 52, 4) \
    _threefry4x_round_odd (W, 53, 5) \
    _threefry4x_round_even(W, 54, 6) \
    _threefry4x_round_odd (W, 55, 7) \
    _threefry4x_inject(55, ks4, ks0, ks1, ks2, 14) \
    \
    _threefry4x_round_even(W, 56, 0) \
    _threefry4x_round_odd (W, 57, 1) \
    _threefry4x_round_even(W, 58, 2) \
    _threefry4x_round_odd (W, 59, 3) \
    _threefry4x_inject(59, ks0, ks1, ks2, ks3, 15) \
    \
    _threefry4x_round_even(W, 60, 4) \
    _threefry4x_round_odd (W, 61, 5) \
    _threefry4x_round_even(W, 62, 6) \
    _threefry4x_round_odd (W, 63, 7) \
    _threefry4x_inject(63, ks1, ks2, ks3, ks4, 16) \
    \
    _threefry4x_round_even(W, 64, 0) \
    _threefry4x_round_odd (W, 65, 1) \
    _threefry4x_round_even(W, 66, 2) \
    _threefry4x_round_odd (W, 67, 3) \
    _threefry4x_inject(67, ks2, ks3, ks4, ks0, 17) \
    \
    _threefry4x_round_even(W, 68, 4) \
    _threefry4x_round_odd (W, 69, 5) \
    _threefry4x_round_even(W, 70, 6) \
    _threefry4x_round_odd (W, 71, 7) \
    _threefry4x_inject(71, ks3, ks4, ks0, ks1, 18) \
    \
    threefry4x##W##_ctr_t ret = {{X0, X1, X2, X3}}; \
    return ret; \
} \
 \
 \
enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
R123_CUDA_DEVICE R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
    return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
}
734 
735 #if R123_USE_64BIT
738 #endif
741 
/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
   than a static inline function. Why?
   These function-like macros shadow the same-named inline functions that
   the _tpl macros define; each forwards to the matching _R function with
   the default round count (threefryNxW_rounds). */
#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
748 
749 #if defined(__cplusplus)
/*
 * _threefryNxWclass_tpl(NxW) defines, inside namespace r123, the functor
 * template Threefry<NxW>_R<ROUNDS> and the typedef Threefry<NxW>, which is
 * that template instantiated with threefry<NxW>_rounds.
 *
 * The functor exposes ctr_type, key_type and ukey_type, the compile-time
 * constant `rounds`, and operator()(ctr, key), which forwards to the C
 * function threefry<NxW>_R(ROUNDS, ctr, key).
 *
 * NOTE(review): the static assertion accepts ROUNDS up to 72 for every
 * NxW, but the 2x variants' _R function asserts Nrounds<=32 and only
 * unrolls 32 rounds.  A 2x instantiation with 32 < ROUNDS <= 72 therefore
 * passes the compile-time check yet trips R123_ASSERT at run time --
 * confirm whether a per-variant limit is wanted here.
 */
#define _threefryNxWclass_tpl(NxW) \
namespace r123{ \
template<unsigned int ROUNDS> \
 struct Threefry##NxW##_R{ \
    typedef threefry##NxW##_ctr_t ctr_type; \
    typedef threefry##NxW##_key_t key_type; \
    typedef threefry##NxW##_key_t ukey_type; \
    static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
    inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
        R123_STATIC_ASSERT(ROUNDS<=72, "threefry is only unrolled up to 72 rounds\n"); \
        return threefry##NxW##_R(ROUNDS, ctr, key); \
    } \
}; \
 typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
} // namespace r123
765 
768 #if R123_USE_64BIT
771 #endif
772 
773 /* The _tpl macros don't quite work to do string-pasting inside comments,
774  so we just write out the boilerplate documentation four times... */
775 
872 #endif
873 
874 #endif
_threefryNxWclass_tpl
#define _threefryNxWclass_tpl(NxW)
Definition: threefry.h:750
array.h
_threefry4x_tpl
#define _threefry4x_tpl(W)
Definition: threefry.h:299
_threefry2x_tpl
#define _threefry2x_tpl(W)
Definition: threefry.h:195
compilerfeatures.h