Random123
philox.h
Go to the documentation of this file.
1 /*
2 Copyright 2010-2011, D. E. Shaw Research.
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
7 met:
8 
9 * Redistributions of source code must retain the above copyright
10  notice, this list of conditions, and the following disclaimer.
11 
12 * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions, and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16 * Neither the name of D. E. Shaw Research nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 #ifndef _philox_dot_h_
33 #define _philox_dot_h_
34 
38 #include "array.h"
39 
40 
41 /*
42 // Macros _Foo_tpl are code generation 'templates' They define
43 // inline functions with names obtained by mangling Foo and the
44 // macro arguments. E.g.,
45 // _mulhilo_tpl(32, uint32_t, uint64_t)
46 // expands to a definition of:
47 // mulhilo32(uint32_t, uint32_t, uint32_t *, uint32_t *)
48 // We then 'instantiate the template' to define
49 // several different functions, e.g.,
50 // mulhilo32
51 // mulhilo64
52 // These functions will be visible to user code, and may
53 // also be used later in subsequent templates and definitions.
54 
55 // A template for mulhilo using a temporary of twice the word-width.
56 // Gcc figures out that this can be reduced to a single 'mul' instruction,
57 // despite the apparent use of double-wide variables, shifts, etc. It's
58 // obviously not guaranteed that all compilers will be that smart, so
59 // other implementations might be preferable, e.g., using an intrinsic
60 // or an asm block. On the other hand, for 32-bit multiplies,
61 // this *is* perfectly standard C99 - any C99 compiler should
62 // understand it and produce correct code. For 64-bit multiplies,
63 // it's only usable if the compiler recognizes that it can do
64 // arithmetic on a 128-bit type. That happens to be true for gcc on
65 // x86-64, and powerpc64 but not much else.
66 */
67 #define _mulhilo_dword_tpl(W, Word, Dword) \
68 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
69  Dword product = ((Dword)a)*((Dword)b); \
70  *hip = product>>W; \
71  return (Word)product; \
72 }
73 
74 /*
75 // A template for mulhilo using gnu-style asm syntax.
76 // INSN can be "mulw", "mull" or "mulq".
77 // FIXME - porting to other architectures, we'll need still-more conditional
78 // branching here. Note that intrinsics are usually preferable.
79 */
80 #ifdef __powerpc__
81 #define _mulhilo_asm_tpl(W, Word, INSN) \
82 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
83  Word dx = 0; \
84  __asm__("\n\t" \
85  INSN " %0,%1,%2\n\t" \
86  : "=r"(dx) \
87  : "r"(b), "r"(ax) \
88  ); \
89  *hip = dx; \
90  return ax*b; \
91 }
92 #else
93 #define _mulhilo_asm_tpl(W, Word, INSN) \
94 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
95  Word dx; \
96  __asm__("\n\t" \
97  INSN " %2\n\t" \
98  : "=a"(ax), "=d"(dx) \
99  : "r"(b), "0"(ax) \
100  ); \
101  *hip = dx; \
102  return ax; \
103 }
104 #endif /* __powerpc__ */
105 
106 /*
107 // A template for mulhilo using MSVC-style intrinsics
108 // For example,_umul128 is an msvc intrinsic, c.f.
109 // http://msdn.microsoft.com/en-us/library/3dayytw9.aspx
110 */
111 #define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \
112 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
113  return INTRIN(a, b, hip); \
114 }
115 
116 /* N.B. This really should be called _mulhilo_mulhi_intrin. It just
117  happens that CUDA was the first time we used the idiom. */
118 #define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \
119 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word* hip){ \
120  *hip = INTRIN(a, b); \
121  return a*b; \
122 }
123 
124 /*
125 // A template for mulhilo using only word-size operations and
126 // C99 operators (no adc, no mulhi). It
127 // requires four multiplies and a dozen or so shifts, adds
128 // and tests. It's *SLOW*. It can be used to
129 // implement philoxNx32 on platforms that completely lack
130 // 64-bit types, e.g., Metal.
131 // On 32-bit platforms, it could be used to
132 // implement philoxNx64, but on such platforms both the philoxNx32
133 // and the threefryNx64 cbrngs are going to have much better
134 // performance. It is enabled below by R123_USE_MULHILO64_C99,
135 // but that is currently (Feb 2019) only set by
136 // features/metalfeatures.h headers. It can, of course, be
137 // set with a compile-time -D option.
138 */
139 #define _mulhilo_c99_tpl(W, Word) \
140 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word *hip){ \
141  const unsigned WHALF = W/2; \
142  const Word LOMASK = ((((Word)1)<<WHALF)-1); \
143  Word lo = a*b; /* full low multiply */ \
144  Word ahi = a>>WHALF; \
145  Word alo = a& LOMASK; \
146  Word bhi = b>>WHALF; \
147  Word blo = b& LOMASK; \
148  \
149  Word ahbl = ahi*blo; \
150  Word albh = alo*bhi; \
151  \
152  Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK)); \
153  Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF); \
154  hi += ahbl_albh >> WHALF; /* carry from the sum of lo(ahbl) + lo(albh) ) */ \
155  /* carry from the sum with alo*blo */ \
156  hi += ((lo >> WHALF) < (ahbl_albh&LOMASK)); \
157  *hip = hi; \
158  return lo; \
159 }
160 
161 /*
162 // A template for mulhilo on a platform that can't do it
163 // We could put a C version here, but is it better to run *VERY*
164 // slowly or to just stop and force the user to find another CBRNG?
165 */
166 #define _mulhilo_fail_tpl(W, Word) \
167 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
168  R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \
169 }
170 
171 /*
172 // N.B. There's an MSVC intrinsic called _emul,
173 // which *might* compile into better code than
174 // _mulhilo_dword_tpl
175 */
176 #if R123_USE_MULHILO32_ASM
177 #ifdef __powerpc__
178 _mulhilo_asm_tpl(32, uint32_t, "mulhwu")
179 #else
180 _mulhilo_asm_tpl(32, uint32_t, "mull")
181 #endif /* __powerpc__ */
182 #else
183 #if R123_USE_64BIT
184 _mulhilo_dword_tpl(32, uint32_t, uint64_t)
185 #elif R123_USE_MULHILO32_MULHI_INTRIN
186 _mulhilo_cuda_intrin_tpl(32, uint32_t, R123_MULHILO32_MULHI_INTRIN)
187 #else
188 _mulhilo_c99_tpl(32, uint32_t)
189 #endif
190 #endif
191 
192 #if R123_USE_PHILOX_64BIT
193 #if R123_USE_MULHILO64_ASM
194 #ifdef __powerpc64__
195 _mulhilo_asm_tpl(64, uint64_t, "mulhdu")
196 #else
197 _mulhilo_asm_tpl(64, uint64_t, "mulq")
198 #endif /* __powerpc64__ */
199 #elif R123_USE_MULHILO64_MSVC_INTRIN
200 _mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
201 #elif R123_USE_MULHILO64_CUDA_INTRIN
202 _mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
203 #elif R123_USE_MULHILO64_OPENCL_INTRIN
204 _mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
205 #elif R123_USE_MULHILO64_MULHI_INTRIN
206 _mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN)
207 #elif R123_USE_GNU_UINT128
208 _mulhilo_dword_tpl(64, uint64_t, __uint128_t)
209 #elif R123_USE_MULHILO64_C99
210 _mulhilo_c99_tpl(64, uint64_t)
211 #else
212 _mulhilo_fail_tpl(64, uint64_t)
213 #endif
214 #endif
215 
216 /*
217 // The multipliers and Weyl constants are "hard coded".
218 // To change them, you can #define them with different
219 // values before #include-ing this file.
220 // This isn't terribly elegant, but it works for C as
221 // well as C++. A nice C++-only solution would be to
222 // use template parameters in the style of <random>
223 */
224 #ifndef PHILOX_M2x64_0
225 #define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
226 #endif
227 
228 #ifndef PHILOX_M4x64_0
229 #define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
230 #endif
231 
232 #ifndef PHILOX_M4x64_1
233 #define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
234 #endif
235 
236 #ifndef PHILOX_M2x32_0
237 #define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
238 #endif
239 
240 #ifndef PHILOX_M4x32_0
241 #define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
242 #endif
243 #ifndef PHILOX_M4x32_1
244 #define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
245 #endif
246 
247 #ifndef PHILOX_W64_0
248 #define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15) /* golden ratio */
249 #endif
250 #ifndef PHILOX_W64_1
251 #define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B) /* sqrt(3)-1 */
252 #endif
253 
254 #ifndef PHILOX_W32_0
255 #define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
256 #endif
257 #ifndef PHILOX_W32_1
258 #define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
259 #endif
260 
262 #ifndef PHILOX2x32_DEFAULT_ROUNDS
263 #define PHILOX2x32_DEFAULT_ROUNDS 10
264 #endif
265 
266 #ifndef PHILOX2x64_DEFAULT_ROUNDS
267 #define PHILOX2x64_DEFAULT_ROUNDS 10
268 #endif
269 
270 #ifndef PHILOX4x32_DEFAULT_ROUNDS
271 #define PHILOX4x32_DEFAULT_ROUNDS 10
272 #endif
273 
274 #ifndef PHILOX4x64_DEFAULT_ROUNDS
275 #define PHILOX4x64_DEFAULT_ROUNDS 10
276 #endif
277 
279 /* The ignored fourth argument allows us to instantiate the
280  same macro regardless of N. */
281 #define _philox2xWround_tpl(W, T) \
282 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
283 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \
284  T hi; \
285  T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \
286  struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}}; \
287  return out; \
288 }
289 #define _philox2xWbumpkey_tpl(W) \
290 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \
291  key.v[0] += PHILOX_W##W##_0; \
292  return key; \
293 }
294 
295 #define _philox4xWround_tpl(W, T) \
296 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
297 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \
298  T hi0; \
299  T hi1; \
300  T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \
301  T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \
302  struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, \
303  hi0^ctr.v[3]^key.v[1], lo0}}; \
304  return out; \
305 }
306 
307 #define _philox4xWbumpkey_tpl(W) \
308 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \
309  key.v[0] += PHILOX_W##W##_0; \
310  key.v[1] += PHILOX_W##W##_1; \
311  return key; \
312 }
313 
315 #define _philoxNxW_tpl(N, Nhalf, W, T) \
316  \
317 enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \
318 typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \
319 typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \
320 typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \
321 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
322 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
323 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
324  R123_ASSERT(R<=16); \
325  if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } \
326  if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
327  if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
328  if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
329  if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
330  if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
331  if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
332  if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
333  if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
334  if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
335  if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
336  if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
337  if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
338  if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
339  if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
340  if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
341  return ctr; \
342 }
343 
345 _philox4xWbumpkey_tpl(32)
346 _philox2xWround_tpl(32, uint32_t) /* philox2x32round */
347 _philox4xWround_tpl(32, uint32_t) /* philo4x32round */
348 
349 _philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32bijection */
350 _philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32bijection */
351 #if R123_USE_PHILOX_64BIT
352 
354 _philox4xWbumpkey_tpl(64)
355 _philox2xWround_tpl(64, uint64_t) /* philo2x64round */
356 _philox4xWround_tpl(64, uint64_t) /* philo4x64round */
358 _philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64bijection */
359 _philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64bijection */
360 #endif /* R123_USE_PHILOX_64BIT */
361 
362 #define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k)
363 #define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k)
364 #if R123_USE_PHILOX_64BIT
365 #define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k)
366 #define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k)
367 #endif /* R123_USE_PHILOX_64BIT */
368 
369 #if defined(__cplusplus)
370 
371 #define _PhiloxNxW_base_tpl(CType, KType, N, W) \
372 namespace r123{ \
373 template<unsigned int ROUNDS> \
374 struct Philox##N##x##W##_R{ \
375  typedef CType ctr_type; \
376  typedef KType key_type; \
377  typedef KType ukey_type; \
378  static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
379  inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \
380  R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \
381  return philox##N##x##W##_R(ROUNDS, ctr, key); \
382  } \
383 }; \
384 typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
385  } // namespace r123
386 
389 #if R123_USE_PHILOX_64BIT
392 #endif
393 
394 /* The _tpl macros don't quite work to do string-pasting inside comments.
395  so we just write out the boilerplate documentation four times... */
396 
491 #endif /* __cplusplus */
492 
493 #endif /* _philox_dot_h_ */
_philox2xWbumpkey_tpl
_philox2xWbumpkey_tpl(32) _philox4xWbumpkey_tpl(32) _philox2xWround_tpl(32
_PhiloxNxW_base_tpl
#define _PhiloxNxW_base_tpl(CType, KType, N, W)
Definition: philox.h:371
array.h
_philox4xWround_tpl
uint32_t _philox4xWround_tpl(32, uint32_t) enum r123_enum_philox2x32
Definition: philox.h:347
r123array1x32
Definition: array.h:314
r123array2x32
Definition: array.h:315
r123array1x64
Definition: array.h:320
r123array2x64
Definition: array.h:321
r123array4x32
Definition: array.h:316
compilerfeatures.h
r123array4x64
Definition: array.h:322
_philoxNxW_tpl
#define _philoxNxW_tpl(N, Nhalf, W, T)
Definition: philox.h:315