32 #ifndef _philox_dot_h_
33 #define _philox_dot_h_
/* Template: defines mulhilo##W(a, b, hip) using a wider integer type Dword.
 * Both operands are cast to Dword before the multiply, and the function's
 * return value is that product converted back to Word (its low-order bits).
 * Instantiated in this file as (32, uint32_t, uint64_t) and
 * (64, uint64_t, __uint128_t).
 * NOTE(review): the statement that writes through hip is not visible in this
 * excerpt - presumably it stores the upper W bits of product; confirm. */
67 #define _mulhilo_dword_tpl(W, Word, Dword) \
68 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
69 Dword product = ((Dword)a)*((Dword)b); \
71 return (Word)product; \
/* Template: defines mulhilo##W(ax, b, hip) with inline assembly.  INSN is a
 * mnemonic string literal that is concatenated in front of the three-operand
 * list " %0,%1,%2".
 * NOTE(review): a second macro with the same name is defined right after this
 * one; the preprocessor condition choosing between them and this variant's
 * operand constraints are not visible here.  Presumably this is the variant
 * paired with the "mulhwu"/"mulhdu" instantiations - confirm. */
81 #define _mulhilo_asm_tpl(W, Word, INSN) \
82 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
85 INSN " %0,%1,%2\n\t" \
/* Alternative definition of _mulhilo_asm_tpl.  Its asm statement has two
 * outputs: ax, constrained with "=a", and dx, constrained with "=d".
 * NOTE(review): "a" and "d" look like GCC's x86 register constraints, which
 * would fit the "mull"/"mulq" instantiations - confirm against the asm
 * template string, the declaration of dx and the enclosing preprocessor
 * guard, none of which are visible in this excerpt. */
93 #define _mulhilo_asm_tpl(W, Word, INSN) \
94 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
98 : "=a"(ax), "=d"(dx) \
/* Template: defines mulhilo##W(a, b, hip) as a thin wrapper that forwards all
 * three arguments to INTRIN and returns whatever INTRIN returns.
 * Instantiated in this file with _umul128 for W = 64.
 * NOTE(review): this relies on INTRIN itself writing the high word through
 * hip and returning the low word - confirm against the intrinsic's docs. */
111 #define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \
112 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
113 return INTRIN(a, b, hip); \
/* Template: defines mulhilo##W(a, b, hip) on top of a two-argument intrinsic:
 * *hip receives INTRIN(a, b).  Used in this file with __umul64hi, mul_hi and
 * the configurable R123_MULHILO32_MULHI_INTRIN / R123_MULHILO64_MULHI_INTRIN.
 * NOTE(review): INTRIN is presumably a "multiply-high" operation, and the
 * return statement (not visible in this excerpt) presumably yields the
 * truncated product a*b - confirm. */
118 #define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \
119 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word* hip){ \
120 *hip = INTRIN(a, b); \
/* Template: portable mulhilo##W(a, b, hip) that uses only W-bit arithmetic.
 * Each operand is split into two WHALF-bit halves,
 *     a = ahi*2^WHALF + alo,   b = bhi*2^WHALF + blo,
 * so that
 *     a*b = ahi*bhi*2^W + (ahi*blo + alo*bhi)*2^WHALF + alo*blo,
 * and the high word is assembled from the partial products plus carries.
 * NOTE(review): the definition of `lo` and the final store through hip /
 * return are not visible in this excerpt; `lo` is presumably the truncated
 * product a*b - confirm. */
139 #define _mulhilo_c99_tpl(W, Word) \
140 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word *hip){ \
141 const unsigned WHALF = W/2; \
142 const Word LOMASK = ((((Word)1)<<WHALF)-1); \
/* split both operands into upper and lower WHALF-bit halves */ \
144 Word ahi = a>>WHALF; \
145 Word alo = a& LOMASK; \
146 Word bhi = b>>WHALF; \
147 Word blo = b& LOMASK; \
/* the two cross products */ \
149 Word ahbl = ahi*blo; \
150 Word albh = alo*bhi; \
/* sum of the low halves of the cross products; can need WHALF+1 bits */ \
152 Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK)); \
/* high word starts as ahi*bhi plus the high halves of the cross products */ \
153 Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF); \
/* plus the carry out of the low-half sum above */ \
154 hi += ahbl_albh >> WHALF; \
/* plus 1 when the upper half of lo is smaller than the low half of that */ \
/* sum (presumably detecting wrap-around when it was folded into lo) */ \
156 hi += ((lo >> WHALF) < (ahbl_albh&LOMASK)); \
/* Template: placeholder mulhilo##W whose body is R123_STATIC_ASSERT(0, ...)
 * carrying the message "mulhilo<W> is not implemented on this machine".
 * It is the last alternative in the 64-bit selection chain below.
 * NOTE(review): presumably R123_STATIC_ASSERT turns this into a compile-time
 * failure; its definition is not in this excerpt - confirm. */
166 #define _mulhilo_fail_tpl(W, Word) \
167 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
168 R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \
/* Instantiate mulhilo32 from exactly one of the templates above.  The visible
 * conditions are R123_USE_MULHILO32_ASM and R123_USE_MULHILO32_MULHI_INTRIN;
 * the candidates are the asm template (with "mulhwu" or "mull"), the
 * double-word template using uint64_t, the multiply-high intrinsic template
 * and the portable C99 template.
 * NOTE(review): several #if/#else/#endif lines are not visible in this
 * excerpt, so the exact precedence between the candidates cannot be read off
 * here - confirm against the full file. */
176 #if R123_USE_MULHILO32_ASM
178 _mulhilo_asm_tpl(32, uint32_t,
"mulhwu")
180 _mulhilo_asm_tpl(32, uint32_t,
"mull")
184 _mulhilo_dword_tpl(32, uint32_t, uint64_t)
185 #elif R123_USE_MULHILO32_MULHI_INTRIN
186 _mulhilo_cuda_intrin_tpl(32, uint32_t, R123_MULHILO32_MULHI_INTRIN)
188 _mulhilo_c99_tpl(32, uint32_t)
/* Instantiate mulhilo64, only when R123_USE_PHILOX_64BIT is set.  The chain
 * tries, in the order written: inline asm ("mulhdu" or "mulq"), the MSVC
 * _umul128 intrinsic, the CUDA __umul64hi intrinsic, the OpenCL mul_hi
 * built-in, a user-configured multiply-high intrinsic, the __uint128_t
 * double-word template, the portable C99 template, and finally the
 * always-failing template.
 * NOTE(review): the directive choosing between the two asm instantiations
 * and the closing #else/#endif lines are not visible in this excerpt. */
192 #if R123_USE_PHILOX_64BIT
193 #if R123_USE_MULHILO64_ASM
195 _mulhilo_asm_tpl(64, uint64_t,
"mulhdu")
197 _mulhilo_asm_tpl(64, uint64_t,
"mulq")
199 #elif R123_USE_MULHILO64_MSVC_INTRIN
200 _mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
201 #elif R123_USE_MULHILO64_CUDA_INTRIN
202 _mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
203 #elif R123_USE_MULHILO64_OPENCL_INTRIN
204 _mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
205 #elif R123_USE_MULHILO64_MULHI_INTRIN
206 _mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN)
207 #elif R123_USE_GNU_UINT128
208 _mulhilo_dword_tpl(64, uint64_t, __uint128_t)
209 #elif R123_USE_MULHILO64_C99
210 _mulhilo_c99_tpl(64, uint64_t)
212 _mulhilo_fail_tpl(64, uint64_t)
/* Multiplier constants.  PHILOX_M<N>x<W>_<i> is the first argument passed to
 * mulhilo<W> by the _philox<N>x<W>round functions.  Each one is wrapped in
 * #ifndef, so a definition made before this point takes precedence. */
224 #ifndef PHILOX_M2x64_0
225 #define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
228 #ifndef PHILOX_M4x64_0
229 #define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
232 #ifndef PHILOX_M4x64_1
233 #define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
236 #ifndef PHILOX_M2x32_0
237 #define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
240 #ifndef PHILOX_M4x32_0
241 #define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
243 #ifndef PHILOX_M4x32_1
244 #define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
/* Key increments: PHILOX_W<W>_0 / PHILOX_W<W>_1 are added to key.v[0] /
 * key.v[1] by the bumpkey functions.  Each 32-bit value equals the upper 32
 * bits of the corresponding 64-bit value. */
248 #define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15)
251 #define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B)
255 #define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
258 #define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
/* Default round counts (10 for every variant), overridable by defining the
 * macro beforehand.  They become the philox<N>x<W>_rounds enumerators. */
262 #ifndef PHILOX2x32_DEFAULT_ROUNDS
263 #define PHILOX2x32_DEFAULT_ROUNDS 10
266 #ifndef PHILOX2x64_DEFAULT_ROUNDS
267 #define PHILOX2x64_DEFAULT_ROUNDS 10
270 #ifndef PHILOX4x32_DEFAULT_ROUNDS
271 #define PHILOX4x32_DEFAULT_ROUNDS 10
274 #ifndef PHILOX4x64_DEFAULT_ROUNDS
275 #define PHILOX4x64_DEFAULT_ROUNDS 10
/* Template: declares (wrapped in R123_FORCE_INLINE) and then defines
 * _philox2x##W##round(ctr, key), one round on a 2-word counter.
 * ctr.v[0] is multiplied by PHILOX_M2x##W##_0 via mulhilo##W, which returns
 * lo and writes hi through its pointer argument.  The new state is
 *     out = { hi ^ key.v[0] ^ ctr.v[1],  lo }.
 * NOTE(review): the declaration of hi and the return statement are not
 * visible in this excerpt; presumably the function returns out - confirm. */
281 #define _philox2xWround_tpl(W, T) \
282 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
283 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \
285 T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \
286 struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}}; \
/* Template: defines _philox2x##W##bumpkey(key), which adds PHILOX_W##W##_0
 * to the single key word.  key is taken by value, so the addition acts on
 * the function's own copy.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the updated key is returned - confirm. */
289 #define _philox2xWbumpkey_tpl(W) \
290 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \
291 key.v[0] += PHILOX_W##W##_0; \
/* Template: declares (wrapped in R123_FORCE_INLINE) and then defines
 * _philox4x##W##round(ctr, key), one round on a 4-word counter with a
 * 2-word key.  Two multiplies are performed:
 *     (hi0, lo0) from PHILOX_M4x##W##_0 * ctr.v[0]
 *     (hi1, lo1) from PHILOX_M4x##W##_1 * ctr.v[2]
 * and the new state is
 *     out = { hi1 ^ ctr.v[1] ^ key.v[0],  lo1,
 *             hi0 ^ ctr.v[3] ^ key.v[1],  lo0 }.
 * NOTE(review): the declarations of hi0/hi1 and the return statement are not
 * visible in this excerpt; presumably the function returns out - confirm. */
295 #define _philox4xWround_tpl(W, T) \
296 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
297 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \
300 T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \
301 T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \
302 struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, \
303 hi0^ctr.v[3]^key.v[1], lo0}}; \
/* Template: defines _philox4x##W##bumpkey(key), which adds PHILOX_W##W##_0
 * to key.v[0] and PHILOX_W##W##_1 to key.v[1].  key is taken by value, so
 * the additions act on the function's own copy.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the updated key is returned - confirm. */
307 #define _philox4xWbumpkey_tpl(W) \
308 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \
309 key.v[0] += PHILOX_W##W##_0; \
310 key.v[1] += PHILOX_W##W##_1; \
/* Template: generates the public C interface for philox<N>x<W>.
 *   - enumerator philox<N>x<W>_rounds = PHILOX<N>x<W>_DEFAULT_ROUNDS
 *   - philox<N>x<W>_ctr_t is struct r123array<N>x<W>; _key_t and _ukey_t are
 *     both struct r123array<Nhalf>x<W>
 *   - philox<N>x<W>keyinit(uk) returns uk unchanged
 *   - philox<N>x<W>_R(R, ctr, key) applies R rounds to ctr; R123_ASSERT
 *     checks R <= 16.  The first round uses key exactly as passed in; every
 *     later round bumps the key first and then applies the round function.
 *     The sequence is written as 16 separate if(R>i) statements rather than
 *     a loop.
 * NOTE(review): the return statement of philox<N>x<W>_R is not visible in
 * this excerpt; presumably it returns ctr - confirm. */
315 #define _philoxNxW_tpl(N, Nhalf, W, T) \
317 enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \
318 typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \
319 typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \
320 typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \
321 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
322 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
323 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
324 R123_ASSERT(R<=16); \
325 if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } \
326 if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
327 if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
328 if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
329 if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
330 if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
331 if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
332 if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
333 if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
334 if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
335 if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
336 if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
337 if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
338 if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
339 if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
340 if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
/* Expand the bumpkey and round templates for 32-bit words, and again for
 * 64-bit words when R123_USE_PHILOX_64BIT is set.
 * NOTE(review): only some of the instantiation lines are visible in this
 * excerpt; the remaining template expansions and the closing #endif are
 * presumably on the lines in between - confirm against the full file. */
345 _philox4xWbumpkey_tpl(32)
346 _philox2xWround_tpl(32, uint32_t)
351 #if R123_USE_PHILOX_64BIT
354 _philox4xWbumpkey_tpl(64)
355 _philox2xWround_tpl(64, uint64_t)
/* Convenience macros: philox<N>x<W>(c,k) calls philox<N>x<W>_R with the
 * default round count philox<N>x<W>_rounds.  The 64-bit forms are defined
 * only when R123_USE_PHILOX_64BIT is set. */
362 #define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k)
363 #define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k)
364 #if R123_USE_PHILOX_64BIT
365 #define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k)
366 #define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k)
/* C++ only.  Template macro generating a function-object class template
 * Philox<N>x<W>_R<ROUNDS>:
 *   - ctr_type is CType; key_type and ukey_type are both KType
 *   - static member rounds holds ROUNDS
 *   - operator()(ctr, key) static-asserts ROUNDS <= 16 (the bound the C
 *     function is unrolled to) and forwards to philox<N>x<W>_R(ROUNDS, ctr, key)
 * Philox<N>x<W> is a typedef for the instantiation with the default round
 * count philox<N>x<W>_rounds.
 * NOTE(review): the closing braces of the class and the instantiations of
 * this macro are not visible in this excerpt. */
369 #if defined(__cplusplus)
371 #define _PhiloxNxW_base_tpl(CType, KType, N, W) \
373 template<unsigned int ROUNDS> \
374 struct Philox##N##x##W##_R{ \
375 typedef CType ctr_type; \
376 typedef KType key_type; \
377 typedef KType ukey_type; \
378 static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
379 inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \
380 R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \
381 return philox##N##x##W##_R(ROUNDS, ctr, key); \
384 typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
389 #if R123_USE_PHILOX_64BIT