1*6467f958SSadaf Ebrahimi /*
2*6467f958SSadaf Ebrahimi A C-program for MT19937, with initialization improved 2002/1/26.
3*6467f958SSadaf Ebrahimi Coded by Takuji Nishimura and Makoto Matsumoto.
4*6467f958SSadaf Ebrahimi
5*6467f958SSadaf Ebrahimi Before using, initialize the state by using init_genrand(seed)
6*6467f958SSadaf Ebrahimi or init_by_array(init_key, key_length).
7*6467f958SSadaf Ebrahimi
8*6467f958SSadaf Ebrahimi Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
9*6467f958SSadaf Ebrahimi All rights reserved.
10*6467f958SSadaf Ebrahimi
11*6467f958SSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
12*6467f958SSadaf Ebrahimi modification, are permitted provided that the following conditions
13*6467f958SSadaf Ebrahimi are met:
14*6467f958SSadaf Ebrahimi
15*6467f958SSadaf Ebrahimi 1. Redistributions of source code must retain the above copyright
16*6467f958SSadaf Ebrahimi notice, this list of conditions and the following disclaimer.
17*6467f958SSadaf Ebrahimi
18*6467f958SSadaf Ebrahimi 2. Redistributions in binary form must reproduce the above copyright
19*6467f958SSadaf Ebrahimi notice, this list of conditions and the following disclaimer in the
20*6467f958SSadaf Ebrahimi documentation and/or other materials provided with the distribution.
21*6467f958SSadaf Ebrahimi
22*6467f958SSadaf Ebrahimi 3. The names of its contributors may not be used to endorse or promote
23*6467f958SSadaf Ebrahimi products derived from this software without specific prior written
24*6467f958SSadaf Ebrahimi permission.
25*6467f958SSadaf Ebrahimi
26*6467f958SSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27*6467f958SSadaf Ebrahimi "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28*6467f958SSadaf Ebrahimi LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29*6467f958SSadaf Ebrahimi A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
30*6467f958SSadaf Ebrahimi OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31*6467f958SSadaf Ebrahimi EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32*6467f958SSadaf Ebrahimi PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33*6467f958SSadaf Ebrahimi PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34*6467f958SSadaf Ebrahimi LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35*6467f958SSadaf Ebrahimi NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36*6467f958SSadaf Ebrahimi SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37*6467f958SSadaf Ebrahimi
38*6467f958SSadaf Ebrahimi
39*6467f958SSadaf Ebrahimi Any feedback is very welcome.
40*6467f958SSadaf Ebrahimi http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
41*6467f958SSadaf Ebrahimi email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
42*6467f958SSadaf Ebrahimi
43*6467f958SSadaf Ebrahimi Modifications for use in OpenCL by Ian Ollmann, Apple Inc.
44*6467f958SSadaf Ebrahimi
45*6467f958SSadaf Ebrahimi */
46*6467f958SSadaf Ebrahimi
47*6467f958SSadaf Ebrahimi #include <stdio.h>
48*6467f958SSadaf Ebrahimi #include <stdlib.h>
49*6467f958SSadaf Ebrahimi #include "mt19937.h"
50*6467f958SSadaf Ebrahimi #include "mingw_compat.h"
51*6467f958SSadaf Ebrahimi #include "harness/alloc.h"
52*6467f958SSadaf Ebrahimi
53*6467f958SSadaf Ebrahimi #ifdef __SSE2__
54*6467f958SSadaf Ebrahimi #include <mutex>
55*6467f958SSadaf Ebrahimi #include <emmintrin.h>
56*6467f958SSadaf Ebrahimi #endif
57*6467f958SSadaf Ebrahimi
/* Period parameters */
#define N 624 /* state size in words; vector code requires multiple of 4 */
#define M 397 /* offset of the second word combined by the recurrence */
#define MATRIX_A ((cl_uint)0x9908b0dfUL) /* constant vector a */
#define UPPER_MASK ((cl_uint)0x80000000UL) /* most significant w-r bits */
#define LOWER_MASK ((cl_uint)0x7fffffffUL) /* least significant r bits */

/* The SSE2 paths step through the state four words at a time. */
static_assert(N % 4 == 0, "N must be a multiple of 4 for the vector code");
64*6467f958SSadaf Ebrahimi
65*6467f958SSadaf Ebrahimi typedef struct _MTdata
66*6467f958SSadaf Ebrahimi {
67*6467f958SSadaf Ebrahimi cl_uint mt[N];
68*6467f958SSadaf Ebrahimi #ifdef __SSE2__
69*6467f958SSadaf Ebrahimi cl_uint cache[N];
70*6467f958SSadaf Ebrahimi #endif
71*6467f958SSadaf Ebrahimi cl_int mti;
72*6467f958SSadaf Ebrahimi } _MTdata;
73*6467f958SSadaf Ebrahimi
74*6467f958SSadaf Ebrahimi /* initializes mt[N] with a seed */
init_genrand(cl_uint s)75*6467f958SSadaf Ebrahimi MTdata init_genrand(cl_uint s)
76*6467f958SSadaf Ebrahimi {
77*6467f958SSadaf Ebrahimi MTdata r = (MTdata)align_malloc(sizeof(_MTdata), 16);
78*6467f958SSadaf Ebrahimi if (NULL != r)
79*6467f958SSadaf Ebrahimi {
80*6467f958SSadaf Ebrahimi cl_uint *mt = r->mt;
81*6467f958SSadaf Ebrahimi int mti = 0;
82*6467f958SSadaf Ebrahimi mt[0] = s; // & 0xffffffffUL;
83*6467f958SSadaf Ebrahimi for (mti = 1; mti < N; mti++)
84*6467f958SSadaf Ebrahimi {
85*6467f958SSadaf Ebrahimi mt[mti] = (cl_uint)(
86*6467f958SSadaf Ebrahimi 1812433253UL * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
87*6467f958SSadaf Ebrahimi /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
88*6467f958SSadaf Ebrahimi /* In the previous versions, MSBs of the seed affect */
89*6467f958SSadaf Ebrahimi /* only MSBs of the array mt[]. */
90*6467f958SSadaf Ebrahimi /* 2002/01/09 modified by Makoto Matsumoto */
91*6467f958SSadaf Ebrahimi // mt[mti] &= 0xffffffffUL;
92*6467f958SSadaf Ebrahimi /* for >32 bit machines */
93*6467f958SSadaf Ebrahimi }
94*6467f958SSadaf Ebrahimi r->mti = mti;
95*6467f958SSadaf Ebrahimi }
96*6467f958SSadaf Ebrahimi
97*6467f958SSadaf Ebrahimi return r;
98*6467f958SSadaf Ebrahimi }
99*6467f958SSadaf Ebrahimi
free_mtdata(MTdata d)100*6467f958SSadaf Ebrahimi void free_mtdata(MTdata d)
101*6467f958SSadaf Ebrahimi {
102*6467f958SSadaf Ebrahimi if (d) align_free(d);
103*6467f958SSadaf Ebrahimi }
104*6467f958SSadaf Ebrahimi
105*6467f958SSadaf Ebrahimi /* generates a random number on [0,0xffffffff]-interval */
genrand_int32(MTdata d)106*6467f958SSadaf Ebrahimi cl_uint genrand_int32(MTdata d)
107*6467f958SSadaf Ebrahimi {
108*6467f958SSadaf Ebrahimi /* mag01[x] = x * MATRIX_A for x=0,1 */
109*6467f958SSadaf Ebrahimi static const cl_uint mag01[2] = { 0x0UL, MATRIX_A };
110*6467f958SSadaf Ebrahimi #ifdef __SSE2__
111*6467f958SSadaf Ebrahimi static std::once_flag init_flag;
112*6467f958SSadaf Ebrahimi static union {
113*6467f958SSadaf Ebrahimi __m128i v;
114*6467f958SSadaf Ebrahimi cl_uint s[4];
115*6467f958SSadaf Ebrahimi } upper_mask, lower_mask, one, matrix_a, c0, c1;
116*6467f958SSadaf Ebrahimi #endif
117*6467f958SSadaf Ebrahimi
118*6467f958SSadaf Ebrahimi
119*6467f958SSadaf Ebrahimi cl_uint *mt = d->mt;
120*6467f958SSadaf Ebrahimi cl_uint y;
121*6467f958SSadaf Ebrahimi
122*6467f958SSadaf Ebrahimi if (d->mti == N)
123*6467f958SSadaf Ebrahimi { /* generate N words at one time */
124*6467f958SSadaf Ebrahimi int kk;
125*6467f958SSadaf Ebrahimi
126*6467f958SSadaf Ebrahimi #ifdef __SSE2__
127*6467f958SSadaf Ebrahimi auto init_fn = []() {
128*6467f958SSadaf Ebrahimi upper_mask.s[0] = upper_mask.s[1] = upper_mask.s[2] =
129*6467f958SSadaf Ebrahimi upper_mask.s[3] = UPPER_MASK;
130*6467f958SSadaf Ebrahimi lower_mask.s[0] = lower_mask.s[1] = lower_mask.s[2] =
131*6467f958SSadaf Ebrahimi lower_mask.s[3] = LOWER_MASK;
132*6467f958SSadaf Ebrahimi one.s[0] = one.s[1] = one.s[2] = one.s[3] = 1;
133*6467f958SSadaf Ebrahimi matrix_a.s[0] = matrix_a.s[1] = matrix_a.s[2] = matrix_a.s[3] =
134*6467f958SSadaf Ebrahimi MATRIX_A;
135*6467f958SSadaf Ebrahimi c0.s[0] = c0.s[1] = c0.s[2] = c0.s[3] = (cl_uint)0x9d2c5680UL;
136*6467f958SSadaf Ebrahimi c1.s[0] = c1.s[1] = c1.s[2] = c1.s[3] = (cl_uint)0xefc60000UL;
137*6467f958SSadaf Ebrahimi };
138*6467f958SSadaf Ebrahimi std::call_once(init_flag, init_fn);
139*6467f958SSadaf Ebrahimi #endif
140*6467f958SSadaf Ebrahimi
141*6467f958SSadaf Ebrahimi kk = 0;
142*6467f958SSadaf Ebrahimi #ifdef __SSE2__
143*6467f958SSadaf Ebrahimi // vector loop
144*6467f958SSadaf Ebrahimi for (; kk + 4 <= N - M; kk += 4)
145*6467f958SSadaf Ebrahimi {
146*6467f958SSadaf Ebrahimi // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
147*6467f958SSadaf Ebrahimi __m128i vy = _mm_or_si128(
148*6467f958SSadaf Ebrahimi _mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)),
149*6467f958SSadaf Ebrahimi upper_mask.v),
150*6467f958SSadaf Ebrahimi _mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)),
151*6467f958SSadaf Ebrahimi lower_mask.v));
152*6467f958SSadaf Ebrahimi
153*6467f958SSadaf Ebrahimi // y & 1 ? -1 : 0
154*6467f958SSadaf Ebrahimi __m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v);
155*6467f958SSadaf Ebrahimi // y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL]
156*6467f958SSadaf Ebrahimi __m128i vmag01 = _mm_and_si128(mask, matrix_a.v);
157*6467f958SSadaf Ebrahimi // mt[kk+M] ^ (y >> 1)
158*6467f958SSadaf Ebrahimi __m128i vr =
159*6467f958SSadaf Ebrahimi _mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M)),
160*6467f958SSadaf Ebrahimi (__m128i)_mm_srli_epi32(vy, 1));
161*6467f958SSadaf Ebrahimi // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
162*6467f958SSadaf Ebrahimi vr = _mm_xor_si128(vr, vmag01);
163*6467f958SSadaf Ebrahimi _mm_store_si128((__m128i *)(mt + kk), vr);
164*6467f958SSadaf Ebrahimi }
165*6467f958SSadaf Ebrahimi #endif
166*6467f958SSadaf Ebrahimi for (; kk < N - M; kk++)
167*6467f958SSadaf Ebrahimi {
168*6467f958SSadaf Ebrahimi y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
169*6467f958SSadaf Ebrahimi mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
170*6467f958SSadaf Ebrahimi }
171*6467f958SSadaf Ebrahimi
172*6467f958SSadaf Ebrahimi #ifdef __SSE2__
173*6467f958SSadaf Ebrahimi // advance to next aligned location
174*6467f958SSadaf Ebrahimi for (; kk < N - 1 && (kk & 3); kk++)
175*6467f958SSadaf Ebrahimi {
176*6467f958SSadaf Ebrahimi y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
177*6467f958SSadaf Ebrahimi mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
178*6467f958SSadaf Ebrahimi }
179*6467f958SSadaf Ebrahimi
180*6467f958SSadaf Ebrahimi // vector loop
181*6467f958SSadaf Ebrahimi for (; kk + 4 <= N - 1; kk += 4)
182*6467f958SSadaf Ebrahimi {
183*6467f958SSadaf Ebrahimi __m128i vy = _mm_or_si128(
184*6467f958SSadaf Ebrahimi _mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)),
185*6467f958SSadaf Ebrahimi upper_mask.v),
186*6467f958SSadaf Ebrahimi // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
187*6467f958SSadaf Ebrahimi _mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)),
188*6467f958SSadaf Ebrahimi lower_mask.v));
189*6467f958SSadaf Ebrahimi
190*6467f958SSadaf Ebrahimi // y & 1 ? -1 : 0
191*6467f958SSadaf Ebrahimi __m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v);
192*6467f958SSadaf Ebrahimi // y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL]
193*6467f958SSadaf Ebrahimi __m128i vmag01 = _mm_and_si128(mask, matrix_a.v);
194*6467f958SSadaf Ebrahimi // mt[kk+M-N] ^ (y >> 1)
195*6467f958SSadaf Ebrahimi __m128i vr =
196*6467f958SSadaf Ebrahimi _mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M - N)),
197*6467f958SSadaf Ebrahimi _mm_srli_epi32(vy, 1));
198*6467f958SSadaf Ebrahimi // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
199*6467f958SSadaf Ebrahimi vr = _mm_xor_si128(vr, vmag01);
200*6467f958SSadaf Ebrahimi _mm_store_si128((__m128i *)(mt + kk), vr);
201*6467f958SSadaf Ebrahimi }
202*6467f958SSadaf Ebrahimi #endif
203*6467f958SSadaf Ebrahimi
204*6467f958SSadaf Ebrahimi for (; kk < N - 1; kk++)
205*6467f958SSadaf Ebrahimi {
206*6467f958SSadaf Ebrahimi y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
207*6467f958SSadaf Ebrahimi mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
208*6467f958SSadaf Ebrahimi }
209*6467f958SSadaf Ebrahimi y = (cl_uint)((mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK));
210*6467f958SSadaf Ebrahimi mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
211*6467f958SSadaf Ebrahimi
212*6467f958SSadaf Ebrahimi #ifdef __SSE2__
213*6467f958SSadaf Ebrahimi // Do the tempering ahead of time in vector code
214*6467f958SSadaf Ebrahimi for (kk = 0; kk + 4 <= N; kk += 4)
215*6467f958SSadaf Ebrahimi {
216*6467f958SSadaf Ebrahimi // y = mt[k];
217*6467f958SSadaf Ebrahimi __m128i vy = _mm_load_si128((__m128i *)(mt + kk));
218*6467f958SSadaf Ebrahimi // y ^= (y >> 11);
219*6467f958SSadaf Ebrahimi vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 11));
220*6467f958SSadaf Ebrahimi // y ^= (y << 7) & (cl_uint) 0x9d2c5680UL;
221*6467f958SSadaf Ebrahimi vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 7), c0.v));
222*6467f958SSadaf Ebrahimi // y ^= (y << 15) & (cl_uint) 0xefc60000UL;
223*6467f958SSadaf Ebrahimi vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 15), c1.v));
224*6467f958SSadaf Ebrahimi // y ^= (y >> 18);
225*6467f958SSadaf Ebrahimi vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 18));
226*6467f958SSadaf Ebrahimi _mm_store_si128((__m128i *)(d->cache + kk), vy);
227*6467f958SSadaf Ebrahimi }
228*6467f958SSadaf Ebrahimi #endif
229*6467f958SSadaf Ebrahimi
230*6467f958SSadaf Ebrahimi d->mti = 0;
231*6467f958SSadaf Ebrahimi }
232*6467f958SSadaf Ebrahimi #ifdef __SSE2__
233*6467f958SSadaf Ebrahimi y = d->cache[d->mti++];
234*6467f958SSadaf Ebrahimi #else
235*6467f958SSadaf Ebrahimi y = mt[d->mti++];
236*6467f958SSadaf Ebrahimi
237*6467f958SSadaf Ebrahimi /* Tempering */
238*6467f958SSadaf Ebrahimi y ^= (y >> 11);
239*6467f958SSadaf Ebrahimi y ^= (y << 7) & (cl_uint)0x9d2c5680UL;
240*6467f958SSadaf Ebrahimi y ^= (y << 15) & (cl_uint)0xefc60000UL;
241*6467f958SSadaf Ebrahimi y ^= (y >> 18);
242*6467f958SSadaf Ebrahimi #endif
243*6467f958SSadaf Ebrahimi
244*6467f958SSadaf Ebrahimi
245*6467f958SSadaf Ebrahimi return y;
246*6467f958SSadaf Ebrahimi }
247*6467f958SSadaf Ebrahimi
genrand_int64(MTdata d)248*6467f958SSadaf Ebrahimi cl_ulong genrand_int64(MTdata d)
249*6467f958SSadaf Ebrahimi {
250*6467f958SSadaf Ebrahimi return ((cl_ulong)genrand_int32(d) << 32) | (cl_uint)genrand_int32(d);
251*6467f958SSadaf Ebrahimi }
252*6467f958SSadaf Ebrahimi
253*6467f958SSadaf Ebrahimi /* generates a random number on [0,1]-real-interval */
genrand_real1(MTdata d)254*6467f958SSadaf Ebrahimi double genrand_real1(MTdata d)
255*6467f958SSadaf Ebrahimi {
256*6467f958SSadaf Ebrahimi return genrand_int32(d) * (1.0 / 4294967295.0);
257*6467f958SSadaf Ebrahimi /* divided by 2^32-1 */
258*6467f958SSadaf Ebrahimi }
259*6467f958SSadaf Ebrahimi
260*6467f958SSadaf Ebrahimi /* generates a random number on [0,1)-real-interval */
genrand_real2(MTdata d)261*6467f958SSadaf Ebrahimi double genrand_real2(MTdata d)
262*6467f958SSadaf Ebrahimi {
263*6467f958SSadaf Ebrahimi return genrand_int32(d) * (1.0 / 4294967296.0);
264*6467f958SSadaf Ebrahimi /* divided by 2^32 */
265*6467f958SSadaf Ebrahimi }
266*6467f958SSadaf Ebrahimi
267*6467f958SSadaf Ebrahimi /* generates a random number on (0,1)-real-interval */
genrand_real3(MTdata d)268*6467f958SSadaf Ebrahimi double genrand_real3(MTdata d)
269*6467f958SSadaf Ebrahimi {
270*6467f958SSadaf Ebrahimi return (((double)genrand_int32(d)) + 0.5) * (1.0 / 4294967296.0);
271*6467f958SSadaf Ebrahimi /* divided by 2^32 */
272*6467f958SSadaf Ebrahimi }
273*6467f958SSadaf Ebrahimi
274*6467f958SSadaf Ebrahimi /* generates a random number on [0,1) with 53-bit resolution*/
genrand_res53(MTdata d)275*6467f958SSadaf Ebrahimi double genrand_res53(MTdata d)
276*6467f958SSadaf Ebrahimi {
277*6467f958SSadaf Ebrahimi unsigned long a = genrand_int32(d) >> 5, b = genrand_int32(d) >> 6;
278*6467f958SSadaf Ebrahimi return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0);
279*6467f958SSadaf Ebrahimi }
280*6467f958SSadaf Ebrahimi
genrand_bool(MTdata d)281*6467f958SSadaf Ebrahimi bool genrand_bool(MTdata d) { return ((cl_uint)genrand_int32(d) & 1); }
282