xref: /aosp_15_r20/external/OpenCL-CTS/test_common/harness/mt19937.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
/*
   A C-program for MT19937, with initialization improved 2002/1/26.
   Coded by Takuji Nishimura and Makoto Matsumoto.

   Before using, initialize the state by using init_genrand(seed)
   or init_by_array(init_key, key_length).

   Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

     1. Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.

     2. Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.

     3. The names of its contributors may not be used to endorse or promote
        products derived from this software without specific prior written
        permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


   Any feedback is very welcome.
   http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
   email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)

   Modifications for use in OpenCL by Ian Ollmann, Apple Inc.

*/
46*6467f958SSadaf Ebrahimi 
47*6467f958SSadaf Ebrahimi #include <stdio.h>
48*6467f958SSadaf Ebrahimi #include <stdlib.h>
49*6467f958SSadaf Ebrahimi #include "mt19937.h"
50*6467f958SSadaf Ebrahimi #include "mingw_compat.h"
51*6467f958SSadaf Ebrahimi #include "harness/alloc.h"
52*6467f958SSadaf Ebrahimi 
53*6467f958SSadaf Ebrahimi #ifdef __SSE2__
54*6467f958SSadaf Ebrahimi #include <mutex>
55*6467f958SSadaf Ebrahimi #include <emmintrin.h>
56*6467f958SSadaf Ebrahimi #endif
57*6467f958SSadaf Ebrahimi 
/* Period parameters */
#define N 624 /* state size in words; vector code requires multiple of 4 here */
#define M 397 /* distance to the second state word used by the recurrence */
/* Each cast is wrapped in parentheses so the macro expands to a single
   parenthesized expression whatever operators surround its use. */
#define MATRIX_A ((cl_uint)0x9908b0dfUL) /* constant vector a */
#define UPPER_MASK ((cl_uint)0x80000000UL) /* most significant w-r bits */
#define LOWER_MASK ((cl_uint)0x7fffffffUL) /* least significant r bits */
64*6467f958SSadaf Ebrahimi 
65*6467f958SSadaf Ebrahimi typedef struct _MTdata
66*6467f958SSadaf Ebrahimi {
67*6467f958SSadaf Ebrahimi     cl_uint mt[N];
68*6467f958SSadaf Ebrahimi #ifdef __SSE2__
69*6467f958SSadaf Ebrahimi     cl_uint cache[N];
70*6467f958SSadaf Ebrahimi #endif
71*6467f958SSadaf Ebrahimi     cl_int mti;
72*6467f958SSadaf Ebrahimi } _MTdata;
73*6467f958SSadaf Ebrahimi 
74*6467f958SSadaf Ebrahimi /* initializes mt[N] with a seed */
init_genrand(cl_uint s)75*6467f958SSadaf Ebrahimi MTdata init_genrand(cl_uint s)
76*6467f958SSadaf Ebrahimi {
77*6467f958SSadaf Ebrahimi     MTdata r = (MTdata)align_malloc(sizeof(_MTdata), 16);
78*6467f958SSadaf Ebrahimi     if (NULL != r)
79*6467f958SSadaf Ebrahimi     {
80*6467f958SSadaf Ebrahimi         cl_uint *mt = r->mt;
81*6467f958SSadaf Ebrahimi         int mti = 0;
82*6467f958SSadaf Ebrahimi         mt[0] = s; // & 0xffffffffUL;
83*6467f958SSadaf Ebrahimi         for (mti = 1; mti < N; mti++)
84*6467f958SSadaf Ebrahimi         {
85*6467f958SSadaf Ebrahimi             mt[mti] = (cl_uint)(
86*6467f958SSadaf Ebrahimi                 1812433253UL * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
87*6467f958SSadaf Ebrahimi             /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
88*6467f958SSadaf Ebrahimi             /* In the previous versions, MSBs of the seed affect   */
89*6467f958SSadaf Ebrahimi             /* only MSBs of the array mt[].                        */
90*6467f958SSadaf Ebrahimi             /* 2002/01/09 modified by Makoto Matsumoto             */
91*6467f958SSadaf Ebrahimi             // mt[mti] &= 0xffffffffUL;
92*6467f958SSadaf Ebrahimi             /* for >32 bit machines */
93*6467f958SSadaf Ebrahimi         }
94*6467f958SSadaf Ebrahimi         r->mti = mti;
95*6467f958SSadaf Ebrahimi     }
96*6467f958SSadaf Ebrahimi 
97*6467f958SSadaf Ebrahimi     return r;
98*6467f958SSadaf Ebrahimi }
99*6467f958SSadaf Ebrahimi 
free_mtdata(MTdata d)100*6467f958SSadaf Ebrahimi void free_mtdata(MTdata d)
101*6467f958SSadaf Ebrahimi {
102*6467f958SSadaf Ebrahimi     if (d) align_free(d);
103*6467f958SSadaf Ebrahimi }
104*6467f958SSadaf Ebrahimi 
105*6467f958SSadaf Ebrahimi /* generates a random number on [0,0xffffffff]-interval */
genrand_int32(MTdata d)106*6467f958SSadaf Ebrahimi cl_uint genrand_int32(MTdata d)
107*6467f958SSadaf Ebrahimi {
108*6467f958SSadaf Ebrahimi     /* mag01[x] = x * MATRIX_A  for x=0,1 */
109*6467f958SSadaf Ebrahimi     static const cl_uint mag01[2] = { 0x0UL, MATRIX_A };
110*6467f958SSadaf Ebrahimi #ifdef __SSE2__
111*6467f958SSadaf Ebrahimi     static std::once_flag init_flag;
112*6467f958SSadaf Ebrahimi     static union {
113*6467f958SSadaf Ebrahimi         __m128i v;
114*6467f958SSadaf Ebrahimi         cl_uint s[4];
115*6467f958SSadaf Ebrahimi     } upper_mask, lower_mask, one, matrix_a, c0, c1;
116*6467f958SSadaf Ebrahimi #endif
117*6467f958SSadaf Ebrahimi 
118*6467f958SSadaf Ebrahimi 
119*6467f958SSadaf Ebrahimi     cl_uint *mt = d->mt;
120*6467f958SSadaf Ebrahimi     cl_uint y;
121*6467f958SSadaf Ebrahimi 
122*6467f958SSadaf Ebrahimi     if (d->mti == N)
123*6467f958SSadaf Ebrahimi     { /* generate N words at one time */
124*6467f958SSadaf Ebrahimi         int kk;
125*6467f958SSadaf Ebrahimi 
126*6467f958SSadaf Ebrahimi #ifdef __SSE2__
127*6467f958SSadaf Ebrahimi         auto init_fn = []() {
128*6467f958SSadaf Ebrahimi             upper_mask.s[0] = upper_mask.s[1] = upper_mask.s[2] =
129*6467f958SSadaf Ebrahimi                 upper_mask.s[3] = UPPER_MASK;
130*6467f958SSadaf Ebrahimi             lower_mask.s[0] = lower_mask.s[1] = lower_mask.s[2] =
131*6467f958SSadaf Ebrahimi                 lower_mask.s[3] = LOWER_MASK;
132*6467f958SSadaf Ebrahimi             one.s[0] = one.s[1] = one.s[2] = one.s[3] = 1;
133*6467f958SSadaf Ebrahimi             matrix_a.s[0] = matrix_a.s[1] = matrix_a.s[2] = matrix_a.s[3] =
134*6467f958SSadaf Ebrahimi                 MATRIX_A;
135*6467f958SSadaf Ebrahimi             c0.s[0] = c0.s[1] = c0.s[2] = c0.s[3] = (cl_uint)0x9d2c5680UL;
136*6467f958SSadaf Ebrahimi             c1.s[0] = c1.s[1] = c1.s[2] = c1.s[3] = (cl_uint)0xefc60000UL;
137*6467f958SSadaf Ebrahimi         };
138*6467f958SSadaf Ebrahimi         std::call_once(init_flag, init_fn);
139*6467f958SSadaf Ebrahimi #endif
140*6467f958SSadaf Ebrahimi 
141*6467f958SSadaf Ebrahimi         kk = 0;
142*6467f958SSadaf Ebrahimi #ifdef __SSE2__
143*6467f958SSadaf Ebrahimi         // vector loop
144*6467f958SSadaf Ebrahimi         for (; kk + 4 <= N - M; kk += 4)
145*6467f958SSadaf Ebrahimi         {
146*6467f958SSadaf Ebrahimi             // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
147*6467f958SSadaf Ebrahimi             __m128i vy = _mm_or_si128(
148*6467f958SSadaf Ebrahimi                 _mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)),
149*6467f958SSadaf Ebrahimi                               upper_mask.v),
150*6467f958SSadaf Ebrahimi                 _mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)),
151*6467f958SSadaf Ebrahimi                               lower_mask.v));
152*6467f958SSadaf Ebrahimi 
153*6467f958SSadaf Ebrahimi             // y & 1 ? -1 : 0
154*6467f958SSadaf Ebrahimi             __m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v);
155*6467f958SSadaf Ebrahimi             // y & 1 ? MATRIX_A, 0    =  mag01[y & (cl_uint) 0x1UL]
156*6467f958SSadaf Ebrahimi             __m128i vmag01 = _mm_and_si128(mask, matrix_a.v);
157*6467f958SSadaf Ebrahimi             // mt[kk+M] ^ (y >> 1)
158*6467f958SSadaf Ebrahimi             __m128i vr =
159*6467f958SSadaf Ebrahimi                 _mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M)),
160*6467f958SSadaf Ebrahimi                               (__m128i)_mm_srli_epi32(vy, 1));
161*6467f958SSadaf Ebrahimi             // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
162*6467f958SSadaf Ebrahimi             vr = _mm_xor_si128(vr, vmag01);
163*6467f958SSadaf Ebrahimi             _mm_store_si128((__m128i *)(mt + kk), vr);
164*6467f958SSadaf Ebrahimi         }
165*6467f958SSadaf Ebrahimi #endif
166*6467f958SSadaf Ebrahimi         for (; kk < N - M; kk++)
167*6467f958SSadaf Ebrahimi         {
168*6467f958SSadaf Ebrahimi             y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
169*6467f958SSadaf Ebrahimi             mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
170*6467f958SSadaf Ebrahimi         }
171*6467f958SSadaf Ebrahimi 
172*6467f958SSadaf Ebrahimi #ifdef __SSE2__
173*6467f958SSadaf Ebrahimi         // advance to next aligned location
174*6467f958SSadaf Ebrahimi         for (; kk < N - 1 && (kk & 3); kk++)
175*6467f958SSadaf Ebrahimi         {
176*6467f958SSadaf Ebrahimi             y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
177*6467f958SSadaf Ebrahimi             mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
178*6467f958SSadaf Ebrahimi         }
179*6467f958SSadaf Ebrahimi 
180*6467f958SSadaf Ebrahimi         // vector loop
181*6467f958SSadaf Ebrahimi         for (; kk + 4 <= N - 1; kk += 4)
182*6467f958SSadaf Ebrahimi         {
183*6467f958SSadaf Ebrahimi             __m128i vy = _mm_or_si128(
184*6467f958SSadaf Ebrahimi                 _mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)),
185*6467f958SSadaf Ebrahimi                               upper_mask.v),
186*6467f958SSadaf Ebrahimi                 // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
187*6467f958SSadaf Ebrahimi                 _mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)),
188*6467f958SSadaf Ebrahimi                               lower_mask.v));
189*6467f958SSadaf Ebrahimi 
190*6467f958SSadaf Ebrahimi             // y & 1 ? -1 : 0
191*6467f958SSadaf Ebrahimi             __m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v);
192*6467f958SSadaf Ebrahimi             // y & 1 ? MATRIX_A, 0    =  mag01[y & (cl_uint) 0x1UL]
193*6467f958SSadaf Ebrahimi             __m128i vmag01 = _mm_and_si128(mask, matrix_a.v);
194*6467f958SSadaf Ebrahimi             // mt[kk+M-N] ^ (y >> 1)
195*6467f958SSadaf Ebrahimi             __m128i vr =
196*6467f958SSadaf Ebrahimi                 _mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M - N)),
197*6467f958SSadaf Ebrahimi                               _mm_srli_epi32(vy, 1));
198*6467f958SSadaf Ebrahimi             // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
199*6467f958SSadaf Ebrahimi             vr = _mm_xor_si128(vr, vmag01);
200*6467f958SSadaf Ebrahimi             _mm_store_si128((__m128i *)(mt + kk), vr);
201*6467f958SSadaf Ebrahimi         }
202*6467f958SSadaf Ebrahimi #endif
203*6467f958SSadaf Ebrahimi 
204*6467f958SSadaf Ebrahimi         for (; kk < N - 1; kk++)
205*6467f958SSadaf Ebrahimi         {
206*6467f958SSadaf Ebrahimi             y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
207*6467f958SSadaf Ebrahimi             mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
208*6467f958SSadaf Ebrahimi         }
209*6467f958SSadaf Ebrahimi         y = (cl_uint)((mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK));
210*6467f958SSadaf Ebrahimi         mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
211*6467f958SSadaf Ebrahimi 
212*6467f958SSadaf Ebrahimi #ifdef __SSE2__
213*6467f958SSadaf Ebrahimi         // Do the tempering ahead of time in vector code
214*6467f958SSadaf Ebrahimi         for (kk = 0; kk + 4 <= N; kk += 4)
215*6467f958SSadaf Ebrahimi         {
216*6467f958SSadaf Ebrahimi             // y = mt[k];
217*6467f958SSadaf Ebrahimi             __m128i vy = _mm_load_si128((__m128i *)(mt + kk));
218*6467f958SSadaf Ebrahimi             // y ^= (y >> 11);
219*6467f958SSadaf Ebrahimi             vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 11));
220*6467f958SSadaf Ebrahimi             // y ^= (y << 7) & (cl_uint) 0x9d2c5680UL;
221*6467f958SSadaf Ebrahimi             vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 7), c0.v));
222*6467f958SSadaf Ebrahimi             // y ^= (y << 15) & (cl_uint) 0xefc60000UL;
223*6467f958SSadaf Ebrahimi             vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 15), c1.v));
224*6467f958SSadaf Ebrahimi             // y ^= (y >> 18);
225*6467f958SSadaf Ebrahimi             vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 18));
226*6467f958SSadaf Ebrahimi             _mm_store_si128((__m128i *)(d->cache + kk), vy);
227*6467f958SSadaf Ebrahimi         }
228*6467f958SSadaf Ebrahimi #endif
229*6467f958SSadaf Ebrahimi 
230*6467f958SSadaf Ebrahimi         d->mti = 0;
231*6467f958SSadaf Ebrahimi     }
232*6467f958SSadaf Ebrahimi #ifdef __SSE2__
233*6467f958SSadaf Ebrahimi     y = d->cache[d->mti++];
234*6467f958SSadaf Ebrahimi #else
235*6467f958SSadaf Ebrahimi     y = mt[d->mti++];
236*6467f958SSadaf Ebrahimi 
237*6467f958SSadaf Ebrahimi     /* Tempering */
238*6467f958SSadaf Ebrahimi     y ^= (y >> 11);
239*6467f958SSadaf Ebrahimi     y ^= (y << 7) & (cl_uint)0x9d2c5680UL;
240*6467f958SSadaf Ebrahimi     y ^= (y << 15) & (cl_uint)0xefc60000UL;
241*6467f958SSadaf Ebrahimi     y ^= (y >> 18);
242*6467f958SSadaf Ebrahimi #endif
243*6467f958SSadaf Ebrahimi 
244*6467f958SSadaf Ebrahimi 
245*6467f958SSadaf Ebrahimi     return y;
246*6467f958SSadaf Ebrahimi }
247*6467f958SSadaf Ebrahimi 
genrand_int64(MTdata d)248*6467f958SSadaf Ebrahimi cl_ulong genrand_int64(MTdata d)
249*6467f958SSadaf Ebrahimi {
250*6467f958SSadaf Ebrahimi     return ((cl_ulong)genrand_int32(d) << 32) | (cl_uint)genrand_int32(d);
251*6467f958SSadaf Ebrahimi }
252*6467f958SSadaf Ebrahimi 
253*6467f958SSadaf Ebrahimi /* generates a random number on [0,1]-real-interval */
genrand_real1(MTdata d)254*6467f958SSadaf Ebrahimi double genrand_real1(MTdata d)
255*6467f958SSadaf Ebrahimi {
256*6467f958SSadaf Ebrahimi     return genrand_int32(d) * (1.0 / 4294967295.0);
257*6467f958SSadaf Ebrahimi     /* divided by 2^32-1 */
258*6467f958SSadaf Ebrahimi }
259*6467f958SSadaf Ebrahimi 
260*6467f958SSadaf Ebrahimi /* generates a random number on [0,1)-real-interval */
genrand_real2(MTdata d)261*6467f958SSadaf Ebrahimi double genrand_real2(MTdata d)
262*6467f958SSadaf Ebrahimi {
263*6467f958SSadaf Ebrahimi     return genrand_int32(d) * (1.0 / 4294967296.0);
264*6467f958SSadaf Ebrahimi     /* divided by 2^32 */
265*6467f958SSadaf Ebrahimi }
266*6467f958SSadaf Ebrahimi 
267*6467f958SSadaf Ebrahimi /* generates a random number on (0,1)-real-interval */
genrand_real3(MTdata d)268*6467f958SSadaf Ebrahimi double genrand_real3(MTdata d)
269*6467f958SSadaf Ebrahimi {
270*6467f958SSadaf Ebrahimi     return (((double)genrand_int32(d)) + 0.5) * (1.0 / 4294967296.0);
271*6467f958SSadaf Ebrahimi     /* divided by 2^32 */
272*6467f958SSadaf Ebrahimi }
273*6467f958SSadaf Ebrahimi 
274*6467f958SSadaf Ebrahimi /* generates a random number on [0,1) with 53-bit resolution*/
genrand_res53(MTdata d)275*6467f958SSadaf Ebrahimi double genrand_res53(MTdata d)
276*6467f958SSadaf Ebrahimi {
277*6467f958SSadaf Ebrahimi     unsigned long a = genrand_int32(d) >> 5, b = genrand_int32(d) >> 6;
278*6467f958SSadaf Ebrahimi     return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0);
279*6467f958SSadaf Ebrahimi }
280*6467f958SSadaf Ebrahimi 
genrand_bool(MTdata d)281*6467f958SSadaf Ebrahimi bool genrand_bool(MTdata d) { return ((cl_uint)genrand_int32(d) & 1); }
282