/* xor.c : coded by isaku@pb4.so-net.ne.jp
An Implementation of Xorshift RNG using SSE2(Double version)

       codes by Isaku WADA  Jun 2014

One billion generation time(sec):

Core i7 2600(turbo boost off)
+-----------------------------------------------+-------+-------+--------+
|compiler and system                            |dxor128|dxor513|dback513|
+-----------------------------------------------+-------+-------+--------+
|VC++2010 64bit cl /O2 /GL /W4 on Windows7 64bit|  2.121|  2.168|   1.887|
|VC++2010 32bit cl /O2 /GL /W4 on Windows7 64bit|  4.602|  1.840|   1.856|
|gcc -O4 -Wall             on Ubuntu 14.04 64bit|  2.655|  1.976|   1.863|
|gcc -O4 -Wall -m32 -msse2 on Ubuntu 14.04 64bit|  2.430|  1.847|   1.817|
+-----------------------------------------------+-------+-------+--------+

dxor513() is based on xor514().
xor514() does not FAIL dieharder test(3.31.1)

enum { a=101,b=99,c=8 };
t=x^x<<a; x=y; y=z; z=w; t^=t>>b; w^=w>>c^t; return w;

 This dxor513() provide 2^{513}-2 random 52bit pricision double
 with range [0,1).

*/

#include <emmintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>
#define RND_SSE2_NOINLINE 0 /* Turn it 1 if old gcc reports some warnnings */

/*
 Xorshift RNGs. George Marsaglia.
 Journal of statistical software, Vol 8, Issue 14, Jul 2003
 http://www.jstatsoft.org/v08/i14/paper   page 5
*/

__inline double dxor128(void) {
    enum { a=11,b=8,c=19 };
    static uint32_t x=123456789,y=362436069,z=521288629,w=88675123;
    uint32_t t=x^x<<a; x=y; y=z; z=w;
    return (w^=w>>c^t^t>>b)*(1.0/4294967296.0);
}

typedef union tag513 {
    uint32_t u[36];
    double   d[18];
    __m128i  m[9];
} dxor513_t;

const union { int i[4]; __m128i m; } EXPO513 = {{0,0x3ff00000,0,0x3ff00000}};

void sxor513(dxor513_t*p,uint32_t s) {
   uint32_t i; p->u[0]=18; p->u[4]=s;
   for (i=1;i<16;i++) p->u[i+4]=s=1812433253*(s^(s>>30))+i;
   for (i=1;i<=4;i++)
       p->m[i+4]=_mm_or_si128(_mm_srli_epi64(p->m[i],12),EXPO513.m);
}


#if 1

 #define DXOR513HEAD 10

 #define DXOR513(X,W)                                            \
    t=_mm_xor_si128(X,_mm_slli_si128(_mm_slli_epi64(X,a-64),8)); \
    t=_mm_xor_si128(t,_mm_srli_si128(_mm_srli_epi64(t,b-64),8)); \
    X=_mm_xor_si128(W,_mm_xor_si128(_mm_srli_si128(W,c/8),t))

 #define DBACK513(X,W)                                           \
    t=_mm_xor_si128(_mm_xor_si128(X,W),_mm_srli_si128(W,c/8));   \
    t=_mm_xor_si128(t,_mm_srli_epi64(_mm_srli_si128(t,8),b-64)); \
    X=_mm_xor_si128(t,_mm_slli_epi64(_mm_slli_si128(t,8),a-64))

 #if RND_SSE2_NOINLINE
extern void dxor513sub(dxor513_t*p) __attribute__((noinline));
extern void dback513sub(dxor513_t*p) __attribute__((noinline));
 #endif

void dxor513sub(dxor513_t*p) {
    enum { a=101,b=99,c=8 }; __m128i t,x,y,z,w;
    x=_mm_load_si128(&p->m[1]);
    y=_mm_load_si128(&p->m[2]);
    z=_mm_load_si128(&p->m[3]);
    w=_mm_load_si128(&p->m[4]);
    DXOR513(x,w); DXOR513(y,x); DXOR513(z,y); DXOR513(w,z); t=EXPO513.m;
    _mm_store_si128(&p->m[1],x);
    _mm_store_si128(&p->m[2],y);
    _mm_store_si128(&p->m[3],z);
    _mm_store_si128(&p->m[4],w);
    _mm_store_si128(&p->m[5],_mm_or_si128(_mm_srli_epi64(x,12),t));
    _mm_store_si128(&p->m[6],_mm_or_si128(_mm_srli_epi64(y,12),t));
    _mm_store_si128(&p->m[7],_mm_or_si128(_mm_srli_epi64(z,12),t));
    _mm_store_si128(&p->m[8],_mm_or_si128(_mm_srli_epi64(w,12),t));
}

void dback513sub(dxor513_t*p) {
    enum { a=101,b=99,c=8 }; __m128i t,x,y,z,w;
    x=_mm_load_si128(&p->m[1]);
    y=_mm_load_si128(&p->m[2]);
    z=_mm_load_si128(&p->m[3]);
    w=_mm_load_si128(&p->m[4]);
    DBACK513(w,z); DBACK513(z,y); DBACK513(y,x); DBACK513(x,w); t=EXPO513.m;
    _mm_store_si128(&p->m[1],x);
    _mm_store_si128(&p->m[2],y);
    _mm_store_si128(&p->m[3],z);
    _mm_store_si128(&p->m[4],w);
    _mm_store_si128(&p->m[5],_mm_or_si128(_mm_srli_epi64(x,12),t));
    _mm_store_si128(&p->m[6],_mm_or_si128(_mm_srli_epi64(y,12),t));
    _mm_store_si128(&p->m[7],_mm_or_si128(_mm_srli_epi64(z,12),t));
    _mm_store_si128(&p->m[8],_mm_or_si128(_mm_srli_epi64(w,12),t));
}

#else

 #define DXOR513HEAD 16

void dxor513sub(dxor513_t*p) {
    enum { a=101,b=99,c=8 }; __m128i t=p->m[1],s=p->m[4];
    p->m[1]=p->m[2]; p->m[2]=p->m[3]; p->m[3]=s; 
    t=_mm_xor_si128(t,_mm_slli_si128(_mm_slli_epi64(t,a-64),8));
    t=_mm_xor_si128(t,_mm_srli_si128(_mm_srli_epi64(t,b-64),8));
    p->m[4]=s=_mm_xor_si128(s,_mm_xor_si128(_mm_srli_si128(s,c/8),t));
    p->m[8]=_mm_or_si128(_mm_srli_epi64(s,12),EXPO513.m);
}

void dback513sub(dxor513_t*p) {
    enum { a=101,b=99,c=8 }; __m128i t=p->m[4],s=p->m[3];
    p->m[4]=s; p->m[3]=p->m[2]; p->m[2]=p->m[1];
    p->m[8]=_mm_or_si128(_mm_srli_epi64(s,12),EXPO513.m);
    t=_mm_xor_si128(_mm_xor_si128(t,s),_mm_srli_si128(s,c/8));
    t=_mm_xor_si128(t,_mm_srli_epi64(_mm_srli_si128(t,8),b-64));
    p->m[1]=_mm_xor_si128(t,_mm_slli_epi64(_mm_slli_si128(t,8),a-64));
}

#endif

double dxor513(dxor513_t*p) {
    if (p->u[0]==18) { p->u[0]=DXOR513HEAD; dxor513sub(p); }
    return p->d[p->u[0]++]-1.0;
}

double dback513(dxor513_t*p) {
    int i=p->u[0]-1; double r=p->d[i];
    if (i==DXOR513HEAD) { i=18; dback513sub(p); }
    p->u[0]=i; return r-1.0;
}


#define N 1000000000

int main(void) {
    clock_t lap,min1,min2; uint32_t i,k; dxor513_t arg[1]; double sum;

    printf("sizeof(size_t)=%d\n",(int)sizeof(size_t));
    sxor513(arg,1);

    min1=0x7fffffff; sum=0; printf("dxor128 ");
    for (k=0;k<8;k++) {
        lap=clock();
        for (i=0;i<N;i++) sum+=dxor128();
        lap=(clock()-lap)*1000/CLOCKS_PER_SEC;
        printf(" %3lu",lap);
#if defined(__GNUC__)
        fflush(stdout);
#endif
        if (lap<min1) min1=lap;
    }
    printf("(%lu)%g\n",min1,sum);

    min2=0x7fffffff; sum=0; printf("dxor513 ");
    for (k=0;k<8;k++) {
        lap=clock();
        for (i=0;i<N;i++) sum+=dxor513(arg);
        lap=(clock()-lap)*1000/CLOCKS_PER_SEC;
        printf(" %3lu",lap);
#if defined(__GNUC__)
        fflush(stdout);
#endif
        if (lap<min2) min2=lap;
    }
    printf("(%lu)%g ",min2,sum);
    if (min2<min1) printf("%.2f%%fast\n",100.0*(min1-min2)/min1);
    else           printf("%.2f%%slow\n",100.0*(min2-min1)/min1);

    min2=0x7fffffff; printf("dback513");
    for (k=0;k<8;k++) {
        lap=clock();
        for (i=0;i<N;i++) sum-=dback513(arg);
        lap=(clock()-lap)*1000/CLOCKS_PER_SEC;
        printf(" %3lu",lap);
#if defined(__GNUC__)
        fflush(stdout);
#endif
        if (lap<min2) min2=lap;
    }
    printf("(%lu)%g ",min2,sum);
    if (min2<min1) printf("%.2f%%fast\n",100.0*(min1-min2)/min1);
    else           printf("%.2f%%slow\n",100.0*(min2-min1)/min1);
    return 0;
}