/* xor514.c : coded by isaku@pb4.so-net.ne.jp
An Implementation of Xorshift RNG using SSE2(Integer version)

       codes by Isaku WADA  Sep 2017

One billion generation time(sec):

Core i7 2600(turbo boost off) Windows10 64bit
+-------------------------------------+-------+-------+-------+
|compiler                             |Next128|Next514|Back514|
+-------------------------------------+-------+-------+-------+
|VC++2017 64bit cl /Wall /O2 /GL      | 1.861 | 1.092 | 1.044 |
|VC++2017 32bit cl /Wall /O2 /GL      | 1.886 | 1.018 | 1.197 |
|TDM-GCC-64 gcc -Wall -O4 -m64        | 1.875 | 0.984 | 1.140 |
|TDM-GCC-64 gcc -Wall -O4 -m32 -msse2 | 2.328 | 0.984 | 1.109 |
+-------------------------------------+-------+-------+-------+

 The first version xor130() does only two 128 bit xorshift operations.

enum { a=11,b=23 };
x^=x<<a; x^=x>>b; return x;

 The sequence that generated by this first version does not FAIL
 diehard test. but it always FAIL old dieharder test.

 Next version does three 128 bit xorshift operations.

enum { a=8,b=69,c=75 };
x^=x<<a; x^=x>>b; x^=x<<c; return x;

 These (a,b,c) prameters were chosen to become high-speed.
 As a result, I got possible to execute at speed same
 as old version. And this version does not FAIL old dieharder test.
 But this version FAILs new dieharder test(3.31.1).

 I write new version xor514() again. It does NOT FAIL dieharder test(3.31.1)
enum { a=101,b=99,c=8 };
t=x^x<<a; x=y; y=z; z=w; t^=t>>b; w^=w>>c^t; return w;

 I modified this program in SEP 2017 and
 becomes faster than the before version.
 And add efficient Jump function.

 This Next514() provides 2^{514}-4 random 32-bit integers.
*/

#if defined(_MSC_VER)
 #pragma warning(disable:4514)
 #pragma warning(disable:4668)
 #pragma warning(disable:4710)
 #pragma warning(disable:4711)
 #pragma warning(disable:4820)
#endif
#include <emmintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <stdint.h>
#define RND_SSE2_NOINLINE 0 /* Turn it 1 if old gcc reports some warnnings */
#define USE_BSR   1 /* Turn it 1 if you want to use bit scan reverse  */
#define DIEHARDER 0 /* Turn it 1 if you want to test with dieharder test */

/*
 Xorshift RNGs. George Marsaglia.
 Journal of statistical software, Vol 8, Issue 14, Jul 2003
 http://www.jstatsoft.org/v08/i14/paper   page 5
*/

typedef struct tag128 { uint32_t x,y,z,w; } xor128_t;

uint32_t Next128(xor128_t*p) {
    enum { a=11,b=8,c=19 };
    uint32_t t=p->x ^ p->x<<a;  p->x=p->y; p->y=p->z; p->z=p->w;
    return p->w ^= p->w>>c ^ t ^ t>>b;
}

void Srand128(xor128_t*p,uint32_t s) {
   s^=521288629;
   p->x=s=1812433253*(s^(s>>30))+1;
   p->y=s=1812433253*(s^(s>>30))+2;
   p->z=s=1812433253*(s^(s>>30))+3;
   p->w=s=1812433253*(s^(s>>30))+4;
}

#if defined(__GNUC__)
 #define MUL514 1
#else
 #define MUL514 16 /* Expand status table */
#endif

typedef union tag514 {
    uint32_t u[4+MUL514*16];
    __m128i m[1+MUL514*4];
    int i;
} xor514_t;

#define NEXT514(X,W)                                                    \
 t=_mm_slli_epi64(X,a-64); t=_mm_slli_si128(t,8); t=_mm_xor_si128(t,X); \
 X=_mm_srli_epi64(t,b-64); X=_mm_srli_si128(X,8); t=_mm_xor_si128(t,X); \
 X=_mm_srli_si128(W,c/8);  X=_mm_xor_si128(X,t);  X=_mm_xor_si128(X,W);

#define BACK514(X,W)                                                      \
 t=_mm_srli_si128(W,c/8); X=_mm_xor_si128(X,W);     t=_mm_xor_si128(t,X); \
 X=_mm_srli_si128(t,8);   X=_mm_srli_epi64(X,b-64); t=_mm_xor_si128(t,X); \
 X=_mm_slli_si128(t,8);   X=_mm_slli_epi64(X,a-64); X=_mm_xor_si128(X,t);

#if RND_SSE2_NOINLINE && defined(__GNUC__)
extern void Next514Sub(xor514_t*p) __attribute__((noinline));
extern void Back514Sub(xor514_t*p) __attribute__((noinline));
#endif

void Next514Sub(xor514_t*p) {
    enum { a=101,b=99,c=8,m=MUL514*4-4 };
    __m128i x=_mm_load_si128(&p->m[m+1]),y=_mm_load_si128(&p->m[m+2]);
    __m128i z=_mm_load_si128(&p->m[m+3]),w=_mm_load_si128(&p->m[m+4]);
    __m128i t;
#if MUL514 == 1
    NEXT514(x,w); NEXT514(y,x); NEXT514(z,y); NEXT514(w,z);
   _mm_store_si128(&p->m[1],x); _mm_store_si128(&p->m[2],y);
   _mm_store_si128(&p->m[3],z); _mm_store_si128(&p->m[4],w);
#else
    int i;
    for (i=0;i<MUL514*4;i+=4) {
        NEXT514(x,w); NEXT514(y,x); NEXT514(z,y); NEXT514(w,z);
        _mm_store_si128(&p->m[1+i],x); _mm_store_si128(&p->m[2+i],y); 
        _mm_store_si128(&p->m[3+i],z); _mm_store_si128(&p->m[4+i],w);
    }
#endif
}

void Back514Sub(xor514_t*p) {
    enum { a=101,b=99,c=8 };
    __m128i x=_mm_load_si128(&p->m[1]),y=_mm_load_si128(&p->m[2]);
    __m128i z=_mm_load_si128(&p->m[3]),w=_mm_load_si128(&p->m[4]);
    __m128i t;
#if MUL514 == 1
    BACK514(w,z); BACK514(z,y); BACK514(y,x); BACK514(x,w);
    _mm_store_si128(&p->m[4],w); _mm_store_si128(&p->m[3],z);
    _mm_store_si128(&p->m[2],y); _mm_store_si128(&p->m[1],x);
#else
    int i;
    for (i=MUL514*4;i>0;) {
        i-=4; BACK514(w,z); BACK514(z,y); BACK514(y,x); BACK514(x,w);
        _mm_store_si128(&p->m[4+i],w); _mm_store_si128(&p->m[3+i],z);
        _mm_store_si128(&p->m[2+i],y); _mm_store_si128(&p->m[1+i],x);
    }
#endif
}

uint32_t Next514(xor514_t*p) {
    if (p->i==4+MUL514*16) { p->i=4; Next514Sub(p); }
    return p->u[p->i++];
}

uint32_t Back514(xor514_t*p) {
    uint32_t r=p->u[--p->i];
    if (p->i==4) { p->i=4+MUL514*16; Back514Sub(p); }
    return r;
}

void Srand514(xor514_t*p,uint32_t s) {
   uint32_t i; p->i=4+MUL514*16; s^=521288629;
   for (i=0;i<16;i++) p->u[i+MUL514*16-12]=s=1812433253*(s^(s>>30))+i;
#if MUL514 != 1
   Next514(p); Back514(p);
#endif
}

/*
Efficient Jump Ahead for F2-Linear Random Number Generators
http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/ARTICLES/jumpf2-printed.pdf
*/
#define CHARA_POLY \
"10000f0f0f0b2573064118bdcc6c897ac22e64d205e02586911d967f89bbbf986" \
"b3e95ac2ac80517506ddf762b05063702dcaf729c2d72e367cdee8d01500f001"

#define RND_EXP__ 512
#define SZSZT__     ((int)sizeof(size_t))
#define RND_NUM__   ((RND_EXP__+SZSZT__*8)/(SZSZT__*8))
#define RND_NUM2__  (((RND_EXP__+1)*2+SZSZT__*8-1)/(SZSZT__*8))

#if USE_BSR
 #if defined(__GNUC__)
  #define ALLOC_TEXT__(X) const char X[]__attribute__((section(".text#")))
 #elif defined(_MSC_VER)
  #pragma code_seg(".text")
  #define ALLOC_TEXT__(X) __declspec(allocate(".text")) const char X[]
 #elif defined(__BORLANDC__)
  #define ALLOC_TEXT__(X) extern const char X[]
 #else
  #define ALLOC_TEXT__(X) const char X[]
 #endif

 #if defined(__i386) && defined(__GNUC__) /* gcc 32 */
  #define FAST__ __attribute__((fastcall))
 #elif defined(__x86_64__) || defined(_M_X64)
  #define FAST__
 #elif defined(__BORLANDC__) /* bcc32 */
  #define FAST__ __msfastcall
 #else /* vc++ 32 */
  #define FAST__ __fastcall
 #endif

ALLOC_TEXT__(Bsr__)=
 #if (defined(__x86_64__) || defined(_M_X64)) && defined(__unix__)
   "\x48\x0F\xBD\xC7" /* bsr rax,rdi */
 #elif defined(__x86_64__) || defined(_M_X64)
   "\x48\x0F\xBD\xC1" /* bsr rax,rcx */
 #else
   "\x0F\xBD\xC1" /* bsr eax,ecx */
 #endif
   "\xC3";            /* ret */

 #define Bsr(A) (((int(FAST__*)(size_t))(size_t)Bsr__)(A))

#endif

typedef size_t Gf2_t[RND_NUM2__];
Gf2_t Chara__,BackChara__;

void NextState(xor514_t*p) {
    enum { a=101,b=99,c=8 }; __m128i t=p->m[1],s=p->m[4];
    p->m[1]=p->m[2]; p->m[2]=p->m[3]; p->m[3]=s; 
    t=_mm_xor_si128(t,_mm_slli_si128(_mm_slli_epi64(t,a-64),8));
    t=_mm_xor_si128(t,_mm_srli_si128(_mm_srli_epi64(t,b-64),8));
    p->m[4]=_mm_xor_si128(s,_mm_xor_si128(_mm_srli_si128(s,c/8),t));
}

void BackState(xor514_t*p) {
    enum { a=101,b=99,c=8 }; __m128i t=p->m[4],s=p->m[3];
    p->m[4]=s; p->m[3]=p->m[2]; p->m[2]=p->m[1];
    t=_mm_xor_si128(_mm_xor_si128(t,s),_mm_srli_si128(s,c/8));
    t=_mm_xor_si128(t,_mm_srli_epi64(_mm_srli_si128(t,8),b-64));
    p->m[1]=_mm_xor_si128(t,_mm_slli_epi64(_mm_slli_si128(t,8),a-64));
}

#if MUL514 != 1
void Update514(xor514_t*p) {
    enum { a=101,b=99,c=8 };
    __m128i x=_mm_load_si128(&p->m[1]),y=_mm_load_si128(&p->m[2]);
    __m128i z=_mm_load_si128(&p->m[3]),w=_mm_load_si128(&p->m[4]);
    __m128i t;
    int i;
    for (i=4;i<MUL514*4;i+=4) {
        NEXT514(x,w); NEXT514(y,x); NEXT514(z,y); NEXT514(w,z);
        _mm_store_si128(&p->m[1+i],x); _mm_store_si128(&p->m[2+i],y); 
        _mm_store_si128(&p->m[3+i],z); _mm_store_si128(&p->m[4+i],w);
    }
}
#endif
void SetPoly(void) {
    int i,j,c; size_t x=0;
    for (i=0;i<(RND_EXP__+4)/4;i++) {
        j=(RND_EXP__+4)/4-1-i;
        c=CHARA_POLY[j];
        if (c<='9') c-='0'; else c-='a'-10;
        x+=(size_t)(((size_t)c)<<(4*(i%(SZSZT__*2))));
        if (i%(SZSZT__*2)==(SZSZT__*2)-1||j==0)
        { Chara__[i/(SZSZT__*2)]=x; x=0; }
    }
    for (i=0;i<((RND_EXP__+SZSZT__*8)/(SZSZT__*8));i++) BackChara__[i]=0;
    for (i=0;i<=RND_EXP__;i++) {
        j=RND_EXP__-i;
        if (Chara__[i/(SZSZT__*8)]&(((size_t)1)<<(i%(SZSZT__*8))))
            BackChara__[j/(SZSZT__*8)]|=((size_t)1)<<(j%(SZSZT__*8));
    }
}

void Gf2Square__(Gf2_t c,const Gf2_t a) {
    size_t ch,cl,aa; int i,j,i2,j2;
    for (i=0;i<RND_NUM2__;i++) c[i]=0;
    for (j2=j=0;j<RND_NUM__;j++,j2+=2) {
        aa=a[j];
        if (aa==0) continue;
        ch=cl=0;
        for (i2=i=0;i<SZSZT__*8;i++,i2+=2) {
            if (aa&(((size_t)1)<<i)) {
                if (i2>=SZSZT__*8) ch|=((size_t)1)<<(i2-SZSZT__*8);
                else cl|=((size_t)1)<<i2;
            }
        }
        c[j2]^=cl;
        if (ch) c[j2+1]^=ch;
    }
}

void Gf2Mult__(Gf2_t c,const Gf2_t a,const Gf2_t b) {
    size_t ch,cl,d,aa,bb; int i,j,k,ij;
    for (i=0;i<RND_NUM2__;i++) c[i]=0;
    for (j=0;j<RND_NUM__;j++) {
        bb=b[j];
        if (bb==0) continue;
        for (i=0;i<RND_NUM__;i++) {
            aa=a[i];
            if (aa==0) continue;
            ij=i+j; cl=ch=0;
            for (k=0;k<SZSZT__*8;k++) {
                if (aa&(((size_t)1)<<k)) ch^=bb;
                d =(ch>>1)|(cl<<(SZSZT__*8-1));
                cl=(cl>>1)|(ch<<(SZSZT__*8-1));
                ch=d;
            }
            c[ij]^=cl;
            if (ch) c[ij+1]^=ch;
        }
    }
}

void Gf2ShiftLeft__(Gf2_t c,const Gf2_t a,const int s) {
    int i,ib=s&(SZSZT__*8-1);
    int na1=RND_NUM2__-(s-ib)/(SZSZT__*8),iw=s/(SZSZT__*8);
    size_t x;
    for (i=0;i<RND_NUM2__;i++) c[i]=0;
    if (ib==0) {
        for (i=0;i<na1;i++) c[i+iw]^=a[i];
        return;
    }
    for (i=0;i<na1;i++) {
        x=a[i];
        c[i+iw+1]^=x>>(SZSZT__*8-ib);
        c[i+iw]^=x<<ib;
    }
}

int Gf2Deg__(const Gf2_t a) {
    int i;
    for (i=RND_NUM2__-1;i>=0;i--) if (a[i]) break;
    if (i==-1) return -1;
#if USE_BSR
    return i*SZSZT__*8+Bsr(a[i]);
#else
    {
        int j; size_t x=a[i];
        for (j=SZSZT__*8-1;(x&(((size_t)1)<<j))==0;j--) ;
        return i*SZSZT__*8+j;
    }
#endif
 }

void Gf2Rem__(Gf2_t r,const Gf2_t a,const Gf2_t b) {
    Gf2_t t; int i,dr=Gf2Deg__(a);
    for (i=0;i<RND_NUM2__;i++) r[i]=a[i];
    while (dr>=RND_EXP__) {
        Gf2ShiftLeft__(t,b,dr-RND_EXP__);
        for (i=0;i<RND_NUM2__;i++) r[i]^=t[i];
        dr=Gf2Deg__(r);
    }
}

void Jump(double lag,xor514_t*p) {
    Gf2_t c,d,s; size_t*m=Chara__,w,h; xor514_t y; int i,j,rest;
    if (Chara__[0]==0) SetPoly();
    for (i=0;i<RND_NUM2__;i++) c[i]=s[i]=0;
    c[0]=1; s[0]=2;
    if (lag<0) { lag=-lag; m=BackChara__; }
    rest=(int)(modf(lag/4.0,&lag)*4);
    for (i=0;i<rest;i++) if (m==BackChara__) Back514(p); else Next514(p);
    for (;;) {
        if (modf(lag/2.0,&lag)!=0) { Gf2Mult__(d,c,s); Gf2Rem__(c,d,m); }
        if (lag==0) break;    else { Gf2Square__(d,s); Gf2Rem__(s,d,m); }
    }
    j=(RND_EXP__-1)/(SZSZT__*8);
    h=((size_t)1)<<((RND_EXP__-1)&(SZSZT__*8-1)); w=c[j];
    y.m[0]=p->m[0];
    for (i=1;i<5;i++) y.m[i]=_mm_setzero_si128();
    for (;;) {
        if (w&h) for (i=1;i<5;i++) y.m[i]=_mm_xor_si128(y.m[i],p->m[i]);
        if ((h>>=1)==0) {
            if (j==0) break;
            h=((size_t)1)<<(SZSZT__*8-1); w=c[--j];
        }
        if (m==BackChara__) BackState(&y);
        else NextState(&y);
    }
    for (i=1;i<5;i++) p->m[i]=y.m[i];
#if MUL514 != 1
    Update514(p);
#endif
}


#define N 1000000000

int main(int argc,char**argv) {
    clock_t lap,min1,min2; uint32_t i,k,sum;
#if defined(__GNUC__)
    xor128_t tbl128[1]={{123456789,362436069,521288629,88675123}};
    xor514_t tbl514[1];
#else
    static xor128_t tbl128[1]={{123456789,362436069,521288629,88675123}};
    static xor514_t tbl514[1];
#endif

#if DIEHARDER
    if (argc==2) {
        uint32_t buf[1024];
        Srand514(tbl514,atoi(argv[1]));
        for (;;) {
            for (i=0;i<1024;i++) buf[i]=Next514(tbl514);
            fwrite(buf,4,1024,stdout);
        }
    }
#else
    (void)argc; (void)argv;
#endif
    Srand514(tbl514,1);

    {
        uint32_t x[1000];
        for (i=0;i<1000;i++) x[i]=Back514(tbl514);
        for (i=1000;i--;)    if (x[i]!=Next514(tbl514)) printf("*");
        printf("\n");
    }

    printf("sizeof(size_t)=%d\n",(int)sizeof(size_t));

    {
        enum { kk=12345 }; uint32_t x[kk]; double z=1;
        for (i=0;i<514;i++) z*=2;
        for (i=0;i<kk;i++) x[i]=Next514(tbl514);
        Jump(z,tbl514);
        Jump(-kk-4,tbl514);
        for (i=0;i<kk;i++) if (x[i]!=Next514(tbl514)) printf("*");
        printf("\n");
    }

    min1=0x7fffffff; sum=0; printf("Next128");
    for (k=0;k<8;k++) {
        lap=clock();
        for (i=0;i<N;i++) sum+=Next128(tbl128);
        lap=(clock()-lap)*1000/CLOCKS_PER_SEC;
        printf(" %.0f",(double)lap);
#if defined(__GNUC__)
        fflush(stdout);
#endif
        if (lap<min1) min1=lap;
    }
    printf("(%d)%g\n",(int)min1,(double)sum);

    min2=0x7fffffff; sum=0; printf("Next514");
    for (k=0;k<8;k++) {
        lap=clock();
        for (i=0;i<N;i++) sum+=Next514(tbl514);
        lap=(clock()-lap)*1000/CLOCKS_PER_SEC;
        printf(" %.0f",(double)lap);
#if defined(__GNUC__)
        fflush(stdout);
#endif
        if (lap<min2) min2=lap;
    }
    printf("(%d)%g ",(int)min2,(double)sum);
    if (min2<min1) printf("%.2f%%fast\n",100.0*(min1-min2)/min1);
    else           printf("%.2f%%slow\n",100.0*(min2-min1)/min1);

    min2=0x7fffffff; printf("Back514");
    for (k=0;k<8;k++) {
        lap=clock();
        for (i=0;i<N;i++) sum-=Back514(tbl514);
        lap=(clock()-lap)*1000/CLOCKS_PER_SEC;
        printf(" %.0f",(double)lap);
#if defined(__GNUC__)
        fflush(stdout);
#endif
        if (lap<min2) min2=lap;
    }
    printf("(%d)%g ",(int)min2,(double)sum);
    if (min2<min1) printf("%.2f%%fast\n",100.0*(min1-min2)/min1);
    else           printf("%.2f%%slow\n",100.0*(min2-min1)/min1);

    return 0;
}