PerformanceTests/JetStream2/wasm/gcc-loops.cpp - WebKit - Git at Google

 #include <sys/time.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <iostream>
 #include <vector>
 #include <numeric>

 /// This test contains some of the loops from the GCC vectorizer example page [1].
 /// Dorit Nuzman who developed the gcc vectorizer said that we can use them in our test suite.
 ///
 /// [1] - http://gcc.gnu.org/projects/tree-ssa/vectorization.html

 /// Array extents: N elements per 1-D buffer, M rows for the 2-D cases,
 /// K outputs for example14.
 #define N 1024
 #define M 32
 #define K 4
 /// Shorthand for requesting 16-byte alignment on a declaration.
 #define ALIGNED16 __attribute__((aligned(16)))

 /// Global buffers that the kernels below read and write.
 unsigned short usa[N];
 short sa[N];
 short sb[N];
 short sc[N];
 unsigned int   ua[N];
 int   ia[N] ALIGNED16;
 int   ib[N] ALIGNED16;
 int   ic[N] ALIGNED16;
 unsigned int ub[N];
 unsigned int uc[N];
 float fa[N], fb[N];
 float da[N], db[N], dc[N], dd[N];
 int dj[N];

 /// Struct wrapping an int array; example5 writes to it through a pointer.
 struct A {
   int ca[N];
 } s;

 /// Double-length (2*N) int buffers shared by most kernels.
 int a[N*2] ALIGNED16;
 int b[N*2] ALIGNED16;
 int c[N*2] ALIGNED16;
 int d[N*2] ALIGNED16;

 /// Element-wise sum over the first 256 entries of the global arrays:
 /// a[i] = b[i] + c[i]. The trip count is a compile-time constant.
 __attribute__((noinline))
 void example1 () {
   int i;

   for (i=0; i<256; i++){
     a[i] = b[i] + c[i];
   }
 }

 /// Fills b[0 .. n-1] with the value x. The trip count n is a run-time value.
 __attribute__((noinline))
 void example2a (int n, int x) {
    int i;

    /* feature: support for unknown loop bound  */
    /* feature: support for loop invariants  */
    for (i=0; i<n; i++) {
       b[i] = x;
    }
 }

 /// Stores b[i] & c[i] into a[i] for i = 0 .. n-1, counting n down to zero.
 /// The parameter x is not used by the body. main() calls this with n == N.
 __attribute__((noinline))
 void example2b (int n, int x) {
   int i = 0;
    /* feature: general loop exit condition  */
    /* feature: support for bitwise operations  */
    while (n--){
       a[i] = b[i]&c[i]; i++;
    }
 }


 /// int whose type is declared with 16-byte alignment.
 typedef int aint __attribute__ ((__aligned__(16)));
 /// Copies n ints from q to p through restrict-qualified pointers to aint,
 /// advancing both pointers as it goes.
 __attribute__((noinline))
 void example3 (int n, aint * __restrict__ p, aint * __restrict q) {

    /* feature: support for (aligned) pointer accesses.  */
    while (n--){
       *p++ = *q++;
    }
 }

 /// Stores q[i] + 5 into p[i] for n elements, advancing both pointers.
 __attribute__((noinline))
 void example4a (int n, aint * __restrict__ p, aint * __restrict__ q) {
    /* feature: support for (aligned) pointer accesses  */
    /* feature: support for constants  */
    while (n--){
       *p++ = *q++ + 5;
    }
 }

 /// Stores b[i+1] + c[i+3] into a[i] for i = 0 .. n-1 (global arrays).
 /// The pointer parameters p and q are not used by the body.
 __attribute__((noinline))
 void example4b (int n, aint * __restrict__ p, aint * __restrict__ q) {
    int i;

    /* feature: support for read accesses with a compile time known misalignment  */
    for (i=0; i<n; i++){
       a[i] = b[i+1] + c[i+3];
    }
 }

 /// For i = 0 .. n-1, writes MAX (4) to b[i] when a[i] > MAX and 0 otherwise.
 /// The pointer parameters p and q are not used by the body.
 __attribute__((noinline))
 void example4c (int n, aint * __restrict__ p, aint * __restrict__ q) {
    int i;
     const int MAX = 4;
    /* feature: support for if-conversion  */
    for (i=0; i<n; i++){
       int j = a[i];
       b[i] = (j > MAX ? MAX : 0);
    }
 }

 /// Sets s->ca[0 .. n-1] to 5. The parameter s hides the global object s.
 __attribute__((noinline))
 void  example5 (int n, struct A *s) {
   int i;
   for (i = 0; i < n; i++) {
     /* feature: support for alignable struct access  */
     s->ca[i] = 5;
   }
 }

 /// Copies b[x .. x+N-1] into a[0 .. N-1]; the read offset x is a run-time value.
 __attribute__((noinline))
 void  example7 (int x) {
    int i;

    /* feature: support for read accesses with an unknown misalignment  */
    for (i=0; i<N; i++){
       a[i] = b[i+x];
    }
 }

 /// M x N grid written by example8.
 int G[M][N];
 /// Sets every element of the global M x N array G to x.
 __attribute__((noinline))
 void example8 (int x) {
    int i,j;

    /* feature: support for multidimensional arrays  */
    for (i=0; i<M; i++) {
      for (j=0; j<N; j++) {
        G[i][j] = x;
      }
    }
 }


 /// Sums ub[i] - uc[i] over all N elements in unsigned arithmetic and
 /// stores the total in *ret.
 __attribute__((noinline))
 void example9 (unsigned *ret) {
   int i;

   /* feature: support summation reduction.
      note: in case of floats use -funsafe-math-optimizations  */
   unsigned int diff = 0;
   for (i = 0; i < N; i++) {
     diff += (ub[i] - uc[i]);
   }

   *ret = diff;
 }


 /* feature: support data-types of different sizes.
    Currently only a single vector-size per target is supported;
    it can accommodate n elements such that n = vector-size/element-size
    (e.g, 4 ints, 8 shorts, or 16 chars for a vector of size 16 bytes).
    A combination of data-types of different sizes in the same loop
    requires special handling. This support is now present in mainline,
    and also includes support for type conversions.  */
 /// Performs two element-wise sums over N elements in one loop:
 /// ia = ib + ic on ints and sa = sb + sc on shorts (the short sum is
 /// narrowed back to short on store). The parameters hide the globals of
 /// the same names.
 __attribute__((noinline))
 void example10a(short *__restrict__ sa, short *__restrict__ sb, short *__restrict__ sc, int* __restrict__ ia, int* __restrict__ ib, int* __restrict__ ic) {
   int i;
   for (i = 0; i < N; i++) {
     ia[i] = ib[i] + ic[i];
     sa[i] = sb[i] + sc[i];
   }
 }

 /// Widens each of the N shorts in sb to int and stores it in ia.
 /// sa, sc, ib and ic are not used by the body.
 __attribute__((noinline))
 void example10b(short *__restrict__ sa, short *__restrict__ sb, short *__restrict__ sc, int* __restrict__ ia, int* __restrict__ ib, int* __restrict__ ic) {
   int i;
   for (i = 0; i < N; i++) {
     ia[i] = (int) sb[i];
   }
 }

 /* feature: support strided accesses - the data elements
    that are to be operated upon in parallel are not consecutive - they
    are accessed with a stride > 1 (in the example, the stride is 2):  */
 /// Reads b and c as N/2 adjacent (even, odd) pairs and writes, per pair,
 /// a[i] = b[2i+1]*c[2i+1] - b[2i]*c[2i] and
 /// d[i] = b[2i]*c[2i+1] + b[2i+1]*c[2i].
 __attribute__((noinline))
 void example11() {
    int i;
   for (i = 0; i < N/2; i++){
     a[i] = b[2*i+1] * c[2*i+1] - b[2*i] * c[2*i];
     d[i] = b[2*i] * c[2*i+1] + b[2*i+1] * c[2*i];
   }
 }


 /// Writes the index sequence 0, 1, ..., N-1 into a[0 .. N-1].
 __attribute__((noinline))
 void example12() {
   for (int i = 0; i < N; i++) {
     a[i] = i;
   }
 }

 /// For each of M rows, sums A[i][j] - B[i][j] over every 8th column
 /// (j = 0, 8, ... while j < N) and stores the row total in out[i].
 __attribute__((noinline))
 void example13(int **A, int **B, int *out) {
   int i,j;
   for (i = 0; i < M; i++) {
     int diff = 0;
     for (j = 0; j < N; j+=8) {
       diff += (A[i][j] - B[i][j]);
     }
     out[i] = diff;
   }
 }

 /// For each k in [0, K), accumulates in[i+k][j] * coeff[i][j] over all
 /// j < M and i < N and stores the sum in out[k]. Rows of `in` are indexed
 /// up to N+K-2.
 __attribute__((noinline))
 void example14(int **in, int **coeff, int *out) {
   int k,j,i=0;
   for (k = 0; k < K; k++) {
     int sum = 0;
     for (j = 0; j < M; j++)
       for (i = 0; i < N; i++)
           sum += in[i+k][j] * coeff[i][j];

     out[k] = sum;
   }

 }


 /// Sums b[n-1] down to b[0] (reverse order) and stores the total in b[0].
 /// The local a and the parameter b hide the globals of the same names.
 __attribute__((noinline))
 void example21(int *b, int n) {
   int i, a = 0;

   for (i = n-1; i >= 0; i--)
     a += b[i];

   b[0] = a;
 }

 /// Reads 256 unsigned shorts from src, shifts each left by 7 bits, and
 /// writes the results to 256 consecutive unsigned ints starting at dst.
 __attribute__((noinline))
 void example23 (unsigned short *src, unsigned int *dst)
 {
   int i;

   for (i = 0; i < 256; i++)
     *dst++ = *src++ << 7;
 }


 /// Sets ic[i] to x where fa[i] < fb[i] and to y otherwise, for all N elements.
 __attribute__((noinline))
 void example24 (short x, short y)
 {
   int i;
   for (i = 0; i < N; i++)
     ic[i] = fa[i] < fb[i] ? x : y;
 }


 /// Sets dj[i] to 1 when both da[i] < db[i] and dc[i] < dd[i] hold, else 0,
 /// for all N elements. Each comparison result is held in a char temporary
 /// before the two are ANDed.
 __attribute__((noinline))
 void example25 (void)
 {
   int i;
   char x, y;
   for (i = 0; i < N; i++)
     {
       x = (da[i] < db[i]);
       y = (dc[i] < dd[i]);
       dj[i] = x & y;
     }
 }

 /// Fills the byte range [start, end) with a deterministic pseudo-random
 /// pattern. The generator is an 8-bit state seeded with 1 and advanced by
 /// multiply-by-7, XOR 0x27, add 1 before each byte is written.
 void init_memory(void *start, void* end) {
   unsigned char *cursor = (unsigned char*)start;
   unsigned char *const limit = (unsigned char*)end;
   unsigned char seed = 1;
   for (; cursor != limit; ++cursor) {
     // Truncating once at the end gives the same low 8 bits as truncating
     // after every step.
     seed = (unsigned char)(((seed * 7) ^ 0x27) + 1);
     *cursor = seed;
   }
 }

 /// Fills [start, end) with a geometric sequence: the first element is
 /// 1.0 * 1.1 and each following element is the previous one times 1.1.
 /// The multiplication is done in double and rounded back to float.
 void init_memory_float(float *start, float* end) {
   float value = 1.0;
   for (float *slot = start; slot != end; ++slot) {
     value = (float)(value * 1.1);
     *slot = value;
   }
 }

 /// Folds the bytes in [start, end) into an unsigned checksum and returns it.
 /// Starting from 1, each byte triples the running value, XORs the byte in,
 /// then mixes the value by XORing its 8-bit right and left shifts.
 /// An empty range yields 1.
 unsigned digest_memory(void *start, void* end) {
   unsigned hash = 1;
   const unsigned char *const limit = (const unsigned char*)end;
   for (const unsigned char *p = (const unsigned char*)start; p != limit; ++p) {
     hash = (hash * 3) ^ *p;
     hash = (hash >> 8) ^ (hash << 8);
   }
   return hash;
 }

 /// Scope timer: records the wall-clock time at construction and, at
 /// destruction, optionally prints "<title>, <elapsed>, msec" to std::cout.
 class Timer {

 public:
   /// title is stored by pointer (not copied); print selects whether the
   /// destructor emits the timing line.
   Timer(const char* title, bool print)
     : Title(title), Print(print) {
     gettimeofday(&Start, 0);
   }

   ~Timer() {
     gettimeofday(&End, 0);
     const long secs = End.tv_sec  - Start.tv_sec;
     const long usecs = End.tv_usec - Start.tv_usec;
     // Elapsed milliseconds, rounded by adding 0.5 before truncation.
     const long elapsedMs = (secs*1000 + usecs/1000.0)+0.5;
     if (!Print)
       return;
     std::cout<<Title<<", "<<elapsedMs<<", msec\n";
   }

 private:
   const char* Title;
   bool Print;
   struct timeval Start, End;
 };


 // Warmup and then measure.
 // Expands to a block that runs RUN_LINE once untimed, then constructs a
 // Timer and runs RUN_LINE ITER more times. DIGEST_LINE is evaluated while
 // the Timer is still alive, and its value is appended to `results`.
 // The expansion uses `print_times` and `results` from the enclosing scope.
 #define BENCH(NAME, RUN_LINE, ITER, DIGEST_LINE) {\
   RUN_LINE;\
   Timer atimer(NAME, print_times);\
   for (int i=0; i < (ITER); ++i) RUN_LINE;\
   unsigned r = DIGEST_LINE;\
   results.push_back(r);\
  }

 /// Benchmark driver: seeds the global buffers, runs each kernel through
 /// BENCH, then prints the sum of the per-kernel digests followed by each
 /// digest, all in hexadecimal. Always returns 0.
 int main(int argc,char* argv[]){

   // Any extra command-line argument enables the per-kernel timing lines.
   bool print_times = argc > 1;

   // One digest per BENCH invocation, in execution order.
   std::vector<unsigned> results;
   unsigned dummy = 0;
   // Base iteration count; each BENCH line scales it.
 #ifdef SMALL_PROBLEM_SIZE
   const int Mi = 1<<10;
 #else
   const int Mi = 1<<18;
 #endif
   // Seed the integer buffers with the deterministic byte pattern.
   init_memory(&ia[0], &ia[N]);
   init_memory(&ib[0], &ib[N]);
   init_memory(&ic[0], &ic[N]);
   init_memory(&sa[0], &sa[N]);
   init_memory(&sb[0], &sb[N]);
   init_memory(&sc[0], &sc[N]);
   init_memory(&a[0], &a[N*2]);
   init_memory(&b[0], &b[N*2]);
   init_memory(&c[0], &c[N*2]);
   init_memory(&ua[0], &ua[N]);
   init_memory(&ub[0], &ub[N]);
   init_memory(&uc[0], &uc[N]);
   // Only row 0 of G is seeded; example8 overwrites every row before the
   // digest of row 0 is taken.
   init_memory(&G[0][0], &G[0][N]);
   // Seed the float buffers with the geometric sequence.
   init_memory_float(&fa[0], &fa[N]);
   init_memory_float(&fb[0], &fb[N]);
   init_memory_float(&da[0], &da[N]);
   init_memory_float(&db[0], &db[N]);
   init_memory_float(&dc[0], &dc[N]);
   init_memory_float(&dd[0], &dd[N]);

   BENCH("Example1",   example1(), Mi*10, digest_memory(&a[0], &a[256]));
   BENCH("Example2a",  example2a(N, 2), Mi*4, digest_memory(&b[0], &b[N]));
   BENCH("Example2b",  example2b(N, 2), Mi*2, digest_memory(&a[0], &a[N]));
   BENCH("Example3",   example3(N, ia, ib), Mi*2, digest_memory(&ia[0], &ia[N]));
   BENCH("Example4a",  example4a(N, ia, ib), Mi*2, digest_memory(&ia[0], &ia[N]));
   BENCH("Example4b",  example4b(N-10, ia, ib), Mi*2, digest_memory(&ia[0], &ia[N]));
   BENCH("Example4c",  example4c(N, ia, ib), Mi*2, digest_memory(&ib[0], &ib[N]));
   BENCH("Example7",   example7(4), Mi*4, digest_memory(&a[0], &a[N]));
   BENCH("Example8",   example8(8), Mi/4, digest_memory(&G[0][0], &G[0][N]));
   BENCH("Example9",   example9(&dummy), Mi*2, dummy);
   BENCH("Example10a", example10a(sa,sb,sc,ia,ib,ic), Mi*2, digest_memory(&ia[0], &ia[N]) + digest_memory(&sa[0], &sa[N]));
   BENCH("Example10b", example10b(sa,sb,sc,ia,ib,ic), Mi*4, digest_memory(&ia[0], &ia[N]));
   BENCH("Example11",  example11(), Mi*2, digest_memory(&d[0], &d[N]));
   BENCH("Example12",  example12(), Mi*4, digest_memory(&a[0], &a[N]));
   //BENCH("Example21",  example21(ia, N), Mi*4, digest_memory(&ia[0], &ia[N]));
   // NOTE(review): this digests usa (the kernel's input) rather than ua
   // (its output). Left as-is because changing it would alter the printed
   // results.
   BENCH("Example23",  example23(usa,ua), Mi*8, digest_memory(&usa[0], &usa[256]));
   BENCH("Example24",  example24(2,4), Mi*2, 0);
   BENCH("Example25",  example25(), Mi*2, digest_memory(&dj[0], &dj[N]));

   std::cout<<std::hex;
   // Seed with 0u so the digests are summed as unsigned (wrapping on
   // overflow) instead of being narrowed to int after every addition.
   std::cout<<"Results: ("<<std::accumulate(results.begin(), results.end(), 0u)<<"):";
   for (unsigned i=0; i < results.size(); ++i) {
     std::cout<<" "<<results[i];
   }
   std::cout<<"\n";

   return 0;
 }
	#include <sys/time.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <iostream>
	#include <vector>
	#include <numeric>

	/// This test contains some of the loops from the GCC vectorizer example page [1].
	/// Dorit Nuzman who developed the gcc vectorizer said that we can use them in our test suite.
	///
	/// [1] - http://gcc.gnu.org/projects/tree-ssa/vectorization.html

	/// Array extents: N elements per 1-D buffer, M rows for the 2-D cases,
	/// K outputs for example14.
	#define N 1024
	#define M 32
	#define K 4
	/// Shorthand for requesting 16-byte alignment on a declaration.
	#define ALIGNED16 __attribute__((aligned(16)))

	/// Global buffers that the kernels below read and write.
	unsigned short usa[N];
	short sa[N];
	short sb[N];
	short sc[N];
	unsigned int ua[N];
	int ia[N] ALIGNED16;
	int ib[N] ALIGNED16;
	int ic[N] ALIGNED16;
	unsigned int ub[N];
	unsigned int uc[N];
	float fa[N], fb[N];
	float da[N], db[N], dc[N], dd[N];
	int dj[N];

	/// Struct wrapping an int array; example5 writes to it through a pointer.
	struct A {
	int ca[N];
	} s;

	/// Double-length (2*N) int buffers shared by most kernels.
	int a[N*2] ALIGNED16;
	int b[N*2] ALIGNED16;
	int c[N*2] ALIGNED16;
	int d[N*2] ALIGNED16;

	/// Element-wise sum over the first 256 entries of the global arrays:
	/// a[i] = b[i] + c[i]. The trip count is a compile-time constant.
	__attribute__((noinline))
	void example1 () {
	int i;

	for (i=0; i<256; i++){
	a[i] = b[i] + c[i];
	}
	}

	/// Fills b[0 .. n-1] with the value x. The trip count n is a run-time value.
	__attribute__((noinline))
	void example2a (int n, int x) {
	int i;

	/* feature: support for unknown loop bound */
	/* feature: support for loop invariants */
	for (i=0; i<n; i++) {
	b[i] = x;
	}
	}

	/// Stores b[i] & c[i] into a[i] for i = 0 .. n-1, counting n down to zero.
	/// The parameter x is not used by the body. main() calls this with n == N.
	__attribute__((noinline))
	void example2b (int n, int x) {
	int i = 0;
	/* feature: general loop exit condition */
	/* feature: support for bitwise operations */
	while (n--){
	a[i] = b[i]&c[i]; i++;
	}
	}


	/// int whose type is declared with 16-byte alignment.
	typedef int aint __attribute__ ((__aligned__(16)));
	/// Copies n ints from q to p through restrict-qualified pointers to aint,
	/// advancing both pointers as it goes.
	__attribute__((noinline))
	void example3 (int n, aint * __restrict__ p, aint * __restrict q) {

	  /* feature: support for (aligned) pointer accesses. */
	  while (n--){
	    // Copy one element, then step both pointers.
	    *p++ = *q++;
	  }
	}

	/// Stores q[i] + 5 into p[i] for n elements, advancing both pointers.
	__attribute__((noinline))
	void example4a (int n, aint * __restrict__ p, aint * __restrict__ q) {
	  /* feature: support for (aligned) pointer accesses */
	  /* feature: support for constants */
	  while (n--){
	    // Read one element, add the constant, store, then step both pointers.
	    *p++ = *q++ + 5;
	  }
	}

	/// Stores b[i+1] + c[i+3] into a[i] for i = 0 .. n-1 (global arrays).
	/// The pointer parameters p and q are not used by the body.
	__attribute__((noinline))
	void example4b (int n, aint * __restrict__ p, aint * __restrict__ q) {
	int i;

	/* feature: support for read accesses with a compile time known misalignment */
	for (i=0; i<n; i++){
	a[i] = b[i+1] + c[i+3];
	}
	}

	/// For i = 0 .. n-1, writes MAX (4) to b[i] when a[i] > MAX and 0 otherwise.
	/// The pointer parameters p and q are not used by the body.
	__attribute__((noinline))
	void example4c (int n, aint * __restrict__ p, aint * __restrict__ q) {
	int i;
	const int MAX = 4;
	/* feature: support for if-conversion */
	for (i=0; i<n; i++){
	int j = a[i];
	b[i] = (j > MAX ? MAX : 0);
	}
	}

	/// Sets s->ca[0 .. n-1] to 5. The parameter s hides the global object s.
	__attribute__((noinline))
	void example5 (int n, struct A *s) {
	int i;
	for (i = 0; i < n; i++) {
	/* feature: support for alignable struct access */
	s->ca[i] = 5;
	}
	}

	/// Copies b[x .. x+N-1] into a[0 .. N-1]; the read offset x is a run-time value.
	__attribute__((noinline))
	void example7 (int x) {
	int i;

	/* feature: support for read accesses with an unknown misalignment */
	for (i=0; i<N; i++){
	a[i] = b[i+x];
	}
	}

	/// M x N grid written by example8.
	int G[M][N];
	/// Sets every element of the global M x N array G to x.
	__attribute__((noinline))
	void example8 (int x) {
	int i,j;

	/* feature: support for multidimensional arrays */
	for (i=0; i<M; i++) {
	for (j=0; j<N; j++) {
	G[i][j] = x;
	}
	}
	}


	/// Sums ub[i] - uc[i] over all N elements in unsigned arithmetic and
	/// stores the total in *ret.
	__attribute__((noinline))
	void example9 (unsigned *ret) {
	int i;

	/* feature: support summation reduction.
	note: in case of floats use -funsafe-math-optimizations */
	unsigned int diff = 0;
	for (i = 0; i < N; i++) {
	diff += (ub[i] - uc[i]);
	}

	*ret = diff;
	}


	/* feature: support data-types of different sizes.
	   Currently only a single vector-size per target is supported;
	   it can accommodate n elements such that n = vector-size/element-size
	   (e.g, 4 ints, 8 shorts, or 16 chars for a vector of size 16 bytes).
	   A combination of data-types of different sizes in the same loop
	   requires special handling. This support is now present in mainline,
	   and also includes support for type conversions. */
	/// Performs two element-wise sums over N elements in one loop:
	/// ia = ib + ic on ints and sa = sb + sc on shorts (the short sum is
	/// narrowed back to short on store). The parameters hide the globals of
	/// the same names.
	__attribute__((noinline))
	void example10a(short *__restrict__ sa, short *__restrict__ sb, short *__restrict__ sc, int* __restrict__ ia, int* __restrict__ ib, int* __restrict__ ic) {
	  int i;
	  for (i = 0; i < N; i++) {
	    ia[i] = ib[i] + ic[i];
	    sa[i] = sb[i] + sc[i];
	  }
	}

	/// Widens each of the N shorts in sb to int and stores it in ia.
	/// sa, sc, ib and ic are not used by the body.
	__attribute__((noinline))
	void example10b(short *__restrict__ sa, short *__restrict__ sb, short *__restrict__ sc, int* __restrict__ ia, int* __restrict__ ib, int* __restrict__ ic) {
	  int i;
	  for (i = 0; i < N; i++) {
	    ia[i] = (int) sb[i];
	  }
	}

	/* feature: support strided accesses - the data elements
	that are to be operated upon in parallel are not consecutive - they
	are accessed with a stride > 1 (in the example, the stride is 2): */
	__attribute__((noinline))
	void example11() {
	int i;
	for (i = 0; i < N/2; i++){
	a[i] = b[2i+1] c[2i+1] - b[2i] * c[2*i];
	d[i] = b[2i] c[2i+1] + b[2i+1] * c[2*i];
	}
	}


	/// Writes the index sequence 0, 1, ..., N-1 into a[0 .. N-1].
	__attribute__((noinline))
	void example12() {
	for (int i = 0; i < N; i++) {
	a[i] = i;
	}
	}

	/// For each of M rows, sums A[i][j] - B[i][j] over every 8th column
	/// (j = 0, 8, ... while j < N) and stores the row total in out[i].
	/// A and B are arrays of row pointers.
	__attribute__((noinline))
	void example13(int **A, int **B, int *out) {
	  int i,j;
	  for (i = 0; i < M; i++) {
	    int diff = 0;
	    for (j = 0; j < N; j+=8) {
	      diff += (A[i][j] - B[i][j]);
	    }
	    out[i] = diff;
	  }
	}

	/// For each k in [0, K), accumulates in[i+k][j] * coeff[i][j] over all
	/// j < M and i < N and stores the sum in out[k]. `in` and `coeff` are
	/// arrays of row pointers; rows of `in` are indexed up to N+K-2.
	__attribute__((noinline))
	void example14(int **in, int **coeff, int *out) {
	  int k,j,i=0;
	  for (k = 0; k < K; k++) {
	    int sum = 0;
	    for (j = 0; j < M; j++)
	      for (i = 0; i < N; i++)
	        sum += in[i+k][j] * coeff[i][j];

	    out[k] = sum;
	  }

	}


	/// Sums b[n-1] down to b[0] (reverse order) and stores the total in b[0].
	/// The local a and the parameter b hide the globals of the same names.
	__attribute__((noinline))
	void example21(int *b, int n) {
	int i, a = 0;

	for (i = n-1; i >= 0; i--)
	a += b[i];

	b[0] = a;
	}

	/// Reads 256 unsigned shorts from src, shifts each left by 7 bits, and
	/// writes the results to 256 consecutive unsigned ints starting at dst.
	__attribute__((noinline))
	void example23 (unsigned short *src, unsigned int *dst)
	{
	  int i;

	  for (i = 0; i < 256; i++)
	    *dst++ = *src++ << 7;
	}


	/// Sets ic[i] to x where fa[i] < fb[i] and to y otherwise, for all N elements.
	__attribute__((noinline))
	void example24 (short x, short y)
	{
	int i;
	for (i = 0; i < N; i++)
	ic[i] = fa[i] < fb[i] ? x : y;
	}


	/// Sets dj[i] to 1 when both da[i] < db[i] and dc[i] < dd[i] hold, else 0,
	/// for all N elements. Each comparison result is held in a char temporary
	/// before the two are ANDed.
	__attribute__((noinline))
	void example25 (void)
	{
	int i;
	char x, y;
	for (i = 0; i < N; i++)
	{
	x = (da[i] < db[i]);
	y = (dc[i] < dd[i]);
	dj[i] = x & y;
	}
	}

	/// Fills the byte range [start, end) with a deterministic pseudo-random
	/// pattern. The generator is an 8-bit state seeded with 1 and advanced by
	/// multiply-by-7, XOR 0x27, add 1 before each byte is written.
	void init_memory(void *start, void* end) {
	  unsigned char state = 1;
	  while (start != end) {
	    state *= 7; state ^= 0x27; state += 1;
	    *((unsigned char*)start) = state;
	    // void* has no arithmetic; step one byte through a char* cast.
	    start = ((char*)start) + 1;
	  }
	}

	/// Fills [start, end) with a geometric sequence: the first element is
	/// 1.0 * 1.1 and each following element is the previous one times 1.1.
	/// The multiplication is done in double and rounded back to float.
	void init_memory_float(float *start, float* end) {
	  float state = 1.0;
	  while (start != end) {
	    state *= 1.1;
	    *start = state;
	    start++;
	  }
	}

	/// Folds the bytes in [start, end) into an unsigned checksum and returns it.
	/// Starting from 1, each byte triples the running value, XORs the byte in,
	/// then mixes the value by XORing its 8-bit right and left shifts.
	/// An empty range yields 1.
	unsigned digest_memory(void *start, void* end) {
	  unsigned state = 1;
	  while (start != end) {
	    state *= 3;
	    state ^= *((unsigned char*)start);
	    state = (state >> 8 ^ state << 8);
	    // void* has no arithmetic; step one byte through a char* cast.
	    start = ((char*)start) + 1;
	  }
	  return state;
	}

	/// Scope timer: records the wall-clock time at construction and, at
	/// destruction, optionally prints "<title>, <elapsed>, msec" to std::cout.
	class Timer {

	public:
	  /// title is stored by pointer (not copied); print selects whether the
	  /// destructor emits the timing line.
	  Timer(const char* title, bool print)
	    : Title(title), Print(print) {
	    gettimeofday(&Start, 0);
	  }

	  ~Timer() {
	    gettimeofday(&End, 0);
	    const long secs = End.tv_sec - Start.tv_sec;
	    const long usecs = End.tv_usec - Start.tv_usec;
	    // Elapsed milliseconds, rounded by adding 0.5 before truncation.
	    const long elapsedMs = (secs*1000 + usecs/1000.0)+0.5;
	    if (!Print)
	      return;
	    std::cout<<Title<<", "<<elapsedMs<<", msec\n";
	  }

	private:
	  const char* Title;
	  bool Print;
	  struct timeval Start, End;
	};


	// Warmup and then measure.
	// Expands to a block that runs RUN_LINE once untimed, then constructs a
	// Timer and runs RUN_LINE ITER more times. DIGEST_LINE is evaluated while
	// the Timer is still alive, and its value is appended to `results`.
	// The expansion uses `print_times` and `results` from the enclosing scope.
	#define BENCH(NAME, RUN_LINE, ITER, DIGEST_LINE) {\
	RUN_LINE;\
	Timer atimer(NAME, print_times);\
	for (int i=0; i < (ITER); ++i) RUN_LINE;\
	unsigned r = DIGEST_LINE;\
	results.push_back(r);\
	}

	/// Benchmark driver: seeds the global buffers, runs each kernel through
	/// BENCH, then prints the sum of the per-kernel digests followed by each
	/// digest, all in hexadecimal. Always returns 0.
	int main(int argc,char* argv[]){

	  // Any extra command-line argument enables the per-kernel timing lines.
	  bool print_times = argc > 1;

	  // One digest per BENCH invocation, in execution order.
	  std::vector<unsigned> results;
	  unsigned dummy = 0;
	  // Base iteration count; each BENCH line scales it.
	#ifdef SMALL_PROBLEM_SIZE
	  const int Mi = 1<<10;
	#else
	  const int Mi = 1<<18;
	#endif
	  // Seed the integer buffers with the deterministic byte pattern.
	  init_memory(&ia[0], &ia[N]);
	  init_memory(&ib[0], &ib[N]);
	  init_memory(&ic[0], &ic[N]);
	  init_memory(&sa[0], &sa[N]);
	  init_memory(&sb[0], &sb[N]);
	  init_memory(&sc[0], &sc[N]);
	  init_memory(&a[0], &a[N*2]);
	  init_memory(&b[0], &b[N*2]);
	  init_memory(&c[0], &c[N*2]);
	  init_memory(&ua[0], &ua[N]);
	  init_memory(&ub[0], &ub[N]);
	  init_memory(&uc[0], &uc[N]);
	  // Only row 0 of G is seeded; example8 overwrites every row before the
	  // digest of row 0 is taken.
	  init_memory(&G[0][0], &G[0][N]);
	  // Seed the float buffers with the geometric sequence.
	  init_memory_float(&fa[0], &fa[N]);
	  init_memory_float(&fb[0], &fb[N]);
	  init_memory_float(&da[0], &da[N]);
	  init_memory_float(&db[0], &db[N]);
	  init_memory_float(&dc[0], &dc[N]);
	  init_memory_float(&dd[0], &dd[N]);

	  BENCH("Example1", example1(), Mi*10, digest_memory(&a[0], &a[256]));
	  BENCH("Example2a", example2a(N, 2), Mi*4, digest_memory(&b[0], &b[N]));
	  BENCH("Example2b", example2b(N, 2), Mi*2, digest_memory(&a[0], &a[N]));
	  BENCH("Example3", example3(N, ia, ib), Mi*2, digest_memory(&ia[0], &ia[N]));
	  BENCH("Example4a", example4a(N, ia, ib), Mi*2, digest_memory(&ia[0], &ia[N]));
	  BENCH("Example4b", example4b(N-10, ia, ib), Mi*2, digest_memory(&ia[0], &ia[N]));
	  BENCH("Example4c", example4c(N, ia, ib), Mi*2, digest_memory(&ib[0], &ib[N]));
	  BENCH("Example7", example7(4), Mi*4, digest_memory(&a[0], &a[N]));
	  BENCH("Example8", example8(8), Mi/4, digest_memory(&G[0][0], &G[0][N]));
	  BENCH("Example9", example9(&dummy), Mi*2, dummy);
	  BENCH("Example10a", example10a(sa,sb,sc,ia,ib,ic), Mi*2, digest_memory(&ia[0], &ia[N]) + digest_memory(&sa[0], &sa[N]));
	  BENCH("Example10b", example10b(sa,sb,sc,ia,ib,ic), Mi*4, digest_memory(&ia[0], &ia[N]));
	  BENCH("Example11", example11(), Mi*2, digest_memory(&d[0], &d[N]));
	  BENCH("Example12", example12(), Mi*4, digest_memory(&a[0], &a[N]));
	  //BENCH("Example21", example21(ia, N), Mi*4, digest_memory(&ia[0], &ia[N]));
	  // NOTE(review): this digests usa (the kernel's input) rather than ua
	  // (its output). Left as-is because changing it would alter the printed
	  // results.
	  BENCH("Example23", example23(usa,ua), Mi*8, digest_memory(&usa[0], &usa[256]));
	  BENCH("Example24", example24(2,4), Mi*2, 0);
	  BENCH("Example25", example25(), Mi*2, digest_memory(&dj[0], &dj[N]));

	  std::cout<<std::hex;
	  // Seed with 0u so the digests are summed as unsigned (wrapping on
	  // overflow) instead of being narrowed to int after every addition.
	  std::cout<<"Results: ("<<std::accumulate(results.begin(), results.end(), 0u)<<"):";
	  for (unsigned i=0; i < results.size(); ++i) {
	    std::cout<<" "<<results[i];
	  }
	  std::cout<<"\n";

	  return 0;
	}