/*
 * dct.cpp
 *
 *  Created on: Feb 24, 2013
 *      Author: nick
 */

#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#include <mmintrin.h>
#include <x86intrin.h>

/*
 * Flag shared between the SIGALRM handler and the polling loops.
 * It is 0 while a timer armed by set_alarm() is pending and 1 once the
 * signal has been delivered.  It is initialised to 1, so a polling loop
 * entered before any timer is armed terminates immediately.
 *
 * sig_atomic_t is the integer type that can be written from a signal
 * handler and read from normal code without tearing.
 */
volatile sig_atomic_t alarmed = 1;

/*
 * Signal handler: records that the signal arrived and re-installs itself.
 *
 * signum - number of the signal being delivered.
 */
static void sighandler( int signum )
{
    alarmed = 1;

    /* Some signal() implementations reset the disposition to SIG_DFL on
     * delivery; installing the handler again keeps it active either way. */
    signal( signum, sighandler );
}

/*
 * Arm a one-shot timer.
 *
 * Clears the `alarmed` flag, installs sighandler() for SIGALRM and then
 * requests delivery of SIGALRM after `seconds` seconds.  Callers poll
 * `alarmed` to find out when the time is up.
 *
 * seconds - delay before SIGALRM is raised; it is converted to the
 *           unsigned argument type of alarm().
 */
void set_alarm( int seconds )
{
    const unsigned int delay = (unsigned int) seconds;

    /* The flag is cleared before the timer is started so an expiry can
     * never be overwritten by the reset. */
    alarmed = 0;
    signal( SIGALRM, sighandler );
    alarm( delay );
}

/*
 * Read the x86 time-stamp counter with the RDTSC instruction.
 *
 * RDTSC returns the low 32 bits of the counter in EAX and the high 32 bits
 * in EDX.  The two halves are combined in a 64-bit temporary, so the shift
 * is well defined even where `unsigned long` is only 32 bits wide; in that
 * case the result is truncated to the low half.
 *
 * Returns the counter value converted to unsigned long.
 */
unsigned long hardclock( void )
{
    uint32_t lo, hi;

    /* volatile: each call must really execute RDTSC.  Without it the
     * compiler may merge two reads or move one out of the timed region. */
    __asm__ __volatile__( "rdtsc" : "=a" (lo), "=d" (hi) );

    return (unsigned long)( ( (uint64_t) hi << 32 ) | lo );
}

/*
 * In-place 4x4 sum/difference (Hadamard-style) butterfly transform.
 *
 * d - 16 coefficients stored row-major; overwritten with the result.
 *
 * Pass 1 applies the 4-point butterfly to every row of d[] and stores the
 * result transposed in a scratch buffer.  Pass 2 applies the same
 * butterfly to every row of the scratch buffer and writes each value back
 * as (x + 1) >> 1.
 */
static void dct4x4dc( int16_t d[16] )
{
    int16_t stage[16];
    int row;

    /* Pass 1: rows of d[] -> columns of stage[]. */
    for( row = 0; row < 4; row++ )
    {
        const int16_t *src = d + row * 4;

        const int sum_lo  = src[0] + src[1];
        const int diff_lo = src[0] - src[1];
        const int sum_hi  = src[2] + src[3];
        const int diff_hi = src[2] - src[3];

        stage[ 0 + row] = (int16_t)( sum_lo  + sum_hi  );
        stage[ 4 + row] = (int16_t)( sum_lo  - sum_hi  );
        stage[ 8 + row] = (int16_t)( diff_lo - diff_hi );
        stage[12 + row] = (int16_t)( diff_lo + diff_hi );
    }

    /* Pass 2: rows of stage[] -> rows of d[], with rounding and halving. */
    for( row = 0; row < 4; row++ )
    {
        const int16_t *src = stage + row * 4;
        int16_t       *dst = d + row * 4;

        const int sum_lo  = src[0] + src[1];
        const int diff_lo = src[0] - src[1];
        const int sum_hi  = src[2] + src[3];
        const int diff_hi = src[2] - src[3];

        dst[0] = (int16_t)( ( sum_lo  + sum_hi  + 1 ) >> 1 );
        dst[1] = (int16_t)( ( sum_lo  - sum_hi  + 1 ) >> 1 );
        dst[2] = (int16_t)( ( diff_lo - diff_hi + 1 ) >> 1 );
        dst[3] = (int16_t)( ( diff_lo + diff_hi + 1 ) >> 1 );
    }
}


/*
 * Print a 64-bit MMX value twice, each line prefixed with `msg` padded to
 * 40 columns: first as 8 hexadecimal bytes in memory order, then as four
 * signed 16-bit lanes in decimal.
 *
 * msg - label printed at the start of both lines (not modified).
 * m   - value to dump.
 */
void printWithMsg(const char* msg, __m64 m)
{
	unsigned char bytes[sizeof m];
	int16_t lanes[sizeof m / sizeof(int16_t)];

	/* Copy the object representation out instead of casting &m to other
	 * pointer types, so the reads do not depend on aliasing rules. */
	memcpy(bytes, &m, sizeof bytes);
	memcpy(lanes, &m, sizeof lanes);

	printf("%-40s[0x", msg);
	for (size_t i = 0; i < sizeof bytes; i ++)
	{
		printf("%02x", bytes[i]);
	}
	printf("]\n");

	printf("%-40s[", msg);
	for (size_t i = 0; i < sizeof lanes / sizeof lanes[0]; i ++)
	{
		printf("%02i,", lanes[i]);
	}
	printf("]\n");
}

/*
 * Print a 64-bit MMX value as 8 hexadecimal bytes in memory order,
 * prefixed with `msg` padded to 40 columns.
 *
 * msg - label printed at the start of the line (not modified).
 * m   - value to dump.
 */
void printHexWithMsg(const char* msg, __m64 m)
{
	unsigned char bytes[sizeof m];

	/* Work on a byte copy of the value rather than a cast of its address. */
	memcpy(bytes, &m, sizeof bytes);

	printf("%-40s[0x", msg);
	for (size_t i = 0; i < sizeof bytes; i ++)
	{
		printf("%02x", bytes[i]);
	}
	printf("]\n");
}

/*
 * Print a 64-bit MMX value as four signed 16-bit lanes in decimal, lowest
 * lane first, prefixed with `msg` padded to 40 columns.
 *
 * msg - label printed at the start of the line (not modified).
 * m   - value to dump.
 */
void printDecWithMsg(const char* msg, __m64 m)
{
	int16_t lanes[sizeof m / sizeof(int16_t)];

	/* Copy the lanes out instead of reading the __m64 through an int16_t
	 * pointer, so the reads do not depend on aliasing rules. */
	memcpy(lanes, &m, sizeof lanes);

	printf("%-40s[", msg);
	for (size_t i = 0; i < sizeof lanes / sizeof lanes[0]; i ++)
	{
		printf("%02i,", lanes[i]);
	}
	printf("]\n");
}


/*
 * Print the four rows of a 4x4 matrix of 16-bit values, one line per row,
 * labelled "I1" .. "I4".
 *
 * msg    - accepted for interface compatibility; it is not printed.
 * Matrix - matrix to dump (read only).
 */
void printMatrixWithMsg(const char* msg, int16_t Matrix[4][4])
{
	(void)msg;

	for (int i = 0; i < 4; i ++)
	{
		char label[32];
		__m64 row;

		snprintf(label, sizeof label, "I%d", i + 1);

		/* The rows are only guaranteed int16_t alignment, so copy the 8
		 * bytes instead of dereferencing the row as an __m64. */
		memcpy(&row, Matrix[i], sizeof row);
		printDecWithMsg(label, row);
	}
}

/*
 * Demo: lane-wise 16-bit addition of two MMX values with _mm_add_pi16,
 * dumping both operands and the sum.
 */
void test1()
{
	/* Lanes are listed lowest first, i.e. in memory order.  Constants with
	 * the top bit set are cast explicitly because they do not fit a signed
	 * 16-bit lane as written. */
	const __m64 v1 = _mm_setr_pi16((int16_t)0xffff, 0x0000, (int16_t)0xf0f0, 0x00ff);
	const __m64 v2 = _mm_setr_pi16(0x0f0f, 0x0ff0, (int16_t)0xff00, (int16_t)0xf00f);

	printWithMsg("v1", v1);
	printWithMsg("v2", v2);

	const __m64 v3 = _mm_add_pi16(v1, v2);
	printWithMsg("v3", v3);

	/* Leave MMX state so later x87 code finds the FPU registers free. */
	_mm_empty();
}

/*
 * Demo: transpose a 4x4 matrix of 16-bit values with two rounds of MMX
 * unpack instructions, printing the rows before, between and after the
 * rounds.  The result is written back with the non-temporal store
 * _mm_stream_pi.
 */
void test2()
{
	/* Rows are kept in __m64 objects so every load and store is correctly
	 * aligned; lanes are listed lowest first. */
	__m64 I1 = _mm_setr_pi16(0x0001, 0x0002, 0x0003, 0x0004);
	__m64 I2 = _mm_setr_pi16(0x0005, 0x0006, 0x0007, 0x0008);
	__m64 I3 = _mm_setr_pi16(0x0009, 0x000a, 0x000b, 0x000c);
	__m64 I4 = _mm_setr_pi16(0x000d, 0x000e, 0x000f, 0x0010);

	printDecWithMsg("I1", I1);
	printDecWithMsg("I2", I2);
	printDecWithMsg("I3", I3);
	printDecWithMsg("I4", I4);

	/* Round 1: interleave rows 1/3 and rows 2/4. */
	const __m64 T1 = _mm_unpacklo_pi16(I1, I3);
	const __m64 T2 = _mm_unpacklo_pi16(I2, I4);
	const __m64 T3 = _mm_unpackhi_pi16(I1, I3);
	const __m64 T4 = _mm_unpackhi_pi16(I2, I4);

	printDecWithMsg("T1", T1);
	printDecWithMsg("T2", T2);
	printDecWithMsg("T3", T3);
	printDecWithMsg("T4", T4);

	/* Round 2: interleave the intermediates to obtain the columns. */
	const __m64 O1 = _mm_unpacklo_pi16(T1, T2);
	const __m64 O2 = _mm_unpackhi_pi16(T1, T2);
	const __m64 O3 = _mm_unpacklo_pi16(T3, T4);
	const __m64 O4 = _mm_unpackhi_pi16(T3, T4);

	printDecWithMsg("O1", O1);
	printDecWithMsg("O2", O2);
	printDecWithMsg("O3", O3);
	printDecWithMsg("O4", O4);

	_mm_stream_pi(&I1, O1);
	_mm_stream_pi(&I2, O2);
	_mm_stream_pi(&I3, O3);
	_mm_stream_pi(&I4, O4);

	printDecWithMsg("I1", I1);
	printDecWithMsg("I2", I2);
	printDecWithMsg("I3", I3);
	printDecWithMsg("I4", I4);

	/* Leave MMX state so later x87 code finds the FPU registers free. */
	_mm_empty();
}

/* Number of transpose iterations timed by test3() and test4(). */
#define ROUND 10000

/*
 * 4x4 matrix of 16-bit values shared by the transpose benchmarks.
 *
 * The MMX variants read and write each 8-byte row through an __m64
 * pointer, so the array is explicitly aligned to 8 bytes; int16_t alone
 * only guarantees 2-byte alignment.
 */
int16_t Matrix[4][4] __attribute__((aligned(8))) =
{
	{ 0x0001, 0x0002, 0x0003, 0x0004 },
	{ 0x0005, 0x0006, 0x0007, 0x0008 },
	{ 0x0009, 0x000a, 0x000b, 0x000c },
	{ 0x000d, 0x000e, 0x000f, 0x0010 },
};

/*
 * Benchmark: transpose the global Matrix ROUND times with plain scalar
 * swaps and print the elapsed time-stamp-counter ticks.
 */
void test3()
{
	const unsigned long long tsc = hardclock();

	for (int counter = 0; counter < ROUND; counter ++)
	{
		/* Swap every element above the diagonal with its mirror image. */
		for (int i = 0; i < 4; i ++)
		{
			for (int j = i+1; j < 4; j ++)
			{
				int16_t swap = Matrix[i][j];
				Matrix[i][j] = Matrix[j][i];
				Matrix[j][i] = swap;
			}
		}
	}

	/* The difference is unsigned long long, so it is printed with %llu. */
	const unsigned long long elapsed = hardclock() - tsc;
	printf( "clock cycle is %9llu\n", elapsed);
}


/*
 * Benchmark: transpose the global Matrix ROUND times with MMX unpack
 * instructions, writing the rows back with the non-temporal store
 * _mm_stream_pi, and print the elapsed time-stamp-counter ticks.
 */
void test4()
{
	__m64 *rows = (__m64 *)Matrix;	/* each 8-byte row viewed as one MMX value */

	const unsigned long long tsc = hardclock();

	for (int counter=0; counter< ROUND; counter++)
	{
		/* Round 1: interleave rows 0/2 and rows 1/3. */
		__m64 T1 = _mm_unpacklo_pi16(rows[0], rows[2]);
		__m64 T2 = _mm_unpacklo_pi16(rows[1], rows[3]);
		__m64 T3 = _mm_unpackhi_pi16(rows[0], rows[2]);
		__m64 T4 = _mm_unpackhi_pi16(rows[1], rows[3]);

		/* Round 2: interleave the intermediates to obtain the columns. */
		__m64 O1 = _mm_unpacklo_pi16(T1, T2);
		__m64 O2 = _mm_unpackhi_pi16(T1, T2);
		__m64 O3 = _mm_unpacklo_pi16(T3, T4);
		__m64 O4 = _mm_unpackhi_pi16(T3, T4);

		_mm_stream_pi(&rows[0], O1);
		_mm_stream_pi(&rows[1], O2);
		_mm_stream_pi(&rows[2], O3);
		_mm_stream_pi(&rows[3], O4);
	}

	/* The difference is unsigned long long, so it is printed with %llu. */
	const unsigned long long elapsed = hardclock() - tsc;

	/* Leave MMX state (outside the timed region) before calling library code. */
	_mm_empty();

	printf( "clock cycle is %9llu\n", elapsed);
}



/*
 * Benchmark: count how many scalar transposes of the global Matrix
 * complete before the one-second alarm fires, and print the count.
 */
void test5()
{
	unsigned int counter;

	set_alarm(1);
	for (counter = 0; !alarmed; counter ++)
	{
		/* Swap every element above the diagonal with its mirror image. */
		for (int i = 0; i < 4; i ++)
		{
			for (int j = i+1; j < 4; j ++)
			{
				int16_t swap = Matrix[i][j];
				Matrix[i][j] = Matrix[j][i];
				Matrix[j][i] = swap;
			}
		}
	}

	/* counter is an unsigned int, so it is printed with %u. */
	printf( "running  %9u rounds\n",  counter);
}


/*
 * Benchmark: count how many MMX-unpack transposes of the global Matrix
 * complete before the one-second alarm fires, and print the count.  Rows
 * are written back with ordinary stores.
 */
void test6()
{
	__m64 *rows = (__m64 *)Matrix;	/* each 8-byte row viewed as one MMX value */
	unsigned int counter;

	set_alarm(1);
	for (counter = 0; !alarmed; counter ++)
	{
		/* Round 1: interleave rows 0/2 and rows 1/3. */
		__m64 T1 = _mm_unpacklo_pi16(rows[0], rows[2]);
		__m64 T2 = _mm_unpacklo_pi16(rows[1], rows[3]);
		__m64 T3 = _mm_unpackhi_pi16(rows[0], rows[2]);
		__m64 T4 = _mm_unpackhi_pi16(rows[1], rows[3]);

		/* Round 2: interleave the intermediates to obtain the columns. */
		rows[0] = _mm_unpacklo_pi16(T1, T2);
		rows[1] = _mm_unpackhi_pi16(T1, T2);
		rows[2] = _mm_unpacklo_pi16(T3, T4);
		rows[3] = _mm_unpackhi_pi16(T3, T4);
	}

	/* Leave MMX state before calling library code. */
	_mm_empty();

	/* counter is an unsigned int, so it is printed with %u. */
	printf( "running  %9u rounds\n",  counter);
}


/*
 * Demo: transpose a 4x4 matrix of 16-bit values with two rounds of MMX
 * unpack instructions, printing the rows before, between and after the
 * rounds.  The result is written back with ordinary assignments.
 */
void test7()
{
	/* Rows are kept in __m64 objects so every load and store is correctly
	 * aligned; lanes are listed lowest first. */
	__m64 I1 = _mm_setr_pi16(0x0001, 0x0002, 0x0003, 0x0004);
	__m64 I2 = _mm_setr_pi16(0x0005, 0x0006, 0x0007, 0x0008);
	__m64 I3 = _mm_setr_pi16(0x0009, 0x000a, 0x000b, 0x000c);
	__m64 I4 = _mm_setr_pi16(0x000d, 0x000e, 0x000f, 0x0010);

	printDecWithMsg("I1", I1);
	printDecWithMsg("I2", I2);
	printDecWithMsg("I3", I3);
	printDecWithMsg("I4", I4);

	/* Round 1: interleave rows 1/3 and rows 2/4. */
	const __m64 T1 = _mm_unpacklo_pi16(I1, I3);
	const __m64 T2 = _mm_unpacklo_pi16(I2, I4);
	const __m64 T3 = _mm_unpackhi_pi16(I1, I3);
	const __m64 T4 = _mm_unpackhi_pi16(I2, I4);

	printDecWithMsg("T1", T1);
	printDecWithMsg("T2", T2);
	printDecWithMsg("T3", T3);
	printDecWithMsg("T4", T4);

	/* Round 2: interleave the intermediates to obtain the columns. */
	I1 = _mm_unpacklo_pi16(T1, T2);
	I2 = _mm_unpackhi_pi16(T1, T2);
	I3 = _mm_unpacklo_pi16(T3, T4);
	I4 = _mm_unpackhi_pi16(T3, T4);

	printDecWithMsg("I1", I1);
	printDecWithMsg("I2", I2);
	printDecWithMsg("I3", I3);
	printDecWithMsg("I4", I4);

	/* Leave MMX state so later x87 code finds the FPU registers free. */
	_mm_empty();
}
/*
 * Entry point: runs the enabled experiments in order and exits with 0.
 */
int main()
{
	/*
	 * Currently disabled experiments (add a call here to run one):
	 *   test1(), test2(), test3(), test4(), test7()
	 */
	test5();
	test6();

	return 0;
}
