///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2004 Intel Corporation 
// All rights reserved. 
//
// Redistribution and use in source and binary forms, with or without 
// modification, are permitted provided that the following conditions are met: 
//
// * Redistributions of source code must retain the above copyright notice, 
// this list of conditions and the following disclaimer. 
// * Redistributions in binary form must reproduce the above copyright notice, 
// this list of conditions and the following disclaimer in the documentation 
// and/or other materials provided with the distribution. 
// * Neither name of Intel Corporation nor the names of its contributors 
// may be used to endorse or promote products derived from this software 
// without specific prior written permission.
// 
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR 
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "ipp.h"

// Benchmark: 4x4 matrix * 4-element vector over arrays of 1..32768 vectors,
// done three ways and timed with ippCoreGetCpuClocks():
//   - one ippmMul_mv_32f_4x4 call per vector           ("mv")
//   - a hand-written C loop computing all four rows    ("C")
//   - a single ippmMul_mva_32f_4x4 call for the array  ("mva")
//
// Timing slots in pTime:
//   pTime[i]      per-vector ippmMul_mv loop, 2^i vectors
//   pTime[i+16]   ippmMul_mva array call,     2^i vectors
//   pTime[i+32]   hand-written C loop,        2^i vectors
//
// Returns 0 on success, 1 if a working buffer could not be allocated.
int main(void)
{
	int i,j,k;
	int count;

	// Room for more than the largest run (2^15 vectors of 4 floats).
	const int MAX = (1<<16)+(1<<14);
	Ipp32f *pVecSrc = ippsMalloc_32f(MAX*4);
	// Identity matrix, stored row by row.
	Ipp32f pMatrix[16] = {
		1.0f, 0.0f, 0.0f, 0.0f,
		0.0f, 1.0f, 0.0f, 0.0f,
		0.0f, 0.0f, 1.0f, 0.0f,
		0.0f, 0.0f, 0.0f, 1.0f } ;
	Ipp32f *pVecDst = ippsMalloc_32f(MAX*4);
	Ipp64u start, pTime[64];

	// Bail out cleanly instead of dereferencing a failed allocation.
	if (!pVecSrc || !pVecDst)
	{
		printf("Error: unable to allocate vector buffers\n");
		if (pVecSrc) ippsFree(pVecSrc);
		if (pVecDst) ippsFree(pVecDst);
		return 1;
	}

	ippsSet_32f(1.0f, pVecSrc, MAX*4);

	// The whole measurement is run twice; the second pass overwrites the
	// timings of the first, so only the second pass is reported.
	for (j=0; j<2; j++)
	{
		for (i=0; i<16; i++)
		{
			count = 1<<i;

			// start = t2 + (t2 - t1): pushing the start stamp forward by
			// the cost of one back-to-back clock read removes roughly that
			// overhead from the elapsed time computed below.
			start = ippCoreGetCpuClocks();
			start = ippCoreGetCpuClocks() * 2 - start;
			for(k=0; k<count; k++)
			{
				// 16 = matrix row stride (presumably bytes: 4 floats).
				ippmMul_mv_32f_4x4(pMatrix, 16,
					&pVecSrc[k*4], &pVecDst[k*4]);
			}
			pTime[i] = ippCoreGetCpuClocks() - start;

			start = ippCoreGetCpuClocks();
			start = ippCoreGetCpuClocks() * 2 - start;
			for(k=0; k<count; k++)
			{
				// Each destination element is the full 4-term dot product
				// of one matrix row with the source vector.
				pVecDst[k*4] = pMatrix[0]*pVecSrc[k*4] +
					pMatrix[1]*pVecSrc[k*4+1] +
					pMatrix[2]*pVecSrc[k*4+2] +
					pMatrix[3]*pVecSrc[k*4+3];
				pVecDst[k*4+1] = pMatrix[4]*pVecSrc[k*4] +
					pMatrix[5]*pVecSrc[k*4+1] +
					pMatrix[6]*pVecSrc[k*4+2] +
					pMatrix[7]*pVecSrc[k*4+3];
				pVecDst[k*4+2] = pMatrix[8]*pVecSrc[k*4] +
					pMatrix[9]*pVecSrc[k*4+1] +
					pMatrix[10]*pVecSrc[k*4+2] +
					pMatrix[11]*pVecSrc[k*4+3];
				pVecDst[k*4+3] = pMatrix[12]*pVecSrc[k*4] +
					pMatrix[13]*pVecSrc[k*4+1] +
					pMatrix[14]*pVecSrc[k*4+2] +
					pMatrix[15]*pVecSrc[k*4+3];
			}
			pTime[i+32] = ippCoreGetCpuClocks() - start;

			start = ippCoreGetCpuClocks();
			start = ippCoreGetCpuClocks() * 2 - start;
			// Array form: one call processes all `count` vectors; the
			// source and destination strides of 16 step one 4-float
			// vector at a time (presumably bytes -- confirm against the
			// IPP reference).
			ippmMul_mva_32f_4x4(pMatrix, 16,
				pVecSrc, 16, pVecDst, 16, count);
			pTime[i+16] = ippCoreGetCpuClocks() - start;
		}
	}

	// Clock counts are narrowed to int for printing, as in the original.
	printf("Len:\tC:\tmv:\tmva:\n");
	for (i=0; i<16; i++)
		printf("%d,\t%d,\t%d,\t%d,\n", 1<<i,(int)pTime[i+32],
		(int)pTime[i], (int)pTime[i+16]);

	ippsFree(pVecSrc);
	ippsFree(pVecDst);

	return 0;
}
