///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2004 Intel Corporation 
// All rights reserved. 
//
// Redistribution and use in source and binary forms, with or without 
// modification, are permitted provided that the following conditions are met: 
//
// * Redistributions of source code must retain the above copyright notice, 
// this list of conditions and the following disclaimer. 
// * Redistributions in binary form must reproduce the above copyright notice, 
// this list of conditions and the following disclaimer in the documentation 
// and/or other materials provided with the distribution. 
// * Neither name of Intel Corporation nor the names of its contributors 
// may be used to endorse or promote products derived from this software 
// without specific prior written permission.
// 
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR 
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////

#include "stdafx.h"

#include <stdio.h>

#include "ipp.h"

/*
 * Times three ways of computing
 *     pVecDst[n] = pMatrix * pVecSrc1[n] + pMatrix * pVecSrc2[n]
 * over arrays of 4-element vectors, for array lengths 1, 2, 4, ... 2^15,
 * and prints the measured clock counts as comma-separated rows:
 *   "v"        - one ippm call per vector (mv / vv variants)
 *   "va"       - one ippm call per whole array (mva / vava variants)
 *   "va w/buf" - array calls on chunks of at most bufSize vectors, with
 *                every chunk reusing the start of the temporaries
 *
 * Returns 0 on success, 1 if any working buffer could not be allocated.
 */
int main(void)
{
	enum { NUM_SIZES = 16 };	/* lengths 2^0 .. 2^(NUM_SIZES-1) are timed */

	int i,j,k;
	int status = 1;

	/* Each buffer holds MAX 4-float vectors; the longest run uses 1<<15. */
	const int MAX = (1<<16)+(1<<14);
	const int bufSize = 128;	/* vectors per chunk in the "va w/buf" case */

	Ipp32f *pVecSrc1 = ippsMalloc_32f(MAX*4);
	Ipp32f *pVecSrc2 = ippsMalloc_32f(MAX*4);
	Ipp32f *pTmp1 = ippsMalloc_32f(MAX*4);
	Ipp32f *pTmp2 = ippsMalloc_32f(MAX*4);
	Ipp32f *pVecDst = ippsMalloc_32f(MAX*4);

	/* 4x4 identity matrix. */
	Ipp32f pMatrix[16] = {
		1.0f, 0.0f, 0.0f, 0.0f,
		0.0f, 1.0f, 0.0f, 0.0f,
		0.0f, 0.0f, 1.0f, 0.0f,
		0.0f, 0.0f, 0.0f, 1.0f } ;

	/*
	 * pTime[i]             : per-vector calls, length 1<<i
	 * pTime[i+NUM_SIZES]   : whole-array calls
	 * pTime[i+2*NUM_SIZES] : chunked array calls
	 */
	Ipp64u start, pTime[3*NUM_SIZES];

	if (pVecSrc1 && pVecSrc2 && pTmp1 && pTmp2 && pVecDst)
	{
		ippsSet_32f(1.0f, pVecSrc1, MAX*4);
		ippsSet_32f(2.0f, pVecSrc2, MAX*4);

		/*
		 * The whole measurement is run twice; the second pass overwrites
		 * the timings of the first (presumably so the reported numbers
		 * come from a warmed-up run).
		 */
		for (j=0; j<2; j++)
		{
			for (i=0; i<NUM_SIZES; i++)
			{
				int size = (1<<i);

				/*
				 * Two back-to-back clock reads t0, t1: start becomes
				 * t1 + (t1 - t0), so "end - start" below has the cost of
				 * one clock read already subtracted.
				 */
				start = ippCoreGetCpuClocks();
				start = ippCoreGetCpuClocks() * 2 - start;
				/*
				 * NOTE(review): 16 is passed for every stride argument;
				 * presumably the byte distance between consecutive rows /
				 * vectors of 4 floats - confirm against the ippm reference.
				 */
				for(k=0; k<size; k++)
				{
					ippmMul_mv_32f_4x4(pMatrix, 16,
						&pVecSrc1[k*4], &pTmp1[k*4]);
					ippmMul_mv_32f_4x4(pMatrix, 16,
						&pVecSrc2[k*4], &pTmp2[k*4]);
					ippmAdd_vv_32f_4x1( &pTmp1[k*4],
						&pTmp2[k*4], &pVecDst[k*4]);
				}
				pTime[i] = ippCoreGetCpuClocks() - start;

				/* Same work with one call per operation for the whole array. */
				start = ippCoreGetCpuClocks();
				start = ippCoreGetCpuClocks() * 2 - start;
				ippmMul_mva_32f_4x4(pMatrix, 16,
					pVecSrc1, 16, pTmp1, 16, size);
				ippmMul_mva_32f_4x4(pMatrix, 16,
					pVecSrc2, 16, pTmp2, 16, size);
				ippmAdd_vava_32f_4x1(pTmp1, 16,
					pTmp2, 16, pVecDst, 16, size);
				pTime[i+NUM_SIZES] = ippCoreGetCpuClocks() - start;

				/*
				 * Array calls again, but in chunks of at most bufSize
				 * vectors; each chunk writes its intermediates to the
				 * start of pTmp1/pTmp2.
				 */
				start = ippCoreGetCpuClocks();
				start = ippCoreGetCpuClocks() * 2 - start;
				for (k=0; k<size; k+=bufSize)
				{
					int chunk = size - k;
					if (chunk > bufSize)
					{
						chunk = bufSize;
					}
					ippmMul_mva_32f_4x4(pMatrix, 16,
						&pVecSrc1[k*4], 16, pTmp1, 16,
						chunk);
					ippmMul_mva_32f_4x4(pMatrix, 16,
						&pVecSrc2[k*4], 16, pTmp2, 16,
						chunk);
					ippmAdd_vava_32f_4x1(pTmp1, 16,
						pTmp2, 16, &pVecDst[k*4], 16,
						chunk);
				}
				pTime[i+2*NUM_SIZES] = ippCoreGetCpuClocks() - start;
			}
		}

		/*
		 * The row for length 1 (i == 0) is measured but not printed.
		 * Clock counts are 64-bit, so print them at full width instead of
		 * truncating to int.
		 */
		printf("Len,\tv,\tva,\tva w/buf\n");
		for (i=1; i<NUM_SIZES; i++)
		{
			printf("%d,\t%llu,\t%llu,\t%llu,\n", 1<<i,
				(unsigned long long)pTime[i],
				(unsigned long long)pTime[i+NUM_SIZES],
				(unsigned long long)pTime[i+2*NUM_SIZES]);
		}

		status = 0;
	}
	else
	{
		fprintf(stderr, "ippsMalloc_32f failed: out of memory\n");
	}

	/* Release whatever was successfully allocated. */
	if (pVecSrc1) ippsFree(pVecSrc1);
	if (pVecSrc2) ippsFree(pVecSrc2);
	if (pTmp1) ippsFree(pTmp1);
	if (pTmp2) ippsFree(pTmp2);
	if (pVecDst) ippsFree(pVecDst);

	return status;
}
