/*  ============================================================================
 *   Copyright (c) Texas Instruments Inc 2010
 *
 *   Use of this software is controlled by the terms and conditions found in the
 *   license agreement under which this software has been supplied.
* ============================================================================
Example that measures the performance of DSP core accesses to memory
* =============================================================================
 *  Revision History
 *  ===============
 *  Nov 5, 2011 Brighton Feng   File Created
 * ============================================================================
 */

#include <csl_cacheAux.h>
#include <stdio.h>
#include <c6x.h>
#include "KeyStone_common.h"

/* Number of bytes copied by each MemCopyTest() run (64KB) */
#define 	COPY_TEST_SIZE 			(64*1024)
/* Fixed LL2 address used as the other end of the copies in MemCopyTest() */
#define 	LL2_TEST_BASE_ADDR 		(0x820000)
/* Base addresses handed to MemCopyTest() and LoadStoreCycleTest() as the
 * memory under test for the LL2, SL2 and DDR cases */
#define 	HyperLink_LL2_TEST_BASE_ADDR	(0x41800000)
#define 	HyperLink_SL2_TEST_BASE_ADDR	(0x4C100000)
#define 	HyperLink_DDR_TEST_BASE_ADDR	(0x48000000)

/* Passed as the last argument of Asm_LDDW_Test()/Asm_STDW_Test() and used as
 * the divisor when reporting cycles per access */
#define 	LOAD_STORE_TIMES 		(512)
/* Cycle count subtracted from every load/store measurement */
#define 	LD_ST_TEST_OVERHEAD 	40
/* Sizes in bytes passed to LoadStoreCycleTest() for each memory under test
 * (256KB, 1MB and 16MB) */
#define 	LL2_LOAD_STORE_SIZE 	(256*1024)
#define 	SL2_LOAD_STORE_SIZE 	(1024*1024)
#define 	DDR_LOAD_STORE_SIZE 	(16*1024*1024)

/* Copy routine implemented outside this file. MemCopy8Test() calls it with
 * (destination, source, byte size / 8). */
extern void MemCopy8(long long * restrict dst, long long * restrict src, Uint32 uiCount);

/*
 * Time a single MemCopy8() run and print the resulting throughput.
 *
 * srcBuff - buffer that is read
 * dstBuff - buffer that is written
 * uiSize  - number of bytes to copy; MemCopy8() receives uiSize/8 as its count
 *
 * L1D is written back and invalidated and L1P is invalidated before the
 * measurement starts.
 */
void MemCopy8Test(long long * srcBuff, long long * dstBuff, Uint32 uiSize)
{
	unsigned int cycles;
	Uint32 iBandWidth;

	//Clear cache
	CACHE_wbInvAllL1d(CACHE_WAIT);
	CACHE_invAllL1p(CACHE_WAIT);

	cycles= TSCL;
	MemCopy8(dstBuff, srcBuff, uiSize/8);
	cycles= TSC_count_cycle_from(cycles);

	//bytes*Hz/cycles gives bytes/s; the product is widened to 64 bits first
	iBandWidth= (unsigned long long)uiSize*gDSP_Core_Speed_Hz/cycles/1000000;

	//every conversion must match its argument type: the counters are unsigned
	//and the pointers are cast explicitly so that 0x%x receives an unsigned int
	printf("%4u MB/s, copy %u bytes from 0x%x to 0x%x consumes %u cycles\n",
		(unsigned int)iBandWidth, (unsigned int)uiSize,
		(unsigned int)srcBuff, (unsigned int)dstBuff, cycles);
}

/*
 * Run the copy benchmark against the memory at uiAddress under three
 * cache configurations.
 *
 * uiAddress - base address of the memory under test
 * uiSize    - number of bytes moved by every copy
 *
 * Each configuration measures three copies: LL2 test buffer -> uiAddress,
 * uiAddress -> LL2 test buffer, and uiAddress -> uiAddress+uiSize.
 */
void MemCopyTest(Uint32 uiAddress, Uint32 uiSize)
{
	//MAR entry selected by the address in units of 16MB
	const Uint32 uiMarIndex= uiAddress/16/1024/1024;
	long long * const pLL2= (long long *)LL2_TEST_BASE_ADDR;
	long long * const pTarget= (long long *)uiAddress;
	long long * const pTargetNext= (long long *)(uiAddress+uiSize);

	//configuration 1: memory under test is noncacheable, nonprefetchable
	gpCGEM_regs->MAR[uiMarIndex]= 0;

	puts("     noncacheable, nonprefetchable memory copy");
	MemCopy8Test(pLL2, pTarget, uiSize);
	MemCopy8Test(pTarget, pLL2, uiSize);
	MemCopy8Test(pTarget, pTargetNext, uiSize);

	//configuration 2: cacheable and prefetchable, L2 cache disabled so that
	//only the L1D cache is in use
	gpCGEM_regs->MAR[uiMarIndex]= 1|(1<<CSL_CGEM_MAR0_PFX_SHIFT);
	CACHE_setL2Size(CACHE_0KCACHE);

	puts("     32KB L1D cache, prefetchable memory copy");
	MemCopy8Test(pLL2, pTarget, uiSize);
	MemCopy8Test(pTarget, pLL2, uiSize);
	MemCopy8Test(pTarget, pTargetNext, uiSize);

	//configuration 3: L1D plus 256KB of L2 cache; L2 is written back and
	//invalidated ahead of every copy
	CACHE_setL2Size(CACHE_256KCACHE);

	puts("     32KB L1D cache, 256KB L2 cache, prefetchable memory copy");
	CACHE_wbInvAllL2(CACHE_WAIT);
	MemCopy8Test(pLL2, pTarget, uiSize);
	CACHE_wbInvAllL2(CACHE_WAIT);
	MemCopy8Test(pTarget, pLL2, uiSize);
	CACHE_wbInvAllL2(CACHE_WAIT);
	MemCopy8Test(pTarget, pTargetNext, uiSize);
}

/*
 * Measure and print the average cycles per access reported for
 * Asm_LDDW_Test() over a series of index values.
 *
 * srcBuff - base address handed to Asm_LDDW_Test()
 * uiSize  - the sweep continues while uiIndex*8*LOAD_STORE_TIMES < uiSize
 *
 * uiIndex is passed straight to Asm_LDDW_Test().
 * NOTE(review): judging by the loop bound it is presumably the distance
 * between consecutive accesses in 8-byte units - confirm against the
 * assembly routine.
 */
void LDDWTest(Uint8 * srcBuff, Uint32 uiSize)
{
	//while uiIndex is below uiLimit[k] it advances by uiStep[k];
	//at or above the last limit it advances by 4096
	static const Uint32 uiLimit[]= {8, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192};
	static const Uint32 uiStep[]=  {1,  2,  8,  32,  64, 128,  256,  512, 1024, 2048};
	const Uint32 uiNumSteps= sizeof(uiLimit)/sizeof(uiLimit[0]);
	unsigned int cycles;
	Uint32 		uiIndex, uiInc, i;

	//pointer is cast explicitly so that 0x%x receives an unsigned int
	printf("Memory access performance test at 0x%x\n", (unsigned int)srcBuff);

	for(uiIndex=0; uiIndex*8*LOAD_STORE_TIMES<uiSize; uiIndex+= uiInc)
	{
		//start every measurement with L1D and L2 written back and invalidated
		CACHE_wbInvAllL1d(CACHE_WAIT);
		CACHE_wbInvAllL2(CACHE_WAIT);
		//NOTE(review): the fence is issued twice on purpose in the original
		//code; the reason is not visible in this file
		_mfence();
		_mfence();

		cycles= TSCL;
		Asm_LDDW_Test(srcBuff, uiIndex, LOAD_STORE_TIMES);
		cycles= TSC_count_cycle_from(cycles)-LD_ST_TEST_OVERHEAD;
		printf("Index=%4u  Cycles/LDDW= %.2f\n", (unsigned int)uiIndex,
			(float)cycles/(LOAD_STORE_TIMES));

		//select the increment for the next index
		uiInc= 4096;
		for(i=0; i<uiNumSteps; i++)
		{
			if(uiIndex<uiLimit[i])
			{
				uiInc= uiStep[i];
				break;
			}
		}
	}
}

/*
 * Measure and print the average cycles per access reported for
 * Asm_STDW_Test() over a series of index values.
 *
 * srcBuff - base address handed to Asm_STDW_Test()
 * uiSize  - the sweep continues while uiIndex*8*LOAD_STORE_TIMES < uiSize
 *
 * uiIndex is passed straight to Asm_STDW_Test().
 * NOTE(review): judging by the loop bound it is presumably the distance
 * between consecutive accesses in 8-byte units - confirm against the
 * assembly routine.
 */
void STDWTest(Uint8 * srcBuff, Uint32 uiSize)
{
	//while uiIndex is below uiLimit[k] it advances by uiStep[k];
	//at or above the last limit it advances by 4096
	static const Uint32 uiLimit[]= {8, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192};
	static const Uint32 uiStep[]=  {1,  2,  8,  32,  64, 128,  256,  512, 1024, 2048};
	const Uint32 uiNumSteps= sizeof(uiLimit)/sizeof(uiLimit[0]);
	unsigned int cycles;
	Uint32 		uiIndex, uiInc, i;

	//pointer is cast explicitly so that 0x%x receives an unsigned int
	printf("Memory access performance test at 0x%x\n", (unsigned int)srcBuff);

	for(uiIndex=0; uiIndex*8*LOAD_STORE_TIMES<uiSize; uiIndex+= uiInc)
	{
		//start every measurement with L1D and L2 written back and invalidated
		CACHE_wbInvAllL1d(CACHE_WAIT);
		CACHE_wbInvAllL2(CACHE_WAIT);
		//NOTE(review): the fence is issued twice on purpose in the original
		//code; the reason is not visible in this file
		_mfence();
		_mfence();

		cycles= TSCL;
		Asm_STDW_Test(srcBuff, uiIndex, LOAD_STORE_TIMES);
		cycles= TSC_count_cycle_from(cycles)-LD_ST_TEST_OVERHEAD;
		printf("Index=%4u  Cycles/STDW= %.2f\n", (unsigned int)uiIndex,
			(float)cycles/(LOAD_STORE_TIMES));

		//select the increment for the next index
		uiInc= 4096;
		for(i=0; i<uiNumSteps; i++)
		{
			if(uiIndex<uiLimit[i])
			{
				uiInc= uiStep[i];
				break;
			}
		}
	}
}

/*
 * Run LDDWTest() and STDWTest() on the memory at uiAddress under three
 * cache configurations.
 *
 * uiAddress - base address of the memory under test
 * uiSize    - size in bytes forwarded to LDDWTest()/STDWTest()
 */
void LoadStoreCycleTest(Uint32 uiAddress, Uint32 uiSize)
{
	//MAR entry selected by the address in units of 16MB
	const Uint32 uiMarIndex= uiAddress/16/1024/1024;
	Uint8 * const pTestMem= (Uint8 *)(uiAddress);

	//configuration 1: no L2 cache, memory under test nonprefetchable and
	//noncacheable
	CACHE_setL2Size(CACHE_0KCACHE);
	gpCGEM_regs->MAR[uiMarIndex]= 0;

	puts("\nLDDW test: nonprefetchable, noncacheable");
	LDDWTest(pTestMem, uiSize);
	puts("\nSTDW test: nonprefetchable, noncacheable");
	STDWTest(pTestMem, uiSize);

	//configuration 2: memory under test prefetchable and cacheable,
	//L2 cache still set to 0KB
	gpCGEM_regs->MAR[uiMarIndex]= 1|(1<<CSL_CGEM_MAR0_PFX_SHIFT);
	CACHE_setL2Size(CACHE_0KCACHE);

	puts("\nLDDW test: prefetchable, 32KB L1D cahce");
	LDDWTest(pTestMem, uiSize);
	puts("\nSTDW test: prefetchable, 32KB L1D cahce");
	STDWTest(pTestMem, uiSize);

	//configuration 3: same memory attributes with 256KB of L2 cache
	CACHE_setL2Size(CACHE_256KCACHE);

	puts("\nLDDW test: prefetchable, 32KB L1D, 256KB L2 cahce");
	LDDWTest(pTestMem, uiSize);
	puts("\nSTDW test: prefetchable, 32KB L1D, 256KB L2 cahce");
	STDWTest(pTestMem, uiSize);
}

void HyperLink_DSP_core_test()
{
	//Configure L1 as 32KB cache
	CACHE_setL1PSize(CACHE_L1_32KCACHE);
	CACHE_setL1DSize(CACHE_L1_32KCACHE);

	MemCopyTest(HyperLink_LL2_TEST_BASE_ADDR, COPY_TEST_SIZE);
	MemCopyTest(HyperLink_SL2_TEST_BASE_ADDR, COPY_TEST_SIZE);
	MemCopyTest(HyperLink_DDR_TEST_BASE_ADDR, COPY_TEST_SIZE);
#if 1
	LoadStoreCycleTest(HyperLink_LL2_TEST_BASE_ADDR, LL2_LOAD_STORE_SIZE);
	LoadStoreCycleTest(HyperLink_SL2_TEST_BASE_ADDR, SL2_LOAD_STORE_SIZE);
	LoadStoreCycleTest(HyperLink_DDR_TEST_BASE_ADDR, DDR_LOAD_STORE_SIZE);
#endif
}
