/*  ============================================================================
 *     Copyright (C) 2011, 2012, 2013 Texas Instruments Incorporated.       *
 *
 *   Use of this software is controlled by the terms and conditions found in the
 *   license agreement under which this software has been supplied.
 *   ===========================================================================
 */
/** ============================================================================
 *   @n This is an example to benchmark EDMA3 transfer performance between
 *   local test buffers and the remote test addresses supplied to PCIE_edma_test()
 */
/* =============================================================================
 *  Revision History
 *  ===============
 *  June 23, 2013 Brighton Feng   File Created
 * ============================================================================
 */

#include <stdio.h>
#include <csl_cacheAux.h>
#include <csl_edma3.h>
#include <cslr_device.h>
#include "KeyStone_common.h"
#include "PCIE_test.h"
#include <csl_xmcAux.h>

/* Geometry of each throughput transfer: edma_performance_test() passes
 * A_COUNT as the byte count of one array (and as the source/destination
 * index between arrays) and B_COUNT as the number of arrays, so one
 * transfer moves A_COUNT*B_COUNT = 64KB. */
#define 	A_COUNT 	(16*1024)
#define 	B_COUNT 	(4)

/* Hard-coded source/destination test buffers used on the local side.
 * NOTE(review): the LL2/SL2/DDR names suggest local L2, shared L2 and DDR
 * memory; the addresses are device specific and must be reserved by the
 * linker command file -- confirm against the target memory map. */
#define 	Local_LL2_TEST_SRC (0x11810000)
#define 	Local_LL2_TEST_DST (0x11830000)

#define 	Local_SL2_TEST_SRC 	(0xC080000)
#define 	Local_SL2_TEST_DST 	(0xC0C0000)

#define 	Local_DDR_TEST_SRC 	(0x88000000)
#define 	Local_DDR_TEST_DST 	(0x88100000)

/**
 * Run one manually triggered EDMA3 transfer, print the measured throughput
 * and, for linear transfers, verify the destination contents.
 *
 * Sequence: fill the source buffer with an address-derived pattern and the
 * destination with 0xAA bytes, write back and invalidate L1D/L2, program the
 * PaRAM set indexed by TC_channel for a single AB-synchronized transfer,
 * trigger it through ESR and poll IPR until the completion bit is set.
 *
 * @param src         address of the source buffer (written by the CPU here)
 * @param dst         address of the destination buffer
 * @param uiACount    bytes per array (ACNT)
 * @param uiBCount    number of arrays (BCNT)
 * @param uiIndex     byte index between arrays, used for both source and
 *                    destination (BIDX); verification only runs when it
 *                    equals uiACount
 * @param EDMACCRegs  register overlay of the EDMA channel controller to use
 * @param TC_channel  PaRAM set / channel / completion code number; must be
 *                    below 32 because only the low ICR/ESR/IPR registers
 *                    are accessed
 */
void edma_Throughput_Test (Uint32 src, Uint32 dst, Uint32 uiACount, 
	Uint32 uiBCount, Uint32 uiIndex, CSL_TpccRegs*  EDMACCRegs, Uint32 TC_channel)
{
	unsigned int cycles;
	unsigned long long ullBandwidthMBps;
	Uint32 loopIndex;
	Uint32 uiNumDwords = uiACount*uiBCount/8; 	/*buffers are accessed as 64-bit words*/
	Uint32 uiChannelMask;
	unsigned long long *srcBuff=(unsigned long long *)src;
	unsigned long long *dstBuff=(unsigned long long *)dst;

	/* A shift count of 32 or more is undefined, and such channels would
	need the high event/interrupt registers which this test does not use */
	if (TC_channel >= 32) {
		printf("EDMA channel %u is not supported by this test (must be < 32)\n", TC_channel);
		return;
	}
	/* unsigned constant: 1<<31 on a signed int is undefined behavior */
	uiChannelMask = 1U<<TC_channel;

	/* Initialize data buffers */
	for (loopIndex = 0; loopIndex < uiNumDwords; loopIndex++) {
		srcBuff[loopIndex] = _itoll(dst+loopIndex*4+4,dst+loopIndex*4);
		dstBuff[loopIndex] = 0xaaaaaaaaaaaaaaaaULL;
	}

	/* Push the freshly written buffers out of the caches before the DMA runs.
	NOTE(review): the two back-to-back fences are kept from the original;
	presumably a silicon workaround -- confirm before removing one. */
	CACHE_wbInvAllL1d(CACHE_WAIT);
	CACHE_wbInvAllL2(CACHE_WAIT);
	_mfence();
	_mfence();

	/* clear completion flag */
	EDMACCRegs->TPCC_ICR= uiChannelMask;

	/* Only the final transfer-complete interrupt flag is enabled, with
	TC_channel as completion code; no chaining, incrementing addresses */
	EDMACCRegs->PARAMSET[TC_channel].OPT= 
		CSL_EDMA3_OPT_MAKE(CSL_EDMA3_ITCCH_DIS, 
			CSL_EDMA3_TCCH_DIS, 
			CSL_EDMA3_ITCINT_DIS, 
			CSL_EDMA3_TCINT_EN,
			TC_channel,
			CSL_EDMA3_TCC_NORMAL,
			CSL_EDMA3_FIFOWIDTH_NONE, 
			CSL_EDMA3_STATIC_DIS, 
			CSL_EDMA3_SYNC_AB, 
			CSL_EDMA3_ADDRMODE_INCR, 
			CSL_EDMA3_ADDRMODE_INCR);
	EDMACCRegs->PARAMSET[TC_channel].SRC= src;
	EDMACCRegs->PARAMSET[TC_channel].A_B_CNT= (uiBCount<<16)|uiACount;
	EDMACCRegs->PARAMSET[TC_channel].DST= dst;
	EDMACCRegs->PARAMSET[TC_channel].SRC_DST_BIDX= (uiIndex<<16)|uiIndex;
	EDMACCRegs->PARAMSET[TC_channel].LINK_BCNTRLD= (uiBCount<<16)|0xFFFF;
	EDMACCRegs->PARAMSET[TC_channel].SRC_DST_CIDX= 0;
	EDMACCRegs->PARAMSET[TC_channel].CCNT= 1;

	/* Manually trigger the EDMA */
	EDMACCRegs->TPCC_ESR= uiChannelMask;

	/* The start time is sampled right after the trigger write, as in the
	original measurement, so results stay comparable with earlier runs */
	cycles = TSCL;
	/* Wait for completion */
	while ((EDMACCRegs->TPCC_IPR&uiChannelMask) ==0);
	cycles= TSC_count_cycle_from(cycles);

	/* Guard the division: report 0 MB/s instead of dividing by zero */
	ullBandwidthMBps = 0;
	if (cycles != 0)
		ullBandwidthMBps = (unsigned long long)uiACount*uiBCount*gDSP_Core_Speed_Hz/cycles/1000000;

	/* All printed values are unsigned, so unsigned conversions are used */
	printf("transfer %4u * %5u Bytes with index=%5u from 0x%8x to 0x%8x, ", uiBCount, uiACount, uiIndex, src, dst);
	printf("consumes %6u cycles, achieve bandwidth %5llu MB/s\n", cycles, ullBandwidthMBps);

	/* clear completion flag */
	EDMACCRegs->TPCC_ICR= uiChannelMask;

	/* Drop prefetched data before the CPU reads the destination back */
	CSL_XMC_invalidatePrefetchBuffer();

	/* Verify data transferred */
	if((dst&0xffffff)>=0xe00000&&(dst&0xffffff)<0xf00000)
		return; 	/*L1P can't be verified by the following code*/
	if(uiIndex!=uiACount)
		return; 	/*non-linear transfer, do not verify it*/
	for (loopIndex = 0; loopIndex < uiNumDwords; loopIndex++) {
		if(dstBuff[loopIndex] != _itoll(dst+loopIndex*4+4,dst+loopIndex*4))
		{
			/* Print the failing address as an integer; passing the
			pointer itself to %x is a format/argument mismatch */
			printf("EDMA data transfer failed at 0x%x\n", dst+loopIndex*8);
			break;
		}
	}
}

void edma_performance_test(Uint32 uiEDMA, Uint32 uiTC, PCIERemoteTestAddress* remoteAddr)
{   
	CSL_TpccRegs*  EDMACCRegs= gpEDMA_CC_regs[uiEDMA];
#if 0
	printf("Overhead test with EDMA%d TC%d\n", uiEDMA, uiTC);

	edma_Throughput_Test(Local_LL2_TEST_SRC, remoteAddr->LL2_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_LL2_TEST_SRC, remoteAddr->SL2_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_LL2_TEST_SRC, remoteAddr->DDR_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);

	edma_Throughput_Test(remoteAddr->LL2_SRC_ADDR, Local_LL2_TEST_DST, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->SL2_SRC_ADDR, Local_LL2_TEST_DST, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->DDR_SRC_ADDR, Local_LL2_TEST_DST, 8, 1, 8, EDMACCRegs, uiTC);

	edma_Throughput_Test(Local_SL2_TEST_SRC, remoteAddr->LL2_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_SL2_TEST_SRC, remoteAddr->SL2_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_SL2_TEST_SRC, remoteAddr->DDR_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);

	edma_Throughput_Test(remoteAddr->LL2_SRC_ADDR, Local_SL2_TEST_DST, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->SL2_SRC_ADDR, Local_SL2_TEST_DST, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->DDR_SRC_ADDR, Local_SL2_TEST_DST, 8, 1, 8, EDMACCRegs, uiTC);

	edma_Throughput_Test(Local_DDR_TEST_SRC, remoteAddr->LL2_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_DDR_TEST_SRC, remoteAddr->SL2_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_DDR_TEST_SRC, remoteAddr->DDR_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);

	edma_Throughput_Test(remoteAddr->LL2_SRC_ADDR, Local_DDR_TEST_DST, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->SL2_SRC_ADDR, Local_DDR_TEST_DST, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->DDR_SRC_ADDR, Local_DDR_TEST_DST, 8, 1, 8, EDMACCRegs, uiTC);

	edma_Throughput_Test(remoteAddr->LL2_SRC_ADDR, remoteAddr->LL2_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->SL2_SRC_ADDR, remoteAddr->SL2_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->DDR_SRC_ADDR, remoteAddr->DDR_DST_ADDR, 8, 1, 8, EDMACCRegs, uiTC);
   
#endif	
	printf("Throughput test with EDMA%d TC%d\n", uiEDMA, uiTC);

	edma_Throughput_Test(Local_LL2_TEST_SRC, remoteAddr->LL2_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_LL2_TEST_SRC, remoteAddr->SL2_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_LL2_TEST_SRC, remoteAddr->DDR_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);

	edma_Throughput_Test(remoteAddr->LL2_SRC_ADDR, Local_LL2_TEST_DST, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->SL2_SRC_ADDR, Local_LL2_TEST_DST, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->DDR_SRC_ADDR, Local_LL2_TEST_DST, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);

	edma_Throughput_Test(Local_SL2_TEST_SRC, remoteAddr->LL2_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_SL2_TEST_SRC, remoteAddr->SL2_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_SL2_TEST_SRC, remoteAddr->DDR_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);

	edma_Throughput_Test(remoteAddr->LL2_SRC_ADDR, Local_SL2_TEST_DST, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->SL2_SRC_ADDR, Local_SL2_TEST_DST, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->DDR_SRC_ADDR, Local_SL2_TEST_DST, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);

	edma_Throughput_Test(Local_DDR_TEST_SRC, remoteAddr->LL2_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_DDR_TEST_SRC, remoteAddr->SL2_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(Local_DDR_TEST_SRC, remoteAddr->DDR_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);

	edma_Throughput_Test(remoteAddr->LL2_SRC_ADDR, Local_DDR_TEST_DST, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->SL2_SRC_ADDR, Local_DDR_TEST_DST, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->DDR_SRC_ADDR, Local_DDR_TEST_DST, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);

	edma_Throughput_Test(remoteAddr->LL2_SRC_ADDR, remoteAddr->LL2_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->SL2_SRC_ADDR, remoteAddr->SL2_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);
	edma_Throughput_Test(remoteAddr->DDR_SRC_ADDR, remoteAddr->DDR_DST_ADDR, A_COUNT, B_COUNT, A_COUNT, EDMACCRegs, uiTC);

   
    return;
}

/**
 * Entry point of the EDMA benchmark: prepares memory attributes and cache
 * sizes, initializes the EDMA and runs edma_performance_test() for the
 * selected channel controller / TC combinations.
 *
 * @param remoteAddr  remote source/destination test addresses, forwarded
 *                    unchanged to edma_performance_test()
 */
void PCIE_edma_test(PCIERemoteTestAddress* remoteAddr)
{  
	/* How many TCs (starting from 0) are exercised on EDMA CC0, CC1, CC2.
	CC2 is currently not tested at all. */
	static const Uint32 uiNumTcTested[3] = {1, 2, 0};
	Uint32 uiMar;
	Uint32 uiCC;
	Uint32 uiTC;

	/* Make the PCIE space non-cacheable by clearing MAR[64]..MAR[79].
	NOTE(review): if each MAR covers 16MB these indices correspond to
	0x40000000-0x4FFFFFFF -- confirm this is the PCIE data window on the
	target device. */
	uiMar = 64;
	while (uiMar < 80) {
		gpCGEM_regs->MAR[uiMar] = 0;
		uiMar++;
	}

	/* L1P and L1D are each set to 16KB of cache; L2 cache is disabled */
	CACHE_setL1PSize(CACHE_L1_16KCACHE);
	CACHE_setL1DSize(CACHE_L1_16KCACHE);
	CACHE_setL2Size(CACHE_0KCACHE);

	EDMA_init();

	for (uiCC = 0; uiCC < 3; uiCC++) {
		for (uiTC = 0; uiTC < uiNumTcTested[uiCC]; uiTC++) {
			edma_performance_test(uiCC, uiTC, remoteAddr);
		}
	}

	puts("EDMA test complete");
}

