/*  ============================================================================
 *   Copyright (c) Texas Instruments Inc 2013
 *
 *   Use of this software is controlled by the terms and conditions found in the
 *   license agreement under which this software has been supplied.
* ============================================================================
 *   Example to show the performance of Multicore Navigator on a KeyStone device
* =============================================================================
 *  Revision History
 *  ===============
 *  Feb 5, 2013 Brighton Feng   File Created
 * ============================================================================
 */

#include <stdio.h>
#include <csl_qm_queue.h>
#include "K2_common.h"
#include "MNav_QM_Intc_setup.h"
#include "MNav_Test.h"

/*scratch storage for descriptors popped from queues while a test is running*/
Uint32 uiaDescriptorBuffer[TEST_DESC_BUF_SIZE];
/*ping/pong list buffers; their addresses are used as the listAddress of
hiQuAccCfg (Hi) and loQuAccCfg (Lo) respectively*/
Uint32 uiaDescriptorPingPongBufferHi[(TEST_DESC_BUF_SIZE+1)*2];
Uint32 uiaDescriptorPingPongBufferLo[(TEST_DESC_BUF_SIZE+1)*2];
/* Accumulator channel configuration for the high priority queue test.
 * The initializer is positional; the trailing comment on each value names
 * the field it is meant for.
 * queMgrIndex is overwritten by QueuePushToAccumulationDelayTest() before
 * the channel is configured. */
Qmss_AccCmdCfg hiQuAccCfg=
{
    /** Accumulator channel affected (0-47) */
    0,//channel
    /** Accumulator channel command:
     * Qmss_AccCmd_ENABLE_CHANNEL  : enable channel
     * Qmss_AccCmd_DISABLE_CHANNEL : disable channel */
    Qmss_AccCmd_ENABLE_CHANNEL,//command
    /** Specifies which queues are included in the queue group.
     * Bit 0 corresponds to the base queue index, and bit 31 corresponds to
     * the base queue index plus 31. For any bit set in this mask, the
     * corresponding queue index is included in the monitoring function.
     *
     * This field is ignored in single-queue mode, which is the mode
     * selected below. */
    1,//queueEnMask
    /** Physical pointer to the list ping/pong buffer. NULL when the channel is disabled */
    (Uint32)uiaDescriptorPingPongBufferHi,//listAddress
    /** Queue Manager and Queue Number index to monitor. This serves as the base
     * queue index when the channel is in multi-queue mode, and must be a
     * multiple of 32 when multi-queue mode is enabled. */
    QMSS_HIGH_PRIORITY_QUEUE_BASE,//queMgrIndex
    /** Max entries per list buffer page.
     * NOTE(review): written as 1+1, presumably one descriptor entry plus the
     * NULL terminator required by the list count mode below - confirm. */
    1+1,//maxPageEntries
    /** Number of timer ticks to delay the interrupt */
    1,//timerLoadCount
    /** Interrupt pacing mode. Specifies when the interrupt should be triggered */
    Qmss_AccPacingMode_NONE,//interruptPacingMode
    /** List entry size. Specifies the size of each data entry */
    Qmss_AccEntrySize_REG_D,//listEntrySize
    /** List count mode. Specifies how the number of entries in the list is indicated */
    Qmss_AccCountMode_NULL_TERMINATE, //listCountMode
    /** Queue mode. Monitor a single queue or multiple queues */
    Qmss_AccQueueMode_SINGLE_QUEUE //multiQueueMode
};

/* Accumulator channel configuration for the low priority queue test.
 * The initializer is positional; the trailing comment on each value names
 * the field it is meant for.
 * channel and queMgrIndex are overwritten by
 * QueuePushToAccumulationDelayTest() before the channel is configured. */
Qmss_AccCmdCfg loQuAccCfg=
{
    /** Accumulator channel affected (0-47) */
    32,//channel
    /** Accumulator channel command:
     * Qmss_AccCmd_ENABLE_CHANNEL  : enable channel
     * Qmss_AccCmd_DISABLE_CHANNEL : disable channel */
    Qmss_AccCmd_ENABLE_CHANNEL,//command
    /** Specifies which queues are included in the queue group.
     * Bit 0 corresponds to the base queue index, and bit 31 corresponds to
     * the base queue index plus 31. For any bit set in this mask, the
     * corresponding queue index is included in the monitoring function.
     * All 32 queues of the group are enabled here.
     *
     * This field is ignored in single-queue mode. */
    0xFFFFFFFF,//queueEnMask
    /** Physical pointer to the list ping/pong buffer. NULL when the channel is disabled */
    (Uint32)uiaDescriptorPingPongBufferLo,//listAddress
    /** Queue Manager and Queue Number index to monitor. This serves as the base
     * queue index when the channel is in multi-queue mode, and must be a
     * multiple of 32 when multi-queue mode is enabled. */
    QMSS_LOW_PRIORITY_QUEUE_BASE,//queMgrIndex
    /** Max entries per list buffer page.
     * NOTE(review): written as 1+1, presumably one descriptor entry plus the
     * NULL terminator required by the list count mode below - confirm. */
    1+1,//maxPageEntries
    /** Number of timer ticks to delay the interrupt */
    1,//timerLoadCount
    /** Interrupt pacing mode. Specifies when the interrupt should be triggered */
    Qmss_AccPacingMode_NONE,//interruptPacingMode
    /** List entry size. Specifies the size of each data entry */
    Qmss_AccEntrySize_REG_D,//listEntrySize
    /** List count mode. Specifies how the number of entries in the list is indicated */
    Qmss_AccCountMode_NULL_TERMINATE, //listCountMode
    /** Queue mode. Monitor a single queue or multiple queues */
    Qmss_AccQueueMode_MULTI_QUEUE //multiQueueMode
};

/**
 * Measure the cycles consumed by pushing descriptors to a queue.
 *
 * Descriptors are first popped through queuePopRegs into
 * uiaDescriptorBuffer, then pushed through queueRegs in timed batches of
 * uiDescriptorNumber/2, uiDescriptorNumber/4, ... , 1 descriptors. The
 * total and per-descriptor cycle counts of every batch are printed.
 *
 * Note: proxy registers can not be used for pop, which is why the register
 * block used for the pops is passed separately from the one used for the
 * timed pushes.
 *
 * If the queue runs empty while collecting descriptors, "Queue is empty!"
 * is printed, the descriptors collected so far are pushed back through
 * queuePopRegs, and the test is abandoned.
 *
 * @param queueRegs          registers the timed pushes are written to
 * @param uiDescriptorNumber the first batch holds uiDescriptorNumber/2
 *                           descriptors, each following batch half of the
 *                           previous one
 * @param queuePopRegs       registers the descriptors are popped from
 */
void QueuePushCycleTest(QueueManageRegs * queueRegs, Uint32 uiDescriptorNumber,
	QueueManageRegs * queuePopRegs)
{
	int i, j, k=0;
	Uint32 cycles;
	Uint32 uiDescriptor;
	volatile Uint32 * restrict uipREG_D_Descriptor= &queueRegs->REG_D_Descriptor;

	/*pop descriptors before push*/
	for(i=uiDescriptorNumber/2; i>0; i>>=1)
	{
		for(j=0; j<i; j++)
		{
			uiDescriptor= queuePopRegs->REG_D_Descriptor;
			if(0==uiDescriptor)
			{
				printf("Queue is empty!\n");

				/*do not leak the descriptors collected so far:
				give them back to the queue they were popped from*/
				for(j=0; j<k; j++)
				{
					queuePopRegs->REG_D_Descriptor= uiaDescriptorBuffer[j];
				}
				return;
			}
			uiaDescriptorBuffer[k++]= uiDescriptor;
		}
	}

	k= 0;
	for(i=uiDescriptorNumber/2; i>0; i>>=1)
	{
		/*ISB keeps the timed pushes between the two counter reads*/
		cycles= CP15_read_CCNT();
		__asm__(" ISB");
		for(j=0; j<i; j++)
		{
			*uipREG_D_Descriptor= uiaDescriptorBuffer[k++];
		}
		__asm__(" ISB");
		cycles= CCNT_count_cycle_from(cycles);
		printf("consumes %6d cycles to push         %4d descriptors, average %3d cycles\n", 
			cycles, i, cycles/i);
	}
}

/**
 * Measure the cycles consumed by popping descriptors from a queue.
 *
 * Descriptors are popped in timed batches of uiDescriptorNumber/2,
 * uiDescriptorNumber/4, ... , 1 and stored in uiaDescriptorBuffer. The
 * total and per-descriptor cycle counts of every batch are printed.
 * Afterwards every stored value is pushed back to the same queue.
 *
 * Note: proxy registers can not be used for pop.
 *
 * @param queueRegs          registers of the queue under test
 * @param uiDescriptorNumber the first batch holds uiDescriptorNumber/2
 *                           descriptors, each following batch half of the
 *                           previous one
 */
void QueuePopCycleTest(QueueManageRegs * queueRegs, Uint32 uiDescriptorNumber)
{
	int batchSize, n;
	int bufIndex= 0;
	Uint32 cycles;

	batchSize= uiDescriptorNumber/2;
	while(batchSize>0)
	{
		cycles= CP15_read_CCNT();
		__asm__(" ISB");
		for(n=0; n<batchSize; n++)
		{
			uiaDescriptorBuffer[bufIndex]= queueRegs->REG_D_Descriptor;
			bufIndex++;
		}
		__asm__(" ISB");
		cycles= CCNT_count_cycle_from(cycles);
		printf("consumes %6d cycles to pop          %4d descriptors, average %3d cycles\n", 
			cycles, batchSize, cycles/batchSize);

		batchSize>>= 1;
	}

	/*push everything that was popped back, in the order it was popped*/
	for(n=0; n<bufIndex; n++)
	{
		queueRegs->REG_D_Descriptor= uiaDescriptorBuffer[n];
	}
}

/**
 * Measure the cycles consumed by a push immediately followed by a pop.
 *
 * One descriptor is taken from the queue, then pushed and popped again
 * in timed batches of uiDescriptorNumber/2, uiDescriptorNumber/4, ... , 1
 * push/pop pairs. The total and per-pair cycle counts of every batch are
 * printed. The last value popped is pushed back at the end.
 *
 * @param queueRegs          registers of the queue under test
 * @param uiDescriptorNumber the first batch holds uiDescriptorNumber/2
 *                           push/pop pairs, each following batch half of
 *                           the previous one
 */
void QueuePushToPopDelayTest(QueueManageRegs * queueRegs, Uint32 uiDescriptorNumber)
{
	int pairs, n;
	Uint32 cycles;
	Uint32 uiDesc;

	/*take the descriptor that is bounced through the queue*/
	uiDesc= queueRegs->REG_D_Descriptor;

	pairs= uiDescriptorNumber/2;
	while(pairs>0)
	{
		cycles= CP15_read_CCNT();
		__asm__(" ISB");
		n= 0;
		while(n<pairs)
		{
			queueRegs->REG_D_Descriptor= uiDesc;
			uiDesc= queueRegs->REG_D_Descriptor;
			n++;
		}
		__asm__(" ISB");
		cycles= CCNT_count_cycle_from(cycles);
		printf("consumes %6d cycles to push and pop %4d descriptors, average %3d cycles\n", 
			cycles, pairs, cycles/pairs);

		pairs>>= 1;
	}

	/*give the descriptor back to the queue it came from*/
	queueRegs->REG_D_Descriptor= uiDesc;
}

/**
 * Measure the delay between pushing a descriptor to an accumulated queue
 * and the moment intCCNT changes. intCCNT is defined outside this file
 * and, per the comments below, is changed by the interrupt.
 *
 * One descriptor is popped from queue uiSrcQu. In each of 10 iterations it
 * is pushed first to the high priority queue and then to the low priority
 * queue, the elapsed cycles are printed, and the list entry for that
 * iteration is compared against the pushed descriptor. The test stops
 * at the first mismatch. Finally the descriptor is pushed back to uiSrcQu
 * and both accumulator channels are disabled.
 *
 * @param intd_id selects the queue base (K2H/K2K only) and the first
 *                argument passed to the accumulator configuration calls
 * @param uiSrcQu queue the test descriptor is borrowed from
 */
void QueuePushToAccumulationDelayTest(Qmss_IntdId intd_id, 
	Uint32 uiSrcQu)
{
	int i;
	Uint32 cycles, uiDescriptor, uiPingPongIndex;
	volatile Uint32 * uipDescList;
	QueueManageRegs * restrict lowPriQueue;
	QueueManageRegs * restrict highPriQueue;

	/*on K2H/K2K the monitored queue base is offset by 8192 per intd_id*/
#if defined(DEVICE_K2H) || defined(DEVICE_K2K)
	hiQuAccCfg.queMgrIndex= QMSS_HIGH_PRIORITY_QUEUE_BASE+8192*intd_id;
	loQuAccCfg.queMgrIndex= QMSS_LOW_PRIORITY_QUEUE_BASE+8192*intd_id;
#else 	//for K2L and K2E
	hiQuAccCfg.queMgrIndex= QMSS_HIGH_PRIORITY_QUEUE_BASE;
	loQuAccCfg.queMgrIndex= QMSS_LOW_PRIORITY_QUEUE_BASE;
#endif

	/*configure high priority queue accumulation channel*/
	KeyStone_Qmss_Config_Acc_Channel(2*intd_id, &hiQuAccCfg);

	/*configure low priority queue accumulation channel.
	With ACC_48_CHANNEL it is channel 32 on the same unit as the high
	priority channel, otherwise it is channel 0 on unit 2*intd_id+1*/
#ifdef ACC_48_CHANNEL
	printf("\nQueue Push/accumulation test with ACC48 firmware on INTD%d\n", intd_id+1);
	loQuAccCfg.channel=32;
	KeyStone_Qmss_Config_Acc_Channel(2*intd_id, &loQuAccCfg);

#else
	printf("\nQueue Push/accumulation test with ACC32+ACC16 firmware on INTD%d\n", intd_id+1);
	loQuAccCfg.channel=0;
	KeyStone_Qmss_Config_Acc_Channel(2*intd_id+1, &loQuAccCfg);
#endif

	lowPriQueue= &gpQueueManageVBUSM[loQuAccCfg.queMgrIndex];
	highPriQueue= &gpQueueManageVBUSM[hiQuAccCfg.queMgrIndex];

	/*pop the descriptor used for the test from the source queue*/
	uiDescriptor= gpQueueManageVBUSM[uiSrcQu].REG_D_Descriptor;

	for(i=0; i<10; i++) 	//try multiple times to get statistics 
	{
		/*test high priority queue accumulation*/
		uipDescList= (Uint32 *)hiQuAccCfg.listAddress;
		/*even iterations use the first list page, odd iterations the second*/
		uiPingPongIndex= hiQuAccCfg.maxPageEntries*(i&1);
		uipDescList[uiPingPongIndex]= 0; 	//clear the descriptor list buffer
		/*clean and invalidate the cache for the entry that was just cleared*/
		CP15_DCacheCleanInvalidateBuff((unsigned int)&uipDescList[uiPingPongIndex], 16);
 		
		cycles= CP15_read_CCNT();
		__asm__(" ISB");
		highPriQueue->REG_D_Descriptor= uiDescriptor ; 	//push descriptor
		intCCNT= cycles;
		/*waiting for interrupt, which will change the intCCNT*/
 		while(intCCNT == cycles);

		__asm__(" ISB");
		/*elapsed cycles: intCCNT - cycles, modulo 2^32*/
		cycles= ((unsigned int)((0xFFFFFFFFl+intCCNT)- (unsigned long long)cycles)+ 1);
		printf("consumes %6u cycles to push and accumulate one descriptor with high priority\n", 
			cycles);

		/*invalidate cache before reading the list entry*/
		CP15_DCacheInvalidateBuff((unsigned int)&uipDescList[uiPingPongIndex], 16);

		/*the list entry is expected to hold the descriptor that was pushed*/
		if(uiDescriptor !=uipDescList[uiPingPongIndex])
		{
			printf("descriptor does not match , expect 0x%x, get 0x%x\n",
				uiDescriptor , uipDescList[uiPingPongIndex]);
			break;
		}
 

		/*test low priority queue accumulation*/
		uipDescList= (Uint32 *)loQuAccCfg.listAddress;
		/*even iterations use the first list page, odd iterations the second*/
		uiPingPongIndex= loQuAccCfg.maxPageEntries*(i&1);
		uipDescList[uiPingPongIndex]= 0; 	//clear the descriptor list buffer
		/*clean and invalidate the cache for the entry that was just cleared*/
		CP15_DCacheCleanInvalidateBuff((unsigned int)&uipDescList[uiPingPongIndex], 16);
 		
		cycles= CP15_read_CCNT();
		__asm__(" ISB");
		lowPriQueue->REG_D_Descriptor= uiDescriptor; 	//push descriptor

		intCCNT= cycles;
		/*waiting for interrupt, which will change the intCCNT*/
 		while(intCCNT == cycles);

		__asm__(" ISB");
		/*elapsed cycles: intCCNT - cycles, modulo 2^32*/
 		cycles= ((unsigned int)((0xFFFFFFFFl+intCCNT)- (unsigned long long)cycles)+ 1);
		printf("consumes %6u cycles to push and accumulate one descriptor with  low priority\n", 
			cycles);

		/*invalidate cache before reading the list entry*/
		CP15_DCacheInvalidateBuff((unsigned int)&uipDescList[uiPingPongIndex], 16);
		/*the list entry is expected to hold the descriptor that was pushed*/
		if(uiDescriptor!=uipDescList[uiPingPongIndex])
		{
			printf("descriptor does not match, expect 0x%x, get 0x%x\n",
				uiDescriptor , uipDescList[uiPingPongIndex]);
			break;
		}
	}

	/*push the descriptor back to the source queue after test*/
	gpQueueManageVBUSM[uiSrcQu].REG_D_Descriptor= uiDescriptor ;
 
	/*disable accumulation channels (same unit/channel pairs as configured above)*/
	KeyStone_Qmss_disable_Acc_Channel(2*intd_id, 0);
#ifdef ACC_48_CHANNEL
	KeyStone_Qmss_disable_Acc_Channel(2*intd_id, 32);
#else
	KeyStone_Qmss_disable_Acc_Channel(2*intd_id+1, 0);
#endif

}

/**
 * Measure the delay between pushing a descriptor to a queue with a pend
 * event and the moment intCCNT changes. intCCNT is defined outside this
 * file and, per the comments below, is changed by the interrupt.
 *
 * One descriptor is popped through queueRegs, pushed to queue 662 and, on
 * K2H/K2K devices, also to queue 8706. The elapsed cycles are printed for
 * each push. The descriptor value is pushed back through queueRegs at the
 * end.
 *
 * @param queueRegs registers of the queue the test descriptor is borrowed
 *                  from and returned to
 */
void QueuePushToPendInterruptDelayTest(QueueManageRegs * queueRegs)
{
	Uint32 cycles;
	Uint32 uiDescriptor;

	/*pop a descriptor for test*/
	uiDescriptor= queueRegs->REG_D_Descriptor;

	cycles= CP15_read_CCNT();
	__asm__(" ISB");
	/*push to queue 662, which has pend event to CPU INTC*/
	gpQueueManageVBUSM[662].REG_D_Descriptor= uiDescriptor;

	intCCNT= cycles;
	/*waiting for interrupt, which will change the intCCNT*/
	while(intCCNT == cycles);

	__asm__(" ISB");
	/*elapsed cycles: intCCNT - cycles, modulo 2^32*/
	cycles= ((unsigned int)((0xFFFFFFFFl+intCCNT)- (unsigned long long)cycles)+ 1);
	printf("consumes %6d cycles between push queue 662 and pend interrupt\n", cycles);

#if defined(DEVICE_K2H) || defined(DEVICE_K2K)
	cycles= CP15_read_CCNT();
	__asm__(" ISB");
	/*push to queue 8706, which has pend event to CPU INTC*/
	gpQueueManageVBUSM[8706].REG_D_Descriptor= uiDescriptor;

	intCCNT= cycles;
	/*waiting for interrupt, which will change the intCCNT*/
	while(intCCNT == cycles);

	__asm__(" ISB");
	/*elapsed cycles: intCCNT - cycles, modulo 2^32*/
	cycles= ((unsigned int)((0xFFFFFFFFl+intCCNT)- (unsigned long long)cycles)+ 1);
	/*report the queue that was actually pushed to (8706)*/
	printf("consumes %6d cycles between push queue 8706 and pend interrupt\n", cycles);
#endif

	//return the descriptor to original queue
	queueRegs->REG_D_Descriptor= uiDescriptor;
}

/**
 * Measure the cycles consumed by one descriptor reclamation.
 *
 * The entry count of queue uiSrcQu is recorded first. Then, 10 times, a
 * descriptor is popped from uiSrcQu and pushed to RECLAMATION_QUEUE. The
 * test busy-waits until the entry count of uiSrcQu is back at the recorded
 * value, then prints the elapsed cycles.
 *
 * @param uiSrcQu queue the descriptors are taken from and expected to
 *                return to
 */
void QueueReclamationDelayTest(Uint32 uiSrcQu)
{
	int iteration= 0;
	Uint32 uiDesc;
	Uint32 tStart, elapsed;
	Uint32 uiBaselineCount;

	/*entry count the queue must return to after each reclamation*/
	uiBaselineCount= gpQueueStatusConfigRegs[uiSrcQu].REG_A_EntryCount;

	printf("\ndescriptor Reclamation test in queue %d\n", uiSrcQu);

	/*repeat to get statistics*/
	while(iteration<10)
	{
		/*borrow one descriptor from the source queue*/
		uiDesc= gpQueueManageVBUSM[uiSrcQu].REG_D_Descriptor;

		tStart= CP15_read_CCNT();
		__asm__(" ISB");

		/*hand it to the reclamation queue*/
		gpQueueManageVBUSM[RECLAMATION_QUEUE].REG_D_Descriptor= uiDesc;

		/*spin until the source queue is back at its original fill level*/
		while(gpQueueStatusConfigRegs[uiSrcQu].REG_A_EntryCount!= uiBaselineCount)
		{
		}

		__asm__(" ISB");
		elapsed= CCNT_count_cycle_from(tStart);
		printf("consumes %6d cycles for one descriptor Reclamation\n", elapsed);

		iteration++;
	}
}

/*push the first uiCount values saved in uiaDescriptorBuffer to queue uiQuNum*/
static void PktDma_returnDescriptors(Uint32 uiQuNum, Uint32 uiCount)
{
	Uint32 n;

	for(n= 0; n< uiCount; n++)
	{
		KeyStone_queuePush(uiQuNum, uiaDescriptorBuffer[n]);
	}
}

/**
 * Measure the throughput of a Packet DMA transfer.
 *
 * For each of uiNumCh channels one descriptor is popped from uiSrcQuNum,
 * its length and flow ID fields are set, and its buffer is filled with a
 * test pattern. All descriptors are then pushed to the TX queues
 * uiTxQuNum .. uiTxQuNum+uiNumCh-1. The time until PKT_DMA_TEST_DST_Q
 * holds uiNumCh entries is measured and printed as MB/s. Finally each
 * received packet is compared against the source pattern and its
 * descriptor is pushed to RECLAMATION_QUEUE.
 *
 * The pattern only depends on uiSrcQuNum, uiFlowID and the word index, so
 * it is the same for all channels. Received packets are compared against
 * the source buffer of the last channel prepared.
 *
 * If preparing a packet fails (invalid descriptor pointer, or NULL source
 * buffer pointer), the descriptors popped so far are pushed back to
 * uiSrcQuNum and the test is abandoned.
 *
 * @param uiSrcQuNum  queue the source descriptors are popped from
 * @param uiTxQuNum   first Packet DMA TX queue; channel j uses uiTxQuNum+j
 * @param uiFlowID    written to the src_tag_lo field of each descriptor
 * @param uiByteCount packet length in bytes
 * @param uiNumCh     number of channels; one packet is sent per channel
 */
void PktDmaThroughput(Uint32 uiSrcQuNum, Uint32 uiTxQuNum, 
	Uint32 uiFlowID, Uint32 uiByteCount, Uint32 uiNumCh)
{
	int i, j;
	HostPacketDescriptor * hostDescriptor;
	MonolithicPacketDescriptor * monoDescriptor;
	Uint32 uiDescriptor;
	Uint32 * uipSrcBufPtr, * uipDstBufPtr;
	Uint32 startCCNT, cycles, uiEntryCount;

	/*prepare source packets, each channel transfer one packet for this test*/
	for(j= 0; j< uiNumCh; j++)
	{
		uiDescriptor= KeyStone_queuePop(uiSrcQuNum);
		if(0x800000>uiDescriptor)
		{
			printf("Source queue %d invalid descriptor pointer: 0x%x. Entry count= %d\n", 
				uiSrcQuNum, uiDescriptor, KeyStone_GetQueueEntryCount(uiSrcQuNum));

			/*do not leak the descriptors prepared for previous channels*/
			PktDma_returnDescriptors(uiSrcQuNum, j);
			return;
		}
	
		/*invalidate cache before reading descriptor RAM*/
		CP15_DCacheInvalidateBuff(uiDescriptor, 64);

		hostDescriptor= (HostPacketDescriptor *)uiDescriptor;
		monoDescriptor= (MonolithicPacketDescriptor *)uiDescriptor;

		/*get the source buffer*/
		if(Cppi_DescType_HOST==hostDescriptor->type_id)
		{
			uiaDescriptorBuffer[j]= uiDescriptor|FETCH_SIZE_32;

			uipSrcBufPtr= (Uint32 *)hostDescriptor->buffer_ptr;

			hostDescriptor->packet_length= uiByteCount;
			hostDescriptor->buffer_len= uiByteCount;
			/*the SRC_TAG_LO field in the Tx descriptor is used as RX flow ID*/
			hostDescriptor->src_tag_lo= uiFlowID;

		}		
		else
		{
			uiaDescriptorBuffer[j]= uiDescriptor|FETCH_SIZE_16;

			uipSrcBufPtr= (Uint32 *)(uiDescriptor + 
				monoDescriptor->data_offset);

			monoDescriptor->packet_length= uiByteCount;
			/*the SRC_TAG_LO field in the Tx descriptor is used as RX flow ID*/
			monoDescriptor->src_tag_lo= uiFlowID;
		}

		/*write back data from cache to descriptor RAM*/
		CP15_DCacheCleanBuff(uiDescriptor, 64);

		if(NULL==uipSrcBufPtr)
		{
			printf("monoDescriptor= 0x%x, hostDescriptor= 0x%x\n",
				(Uint32)monoDescriptor, (Uint32)hostDescriptor);

			/*a packet without a buffer can not be filled or sent:
			give back this descriptor and the previous ones, then stop*/
			PktDma_returnDescriptors(uiSrcQuNum, j+1);
			return;
		}

		/*initialize the source buffer, same for all channels*/
		for(i=0; i<uiByteCount/4; i++)
			uipSrcBufPtr[i]= (uiSrcQuNum<<16)|(uiFlowID<<8)|i;

		/*write back data from cache to buffer*/
		CP15_DCacheCleanBuff((Uint32)uipSrcBufPtr, uiByteCount);
	}

	startCCNT= CP15_read_CCNT();
	__asm__(" ISB");
	/*push the packet descriptor to Packet DMA TX queue*/
	for(j= 0; j< uiNumCh; j++)
	{
		//each channel transfer one packet
		KeyStone_queuePush(uiTxQuNum+j, uiaDescriptorBuffer[j]);
	}
	__asm__(" ISB");
	
	/*poll the packet descriptor in destination queue*/	
	do
	{
		/*received packet number should equal to channel number 
		since each channel transfers one packet*/
		uiEntryCount= KeyStone_GetQueueEntryCount(PKT_DMA_TEST_DST_Q);
	}while(uiEntryCount<uiNumCh);
	__asm__(" ISB");
	cycles= CCNT_count_cycle_from(startCCNT);
	/*bytes * core clock (Hz) / cycles = bytes per second; /1000000 gives MB/s*/
	printf("Packet DMA achieves %4lld MB/s when transfer %5d bytes ",
		(unsigned long long)uiNumCh*uiByteCount*gMain_Core_Speed_Hz/cycles/1000000,
		uiByteCount);

	/*check received data and recycle descriptors and buffers*/
	for(j= 0; j< uiNumCh; j++)
	{
		uiDescriptor= KeyStone_queuePop(PKT_DMA_TEST_DST_Q);

		/*invalidate cache before reading descriptor RAM*/
		CP15_DCacheInvalidateBuff(uiDescriptor, 64);

		uiaDescriptorBuffer[j]= uiDescriptor;
		hostDescriptor= (HostPacketDescriptor *)uiDescriptor;
		monoDescriptor= (MonolithicPacketDescriptor *)uiDescriptor;

		/*get the destination buffer*/
		if(Cppi_DescType_HOST==hostDescriptor->type_id)
		{
			uipDstBufPtr= (Uint32 *)hostDescriptor->buffer_ptr;
		}
		else
		{
			uipDstBufPtr= (Uint32 *)(uiDescriptor + 
				monoDescriptor->data_offset);
		}
		printf("from 0x%8x to 0x%8x with channel %2d, ",
			(Uint32)uipSrcBufPtr, (Uint32)uipDstBufPtr, j);

		/*invalidate old contents of destination buffer in cache before read*/
		CP15_DCacheInvalidateBuff((unsigned int)uipDstBufPtr, uiByteCount);

		/*compare the destination data against source data, 
		report the first mismatch only*/
		for(i=0; i<uiByteCount/4; i++)
		{
			if(uipSrcBufPtr[i]!= uipDstBufPtr[i])
			{
				printf("consumes %5d cycles\n", cycles);
				printf("destination data 0x%8x does not match source data 0x%8x at word %d of packet %d (0x%08x)\n",
					uipDstBufPtr[i], uipSrcBufPtr[i], i, j, (Uint32)uipDstBufPtr);
				break;
			}
		}
		
		/*descriptor Reclamation*/
#if 1
		gpQueueManageVBUSM[RECLAMATION_QUEUE].REG_D_Descriptor= uiaDescriptorBuffer[j];
#else
		gpQueueManageVBUSM[(hostDescriptor->pkt_return_qmgr<<12)|
			hostDescriptor->pkt_return_qnum].REG_D_Descriptor= 
			uiaDescriptorBuffer[j];
#endif
	}

	printf("consumes %5d cycles\n", cycles);
}

