// Fast path taken when the kernel is 3 rows high, vertical dilation is 1, the
// input element type is uint8_t and height/strideHeight is at least 8.
// It stages input, weights and bias into the scratch region that starts at
// gPtrL2RAM using UDMA copies, runs the specialised 3x3 kernel on the staged
// buffers, and finally copies the accumulator output back to accPtr via UDMA.
if ((coeffsHeight == 3) && (dilationHeight == 1) && (typeid(Tin) == typeid(uint8_t)) && height/strideHeight >= 8)
{
    printf("Sy-gPtrL2RAM:%p--------------------------------20220303_1\n", gPtrL2RAM); //gPtrL2RAM:0x00800000

    // Every staged buffer starts on a 128-byte boundary relative to gPtrL2RAM.
    // Without this, an odd-sized uint8_t input (or weight) region would leave
    // the following Tb / Tacc buffers misaligned for their element type.
    // NOTE(review): 128 is assumed to be >= the target's cache-line size and
    // gPtrL2RAM itself is assumed to be at least 128-byte aligned - confirm.
    const uint32_t  kL2StageAlign = 128u;
    const uintptr_t l2Base        = (uintptr_t)gPtrL2RAM;
    uint32_t        uiMemUsedSize = 0;

    // NOTE(review): nothing here checks uiMemUsedSize against the size of the
    // region behind gPtrL2RAM; the caller must guarantee that everything fits.
    // NOTE(review): the results of appUdmaCopy2D/appUdmaCopy1D are not checked,
    // and each destination is used right after the call, so the copy is
    // presumed to have completed on return - confirm against the UDMA API.

    // ---- Input: numInChannels rows of inChPitch elements ----
    const size_t inRowBytes = inChPitch * sizeof(Tin);
    const size_t inBytes    = numInChannels * inChPitch * sizeof(Tin);
    Tin *pInData = (Tin *)(l2Base + uiMemUsedSize);
    uiMemUsedSize += (uint32_t)inBytes;

    // Cache maintenance around the copy: write back both ranges before the
    // transfer, invalidate the destination range afterwards.
    appMemCacheWb(pInChannel, inBytes);
    appMemCacheWb(pInData, inBytes);

    app_udma_copy_2d_prms_t inCopyPrms;
    appUdmaCopy2DPrms_Init(&inCopyPrms);
    // NOTE(review): confirm the value range of the width/height fields of
    // app_udma_copy_2d_prms_t; a large row size could be truncated on assignment.
    inCopyPrms.width      = inRowBytes;
    inCopyPrms.height     = numInChannels;
    inCopyPrms.dest_pitch = inRowBytes;
    inCopyPrms.src_pitch  = inRowBytes;
    inCopyPrms.dest_addr  = Udma_appVirtToPhyFxn((void *)pInData, 1, NULL);
    inCopyPrms.src_addr   = Udma_appVirtToPhyFxn((void *)pInChannel, 1, NULL);
    appUdmaCopy2D(NULL, &inCopyPrms, 1);
    appMemCacheInv(pInData, inBytes);

    // ---- Weights: one row of coeffsHeight*coeffsWidth elements per (out, in) channel pair ----
    const size_t wRowBytes = coeffsHeight * coeffsWidth * sizeof(Tw);
    const size_t wBytes    = numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw);
    uiMemUsedSize = (uiMemUsedSize + (kL2StageAlign - 1u)) & ~(kL2StageAlign - 1u);
    Tw *pWeightData = (Tw *)(l2Base + uiMemUsedSize);
    uiMemUsedSize += (uint32_t)wBytes;

    appMemCacheWb(pCoeffs, wBytes);
    appMemCacheWb(pWeightData, wBytes);

    app_udma_copy_2d_prms_t weightCopyPrms;
    appUdmaCopy2DPrms_Init(&weightCopyPrms);
    weightCopyPrms.width      = wRowBytes;
    weightCopyPrms.height     = numOutChannels * numInChannels;
    weightCopyPrms.dest_pitch = wRowBytes;
    weightCopyPrms.src_pitch  = wRowBytes;
    weightCopyPrms.dest_addr  = Udma_appVirtToPhyFxn((void *)pWeightData, 1, NULL);
    weightCopyPrms.src_addr   = Udma_appVirtToPhyFxn((void *)pCoeffs, 1, NULL);
    appUdmaCopy2D(NULL, &weightCopyPrms, 1);
    appMemCacheInv(pWeightData, wBytes);

    // ---- Bias: numOutChannels elements, copied as one linear transfer ----
    const size_t biasBytes = numOutChannels * sizeof(Tb);
    uiMemUsedSize = (uiMemUsedSize + (kL2StageAlign - 1u)) & ~(kL2StageAlign - 1u);
    Tb *pBiasData = (Tb *)(l2Base + uiMemUsedSize);
    uiMemUsedSize += (uint32_t)biasBytes;

    appMemCacheWb(pBias, biasBytes);
    appMemCacheWb(pBiasData, biasBytes);

    app_udma_copy_1d_prms_t biasCopyPrms;
    appUdmaCopy1DPrms_Init(&biasCopyPrms);
    biasCopyPrms.dest_addr = Udma_appVirtToPhyFxn((void *)pBiasData, 1, NULL);
    biasCopyPrms.src_addr  = Udma_appVirtToPhyFxn((void *)pBias, 1, NULL);
    biasCopyPrms.length    = biasBytes;
    appUdmaCopy1D(NULL, &biasCopyPrms);
    appMemCacheInv(pBiasData, biasBytes);

    // ---- Output accumulator: zero-filled in the staging region before the kernel runs ----
    const size_t outRowBytes = outChPitch * sizeof(Tacc);
    const size_t outBytes    = numOutChannels * outChPitch * sizeof(Tacc);
    uiMemUsedSize = (uiMemUsedSize + (kL2StageAlign - 1u)) & ~(kL2StageAlign - 1u);
    Tacc *pOutData = (Tacc *)(l2Base + uiMemUsedSize);
    memset(pOutData, 0, outBytes);
    uiMemUsedSize += (uint32_t)outBytes;

    // Only the kernel call itself is timed; the staging copies above and the
    // copy-back below are outside the measured interval.
    ullCyclesStart = get_tsc();
    TIDL_refConv2dKernel_i8u_c8s_o32s_3x3s1d1(pInData, pWeightData, pBiasData, pOutData, &min, &max, numTotRoi, numGroups, numInChannels,
                                              numOutChannels, inChPitch, outChPitch, width, height, inImPitch, outImPitch,
                                              coeffsWidth, coeffsHeight, dilationWidth, dilationHeight, strideWidth, strideHeight, params->enableBias);
    ullCyclesEnd = get_tsc();
    printf("Sy-Conv cost cycles %llu\n", (ullCyclesEnd - ullCyclesStart));

    // ---- Copy the result from the staging region back to accPtr ----
    appMemCacheWb(pOutData, outBytes);
    appMemCacheWb(accPtr, outBytes);

    app_udma_copy_2d_prms_t outCopyPrms;
    appUdmaCopy2DPrms_Init(&outCopyPrms);
    outCopyPrms.width      = outRowBytes;
    outCopyPrms.height     = numOutChannels;
    outCopyPrms.dest_pitch = outRowBytes;
    outCopyPrms.src_pitch  = outRowBytes;
    outCopyPrms.dest_addr  = Udma_appVirtToPhyFxn((void *)accPtr, 1, NULL);
    outCopyPrms.src_addr   = Udma_appVirtToPhyFxn((void *)pOutData, 1, NULL);
    appUdmaCopy2D(NULL, &outCopyPrms, 1);
    appMemCacheInv(accPtr, outBytes);
}
This is the code I use on the C66 to stage the data with UDMA before calling the kernel function. gPtrL2RAM is the base address of L2 RAM (0x00800000).
The data transfer results are now correct, but there is no performance improvement. Am I using UDMA correctly?
Could someone help me with this? Thank you very much!