What is the physical address of the C66x L2 RAM? Is it 0x00800000? Why is there no change in performance when I use memcpy to place the data at this address before running the computation?
This thread has been locked.
If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.
if ((coeffsHeight == 3) && (dilationHeight == 1) && (typeid(Tin) == typeid(uint8_t)) && height/strideHeight >= 8)
{
    /*
     * Staged path: copy the input, weights and bias into the scratch region
     * that starts at gPtrL2RAM using UDMA, run the convolution kernel on the
     * staged copies, then copy the accumulators back to accPtr with UDMA.
     *
     * NOTE(review): the kernel name says 3x3 / stride 1 / dilation 1, but the
     * guard above only checks coeffsHeight and dilationHeight - confirm that
     * coeffsWidth, the strides and dilationWidth are constrained by the caller.
     * NOTE(review): nothing here checks that the four staged buffers fit into
     * the region behind gPtrL2RAM - confirm its capacity against the sizes below.
     * NOTE(review): the results of appUdmaCopy2D/appUdmaCopy1D are ignored, as
     * in the original code - consider checking them if they report a status.
     */
    printf("Sy-gPtrL2RAM:%p--------------------------------20220303_1\n", gPtrL2RAM); /* prints 0x00800000 on the author's setup */

    /*
     * Every staged buffer starts on a 128-byte boundary. Packing them
     * back-to-back at raw byte offsets (as before) could leave the Tb/Tacc
     * buffers misaligned for their element type after the 8-bit input and
     * weight data, and made neighbouring buffers share a cache line across
     * the write-back/invalidate calls below.
     * 128 is presumably >= the largest cache-line size of this DSP - confirm.
     */
    const uintptr_t alignMask = (uintptr_t)128U - 1U;

    /* Total byte sizes of the four staged buffers. */
    const uint32_t inBytes     = (uint32_t)(numInChannels * inChPitch * sizeof(Tin));
    const uint32_t weightBytes = (uint32_t)(numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw));
    const uint32_t biasBytes   = (uint32_t)(numOutChannels * sizeof(Tb));
    const uint32_t outBytes    = (uint32_t)(numOutChannels * outChPitch * sizeof(Tacc));

    /* Carve the scratch region into aligned sub-buffers, in the original order. */
    uintptr_t scratchAddr = ((uintptr_t)gPtrL2RAM + alignMask) & ~alignMask;
    Tin *pInData = (Tin *)scratchAddr;
    scratchAddr = (scratchAddr + inBytes + alignMask) & ~alignMask;
    Tw *pWeightData = (Tw *)scratchAddr;
    scratchAddr = (scratchAddr + weightBytes + alignMask) & ~alignMask;
    Tb *pBiasData = (Tb *)scratchAddr;
    scratchAddr = (scratchAddr + biasBytes + alignMask) & ~alignMask;
    Tacc *pOutData = (Tacc *)scratchAddr;

    /*
     * Input: one DMA row per input channel.
     * Cache maintenance brackets each transfer: write back source and
     * destination before the DMA, invalidate the destination afterwards.
     * The DMA is given addresses translated by Udma_appVirtToPhyFxn; using the
     * raw pointers was reported to fail with
     * "UDMA : ERROR: TR Response not completed!!".
     * NOTE(review): confirm the width/height fields of the 2D parameter
     * struct are wide enough for these byte/row counts.
     */
    appMemCacheWb(pInChannel, inBytes);
    appMemCacheWb(pInData, inBytes);
    app_udma_copy_2d_prms_t prms_2d_pInBuf;
    appUdmaCopy2DPrms_Init(&prms_2d_pInBuf);
    prms_2d_pInBuf.width      = inChPitch * sizeof(Tin);
    prms_2d_pInBuf.height     = numInChannels;
    prms_2d_pInBuf.dest_pitch = inChPitch * sizeof(Tin);
    prms_2d_pInBuf.src_pitch  = inChPitch * sizeof(Tin);
    prms_2d_pInBuf.dest_addr  = Udma_appVirtToPhyFxn((void *)pInData, 1, NULL);
    prms_2d_pInBuf.src_addr   = Udma_appVirtToPhyFxn((void *)pInChannel, 1, NULL);
    appUdmaCopy2D(NULL, &prms_2d_pInBuf, 1);
    appMemCacheInv(pInData, inBytes);

    /* Weights: one DMA row per (output channel, input channel) filter. */
    appMemCacheWb(pCoeffs, weightBytes);
    appMemCacheWb(pWeightData, weightBytes);
    app_udma_copy_2d_prms_t prms_2d_pWeightBuf;
    appUdmaCopy2DPrms_Init(&prms_2d_pWeightBuf);
    prms_2d_pWeightBuf.width      = coeffsHeight * coeffsWidth * sizeof(Tw);
    prms_2d_pWeightBuf.height     = numOutChannels * numInChannels;
    prms_2d_pWeightBuf.dest_pitch = coeffsHeight * coeffsWidth * sizeof(Tw);
    prms_2d_pWeightBuf.src_pitch  = coeffsHeight * coeffsWidth * sizeof(Tw);
    prms_2d_pWeightBuf.dest_addr  = Udma_appVirtToPhyFxn((void *)pWeightData, 1, NULL);
    prms_2d_pWeightBuf.src_addr   = Udma_appVirtToPhyFxn((void *)pCoeffs, 1, NULL);
    appUdmaCopy2D(NULL, &prms_2d_pWeightBuf, 1);
    appMemCacheInv(pWeightData, weightBytes);

    /* Bias: a single contiguous 1D transfer. */
    appMemCacheWb(pBias, biasBytes);
    appMemCacheWb(pBiasData, biasBytes);
    app_udma_copy_1d_prms_t prms_1d_pBaisBuf;
    appUdmaCopy1DPrms_Init(&prms_1d_pBaisBuf);
    prms_1d_pBaisBuf.dest_addr = Udma_appVirtToPhyFxn((void *)pBiasData, 1, NULL);
    prms_1d_pBaisBuf.src_addr  = Udma_appVirtToPhyFxn((void *)pBias, 1, NULL);
    prms_1d_pBaisBuf.length    = numOutChannels * sizeof(Tb);
    appUdmaCopy1D(NULL, &prms_1d_pBaisBuf);
    appMemCacheInv(pBiasData, biasBytes);

    /* Accumulator buffer is zeroed before the kernel runs. */
    memset(pOutData, 0, numOutChannels * outChPitch * sizeof(Tacc));

    /* Only the kernel call is timed; the DMA staging above and below is excluded. */
    ullCyclesStart = get_tsc();
    TIDL_refConv2dKernel_i8u_c8s_o32s_3x3s1d1(pInData, pWeightData, pBiasData, pOutData, &min, &max, numTotRoi, numGroups, numInChannels,
                                              numOutChannels, inChPitch, outChPitch, width, height, inImPitch, outImPitch,
                                              coeffsWidth, coeffsHeight, dilationWidth, dilationHeight, strideWidth, strideHeight, params->enableBias);
    ullCyclesEnd = get_tsc();
    printf("Sy-Conv cost cycles %llu\n", (ullCyclesEnd - ullCyclesStart));

    /* Output: copy the accumulators from the scratch region back to accPtr. */
    appMemCacheWb(pOutData, outBytes);
    appMemCacheWb(accPtr, outBytes);
    app_udma_copy_2d_prms_t prms_2d_pOutBuf;
    appUdmaCopy2DPrms_Init(&prms_2d_pOutBuf);
    prms_2d_pOutBuf.width      = outChPitch * sizeof(Tacc);
    prms_2d_pOutBuf.height     = numOutChannels;
    prms_2d_pOutBuf.dest_pitch = outChPitch * sizeof(Tacc);
    prms_2d_pOutBuf.src_pitch  = outChPitch * sizeof(Tacc);
    prms_2d_pOutBuf.dest_addr  = Udma_appVirtToPhyFxn((void *)accPtr, 1, NULL);
    prms_2d_pOutBuf.src_addr   = Udma_appVirtToPhyFxn((void *)pOutData, 1, NULL);
    appUdmaCopy2D(NULL, &prms_2d_pOutBuf, 1);
    appMemCacheInv(accPtr, outBytes);
}
这段代码是我用 UDMA 拷贝实现的。现在数据搬运的结果是对的,但是测得的 cycles 基本没有变化。用 UDMA 搬运数据,性能不是应该会快很多吗?
是我UDMA拷贝用的不对还是L2没用起来?
如果地址不经过 Udma_appVirtToPhyFxn 这个函数转换,而是直接使用原本的地址,就会报错:UDMA : ERROR: TR Response not completed!!
gPtrL2RAM这个地址就是0x00800000