This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

TDA4VM: C66

Part Number: TDA4VM


// Fast path taken when the kernel is 3 rows tall, has no vertical dilation,
// the input element type is uint8_t, and height/strideHeight >= 8.
// It stages the input, weights and bias into a scratch region that starts at
// gPtrL2RAM using appUdmaCopy1D/2D, runs the convolution kernel on the staged
// copies, then copies the accumulator output back to accPtr with another DMA.
// NOTE(review): the kernel name reads as "3x3, stride 1, dilation 1", but this
// guard does not test coeffsWidth, dilationWidth, strideWidth or strideHeight
// -- confirm that an outer check guarantees them.
if ((coeffsHeight == 3) && (dilationHeight== 1) && (typeid(Tin) == typeid(uint8_t)) && height/strideHeight >= 8)
    {
      printf("Sy-gPtrL2RAM:%p--------------------------------20220303_1\n", gPtrL2RAM); //gPtrL2RAM:0x00800000
      // Running byte offset from gPtrL2RAM; each staged buffer is placed at the
      // current offset and the offset is then advanced by that buffer's size.
      // NOTE(review): nothing in this block checks that the total fits in the
      // region behind gPtrL2RAM, and no alignment padding is inserted between
      // buffers -- confirm capacity and alignment requirements.
      uint32_t uiMemUsedSize = 0;

      //Input
      // NOTE(review): the (uint32_t) cast of gPtrL2RAM assumes pointers fit in
      // 32 bits on this target -- confirm.
      Tin  *pInData = (Tin *)(uiMemUsedSize + (uint32_t)gPtrL2RAM);   
      uiMemUsedSize += (uint32_t)(numInChannels * inChPitch * sizeof(Tin));

      // Cache write-back is issued on both the source and the destination range
      // before the DMA; the destination is invalidated after the copy (below).
      appMemCacheWb(pInChannel, numInChannels * inChPitch * sizeof(Tin));
      appMemCacheWb(pInData, numInChannels * inChPitch * sizeof(Tin));
      
      // 2D copy with one row per input channel. Row width equals both pitches,
      // so the rows are back-to-back in source and destination, i.e. the
      // transfer covers one contiguous range of numInChannels * inChPitch elements.
      app_udma_copy_2d_prms_t prms_2d_pInBuf;
      appUdmaCopy2DPrms_Init(&prms_2d_pInBuf);
      prms_2d_pInBuf.width        = inChPitch * sizeof(Tin);
      prms_2d_pInBuf.height       = numInChannels;
      prms_2d_pInBuf.dest_pitch   = inChPitch * sizeof(Tin);
      prms_2d_pInBuf.src_pitch    = inChPitch * sizeof(Tin);
      // NOTE(review): confirm that Udma_appVirtToPhyFxn returns an address the
      // UDMA engine can reach for a pointer derived from gPtrL2RAM (a DSP-local
      // L2 address may need translating to its SoC-level alias).
      prms_2d_pInBuf.dest_addr    = Udma_appVirtToPhyFxn((void *)pInData, 1, NULL);  //Udma_appVirtToPhyFxn
      prms_2d_pInBuf.src_addr     = Udma_appVirtToPhyFxn((void *)pInChannel, 1, NULL);  //appMemGetVirt2PhyBufPtr 
      //prms_2d_pInBuf.dest_addr    = (uint64_t)pInData;
      //prms_2d_pInBuf.src_addr     = (uint64_t)pInChannel;  
      // NOTE(review): the NULL first argument presumably selects a default
      // channel and the trailing 1 presumably is the transfer count -- confirm
      // against the app_udma API.
      appUdmaCopy2D(NULL, &prms_2d_pInBuf, 1);

      appMemCacheInv(pInData, numInChannels * inChPitch * sizeof(Tin));

      //Weight
      // Staged directly after the input buffer.
      Tw   *pWeightData = (Tw *)(uiMemUsedSize + (uint32_t)gPtrL2RAM);
      uiMemUsedSize += (uint32_t)(numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw));

      appMemCacheWb(pCoeffs, numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw));
      appMemCacheWb(pWeightData, numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw));

      // 2D copy with one row per (output channel, input channel) filter; each
      // row is coeffsHeight * coeffsWidth elements and rows are contiguous
      // because width equals both pitches.
      app_udma_copy_2d_prms_t prms_2d_pWeightBuf;
      appUdmaCopy2DPrms_Init(&prms_2d_pWeightBuf);
      prms_2d_pWeightBuf.width        = coeffsHeight * coeffsWidth * sizeof(Tw);
      prms_2d_pWeightBuf.height       = numOutChannels * numInChannels;
      prms_2d_pWeightBuf.dest_pitch   = coeffsHeight * coeffsWidth * sizeof(Tw);
      prms_2d_pWeightBuf.src_pitch    = coeffsHeight * coeffsWidth * sizeof(Tw);  
      prms_2d_pWeightBuf.dest_addr    = Udma_appVirtToPhyFxn((void *)pWeightData, 1, NULL); 
      prms_2d_pWeightBuf.src_addr     = Udma_appVirtToPhyFxn((void *)pCoeffs, 1, NULL);  
      //prms_2d_pWeightBuf.dest_addr    = (uint64_t)pWeightData; 
      //prms_2d_pWeightBuf.src_addr     = (uint64_t)pCoeffs;
      appUdmaCopy2D(NULL, &prms_2d_pWeightBuf, 1);   

      appMemCacheInv(pWeightData, numOutChannels * numInChannels * coeffsHeight * coeffsWidth * sizeof(Tw));

      //Bias
      // Staged directly after the weights; one Tb element per output channel,
      // moved with a single 1D copy.
      Tb   *pBiasData = (Tb *)(uiMemUsedSize + (uint32_t)gPtrL2RAM);
      uiMemUsedSize += (uint32_t)(numOutChannels * sizeof(Tb));

      appMemCacheWb(pBias, numOutChannels * sizeof(Tb));
      appMemCacheWb(pBiasData, numOutChannels * sizeof(Tb));

      app_udma_copy_1d_prms_t prms_1d_pBaisBuf;
      appUdmaCopy1DPrms_Init(&prms_1d_pBaisBuf);
      prms_1d_pBaisBuf.dest_addr    = Udma_appVirtToPhyFxn((void *)pBiasData, 1, NULL); 
      prms_1d_pBaisBuf.src_addr     = Udma_appVirtToPhyFxn((void *)pBias, 1, NULL); 
      //prms_1d_pBaisBuf.dest_addr    = (uint64_t)pBiasData; 
      //prms_1d_pBaisBuf.src_addr     = (uint64_t)pBias;
      prms_1d_pBaisBuf.length       = numOutChannels * sizeof(Tb); 
      appUdmaCopy1D(NULL, &prms_1d_pBaisBuf); 

      appMemCacheInv(pBiasData, numOutChannels * sizeof(Tb));       

      // Output accumulator staging buffer, placed after the bias and zeroed by
      // the CPU (memset) rather than by DMA before the kernel runs.
      Tacc  *pOutData = (Tacc *)(uiMemUsedSize + (uint32_t)gPtrL2RAM);
      memset(pOutData, 0, numOutChannels * outChPitch * sizeof(Tacc));
      uiMemUsedSize += (uint32_t)(numOutChannels * outChPitch * sizeof(Tacc));
    
      // The cycle measurement brackets only the kernel call below; the DMA
      // transfers and cache operations before and after it are not included
      // in the printed figure.
      ullCyclesStart = get_tsc();

      // The kernel reads the staged input/weight/bias copies and writes the
      // staged output buffer.
      TIDL_refConv2dKernel_i8u_c8s_o32s_3x3s1d1(pInData, pWeightData, pBiasData, pOutData , &min, &max, numTotRoi, numGroups, numInChannels,
        numOutChannels, inChPitch, outChPitch, width, height, inImPitch, outImPitch,
        coeffsWidth, coeffsHeight, dilationWidth, dilationHeight, strideWidth, strideHeight, params->enableBias);
      
      ullCyclesEnd = get_tsc();   
      
      printf("Sy-Conv cost cycles %llu\n", (ullCyclesEnd - ullCyclesStart));

      // Copy the result back from the staging buffer to accPtr: write-back on
      // both ranges, DMA, then invalidate the destination.
      appMemCacheWb(pOutData, numOutChannels * outChPitch * sizeof(Tacc));
      appMemCacheWb(accPtr, numOutChannels * outChPitch * sizeof(Tacc));

      // 2D copy with one row per output channel; rows are contiguous because
      // width equals both pitches.
      app_udma_copy_2d_prms_t prms_2d_pOutBuf;
      appUdmaCopy2DPrms_Init(&prms_2d_pOutBuf);
      prms_2d_pOutBuf.width        = outChPitch * sizeof(Tacc);
      prms_2d_pOutBuf.height       = numOutChannels;
      prms_2d_pOutBuf.dest_pitch   = outChPitch * sizeof(Tacc);
      prms_2d_pOutBuf.src_pitch    = outChPitch * sizeof(Tacc);
      prms_2d_pOutBuf.dest_addr    = Udma_appVirtToPhyFxn((void *)accPtr, 1, NULL);  //Udma_appVirtToPhyFxn
      prms_2d_pOutBuf.src_addr     = Udma_appVirtToPhyFxn((void *)pOutData, 1, NULL);  //Udma_appPhyToVirtFxn
      //prms_2d_pOutBuf.dest_addr    = (uint64_t)accPtr;
      //prms_2d_pOutBuf.src_addr     = (uint64_t)pOutData;  
      appUdmaCopy2D(NULL, &prms_2d_pOutBuf, 1);

      appMemCacheInv(accPtr, numOutChannels * outChPitch * sizeof(Tacc));  
    } 

This is the code I use to call the function, using UDMA to transfer the data on the C66. gPtrL2RAM is the base address of L2 (0x00800000).

The data transfer results are now correct, but there is no performance improvement. Am I using UDMA correctly?

Who can answer for me? Thank you very much!!

  • Hi, we've received the issue and escalated it to E2E; please expect a response. Thanks!

  • Hi,

    Sorry for the delayed response. Could you please provide the following information:

    1. What is the address of pCoeffs? Is it also in C66 L2?

    2. Can you confirm how you verified that the data was correct? You used 0x00800000 for gPtrL2RAM, which is the C66-local view. UDMA needs the address as seen from the SoC side, which should be 0x4D80800000, so we're not sure how the data movement could be correct.

    If you are using UDMA to move data within the C66 L2, the UDMA may not achieve optimal bandwidth.

    Thanks.