您好!
我已经创建了一个 OpenVX 内核,它将一个 vx_tensor 复制到另一个 vx_tensor。每个处理器核都被指定为该内核的目标,唯一的区别是 A72 使用 memcpy,而 C7x、C66 和 R5F 使用 UDMA。
下面是我复制一个大小为864*128*sizeof(float)=442368字节的 vx_tensor 得到的数字
| 处理器核 | memcpy | appUdmaCopy1D |
| --- | --- | --- |
| A72 | 670 µs | 不适用 |
| C66 | 4889 µs | 2237 µs |
| C7x | 1616 µs | 2003 µs |
| R5F | 18097 µs | 5794 µs |
我得到的数字非常令人失望:UDMA 似乎比 A72 上的 memcpy 还慢。在 C7x 上,UDMA 甚至比在同一个核上执行 memcpy 还要慢。
这些数字是否正常? 我没有找到 uDMA 的任何规格。
以下是我的(简化的)内核代码:
/**
 * Copy nBytes from the source tensor buffer to the destination tensor buffer
 * using a single appUdmaCopy1D() transfer.
 *
 * Both buffer addresses are obtained by passing the descriptors'
 * mem_ptr.shared_ptr through tivxMemShared2PhysPtr().
 *
 * @param nBytes         Number of bytes to transfer; 0 is rejected.
 * @param srcDescriptor  Tensor descriptor of the source; NULL is rejected.
 * @param dstDescriptor  Tensor descriptor of the destination; NULL is rejected.
 * @return true when appUdmaCopy1D() returns 0, false on a rejected argument
 *         or any non-zero return from appUdmaCopy1D().
 */
bool TensorBlockCpyDma(const size_t nBytes,
                       const tivx_obj_desc_tensor_t* const srcDescriptor,
                       const tivx_obj_desc_tensor_t* const dstDescriptor)
{
    /* Guard clause: nothing to transfer, or a descriptor is missing. */
    if ((0U == nBytes) || (NULL == srcDescriptor) || (NULL == dstDescriptor))
    {
        VX_PRINT(VX_ZONE_ERROR, "Invalid input pointer\n");
        return false;
    }

    const uint64_t srcAddr =
        tivxMemShared2PhysPtr(srcDescriptor->mem_ptr.shared_ptr, VX_MEMORY_TYPE_HOST);
    const uint64_t dstAddr =
        tivxMemShared2PhysPtr(dstDescriptor->mem_ptr.shared_ptr, VX_MEMORY_TYPE_HOST);

    /* Start from the library defaults, then describe the flat transfer. */
    app_udma_copy_1d_prms_t copyPrms;
    appUdmaCopy1DPrms_Init(&copyPrms);
    copyPrms.dest_addr = dstAddr;
    copyPrms.src_addr  = srcAddr;
    copyPrms.length    = nBytes;

    /* NULL channel handle: presumably selects the default channel -- confirm
     * against the app_udma API documentation. */
    return (0 == appUdmaCopy1D(NULL, &copyPrms));
}
/**
 * Process callback of the tensor-copy target kernel.
 *
 * Copies the source tensor buffer into the destination tensor buffer: with
 * memcpy() when built with A72 defined, otherwise through
 * TensorBlockCpyDma(). The duration of a successful copy is logged at
 * VX_ZONE_WARNING.
 *
 * @param kernel      Target kernel instance (not used here).
 * @param obj_desc    Kernel parameter descriptors; the entries at
 *                    TIVX_KERNEL_TENSORCPY_SRC_IDX and
 *                    TIVX_KERNEL_TENSORCPY_DST_IDX are read as tensor
 *                    descriptors.
 * @param num_params  Must equal TIVX_KERNEL_TENSORCPY_MAX_PARAMS.
 * @param priv_arg    Not used here.
 * @return VX_SUCCESS when the copy completed. VX_FAILURE when the parameters
 *         are invalid, the destination buffer is smaller than the source, or
 *         the DMA copy failed. Map/unmap errors are folded into the result
 *         through tivxCheckStatus().
 */
static vx_status VX_CALLBACK tivxTensorcpyProcess(
    tivx_target_kernel_instance kernel,
    tivx_obj_desc_t *obj_desc[],
    uint16_t num_params, void *priv_arg)
{
    vx_status status = (vx_status)VX_SUCCESS;
    const tivx_obj_desc_tensor_t *src_desc = NULL;
    const tivx_obj_desc_tensor_t *dst_desc = NULL;

    (void)kernel;
    (void)priv_arg;

    /* num_params is tested first so obj_desc[] is only indexed when the
     * parameter count is the expected one. */
    if (   (num_params != TIVX_KERNEL_TENSORCPY_MAX_PARAMS)
        || (NULL == obj_desc[TIVX_KERNEL_TENSORCPY_SRC_IDX])
        || (NULL == obj_desc[TIVX_KERNEL_TENSORCPY_DST_IDX]))
    {
        status = (vx_status)VX_FAILURE;
    }

    if ((vx_status)VX_SUCCESS == status)
    {
        src_desc = (const tivx_obj_desc_tensor_t *)obj_desc[TIVX_KERNEL_TENSORCPY_SRC_IDX];
        dst_desc = (const tivx_obj_desc_tensor_t *)obj_desc[TIVX_KERNEL_TENSORCPY_DST_IDX];

        /* The copy length is the source size, so the destination must be at
         * least that large or the copy would write past its end. */
        if (dst_desc->mem_size < src_desc->mem_size)
        {
            VX_PRINT(VX_ZONE_ERROR,
                "Destination tensor (%u bytes) is smaller than source tensor (%u bytes)\n",
                dst_desc->mem_size, src_desc->mem_size);
            status = (vx_status)VX_FAILURE;
        }
    }

    if ((vx_status)VX_SUCCESS == status)
    {
        void *src_target_ptr = tivxMemShared2TargetPtr(&src_desc->mem_ptr);
        void *dst_target_ptr = tivxMemShared2TargetPtr(&dst_desc->mem_ptr);

        tivxCheckStatus(&status, tivxMemBufferMap(src_target_ptr,
            src_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST,
            (vx_enum)VX_READ_ONLY));
        tivxCheckStatus(&status, tivxMemBufferMap(dst_target_ptr,
            dst_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST,
            (vx_enum)VX_WRITE_ONLY));

        /* Only touch the buffers when both map calls reported success. */
        if ((vx_status)VX_SUCCESS == status)
        {
            /* call kernel processing function */
            uint32_t start = tivxPlatformGetTimeInUsecs();
#ifdef A72
            memcpy(dst_target_ptr, src_target_ptr, src_desc->mem_size);
#else
            if (!TensorBlockCpyDma(src_desc->mem_size, src_desc, dst_desc))
            {
                VX_PRINT(VX_ZONE_ERROR, "TensorBlockCpyDma failed\n");
                /* Report the failed copy to the framework instead of
                 * returning VX_SUCCESS with an unwritten destination. */
                status = (vx_status)VX_FAILURE;
            }
#endif
            uint32_t delta = tivxPlatformGetTimeInUsecs() - start;
            if ((vx_status)VX_SUCCESS == status)
            {
                VX_PRINT(VX_ZONE_WARNING, "TensorBlockCpyDma copied %u bytes in %u us\n", src_desc->mem_size, delta);
            }
            /* kernel processing function complete */
        }

        /* Unmap mirrors the map calls above, whatever the copy result. */
        tivxCheckStatus(&status, tivxMemBufferUnmap(src_target_ptr,
            src_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST,
            (vx_enum)VX_READ_ONLY));
        tivxCheckStatus(&status, tivxMemBufferUnmap(dst_target_ptr,
            dst_desc->mem_size, (vx_enum)VX_MEMORY_TYPE_HOST,
            (vx_enum)VX_WRITE_ONLY));
    }

    return status;
}
在 A72 上,该内核调用 memcpy() 而不是 appUdmaCopy1D()。
谢谢!
弗雷德