工具/软件:Linux
尊敬的 TI:
我正在 AM5718 板上运行 OpenCV+DSP 加速,SDK 版本是 ti-processor-sdk-linux-rt-am57xx-evm-04.01.00.06。
/*
 * YUV -> RGB conversion coefficients shared by the kernels in this file.
 * As used by the kernels:
 *   [0] scale applied to (Y - 16)
 *   [1] U contribution to blue
 *   [2] U contribution to green
 *   [3] V contribution to green
 *   [4] V contribution to red
 */
__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f, -0.812999725f, 1.5959997177f };
/*
 * Converts a packed 4:2:2 YUV image to 3 bytes per pixel using a single
 * work-item, reading the source and writing the destination directly through
 * the __global pointers, then prints the elapsed __clock() ticks.
 *
 * The image size is hard-coded: 480 rows, 640 pixels per row (2 source bytes
 * and 3 destination bytes per pixel).
 *
 * srcptr: source image; each 4-byte group is loaded as one 32-bit word and
 *         split into bytes used as Y0, U, Y1, V from the lowest byte up
 *         (assumes a little-endian target for that to match memory order
 *         -- TODO confirm). Must be 4-byte aligned for the word load.
 * dstptr: destination image; per pixel dst[0] receives the "B" term, dst[1]
 *         the "G" term and dst[2] the "R" term.
 */
__kernel void YUV422_TO_RGB_OpenCL_1WorkItem(__global uchar* srcptr, __global uchar* dstptr)
{
    __global uchar* src;
    __global uchar* dst;
    /* Loop-invariant, so fetched once instead of once per pixel pair. */
    __constant float* coeffs = c_YUV2RGBCoeffs_420;
    int clk_start, clk_end;

    clk_start = __clock();
    for (int x = 0; x < 480; x++) {          /* x walks image rows */
        for (int y = 0; y < 320; y++) {      /* y walks pixel PAIRS within a row */
            /* One pixel pair = 4 source bytes and 6 destination bytes. */
            src = srcptr + (x * 640 * 2 + (y << 2));
            dst = dstptr + (x * 640 * 3 + ((y << 1) * 3));

            /* Keep the __global address space on the cast; dropping it is
             * not a valid OpenCL C pointer conversion. */
            int load_src = *((__global int*) src);
            float vec_src[4] = { load_src & 0xff,
                                 (load_src >> 8) & 0xff,
                                 (load_src >> 16) & 0xff,
                                 (load_src >> 24) & 0xff };

            float U = vec_src[1] - 128;
            float V = vec_src[3] - 128;
            float Y00 = max(0.f, vec_src[0] - 16.f) * coeffs[0];
            float Y01 = max(0.f, vec_src[2] - 16.f) * coeffs[0];

            /* The chroma terms are shared by both pixels of the pair; the
             * +0.5f is folded in here, ahead of the saturating conversion. */
            float RUV = ((coeffs[4] * V) + 0.5f);
            float GUV = ((coeffs[3] * V) + (coeffs[2] * U) + 0.5f);
            float BUV = ((coeffs[1] * U) + 0.5f);

            dst[2] = convert_uchar_sat(Y00 + RUV);
            dst[1] = convert_uchar_sat(Y00 + GUV);
            dst[0] = convert_uchar_sat(Y00 + BUV);
            dst[5] = convert_uchar_sat(Y01 + RUV);
            dst[4] = convert_uchar_sat(Y01 + GUV);
            dst[3] = convert_uchar_sat(Y01 + BUV);
        }
    }
    clk_end = __clock();
    printf("TIDSP clockdiff=%d\n", clk_end - clk_start);
    //////////////////////////////// 66549941
}
2.使用 EDMA 的 OpenCL 内核:
/*
 * Single-work-item 4:2:2 YUV -> 3-bytes-per-pixel conversion that stages
 * source rows into a `local` line buffer with EdmaMgr transfers instead of
 * reading each pixel pair through the __global source pointer. Destination
 * pixels are still written directly through dstptr. Prints the elapsed
 * __clock() ticks.
 *
 * The image size is fixed by the locals below: 480 rows of 640 pixels
 * (cols*2 source bytes and cols*3 destination bytes per row).
 *
 * srcptr: source image, 4 bytes per pixel pair used as Y0, U, Y1, V from the
 *         lowest byte of the loaded 32-bit word up.
 * dstptr: destination image; per pixel dst[0] receives the "B" term, dst[1]
 *         the "G" term and dst[2] the "R" term.
 *
 * LINES_CACHED and MAX_LINE_SIZE are not defined in this block; they are
 * presumably macros defined earlier in the file -- TODO confirm.
 * NOTE(review): the `& LINES_CACHED` index wrap is only a modulo over the
 * LINES_CACHED + 1 line buffers if LINES_CACHED + 1 is a power of two, and
 * y_ptr[1] below requires LINES_CACHED >= 2 -- confirm the macro value.
 */
__kernel void YUV422_TO_RGB_OpenCL_1WorkItem_EDMA(__global uchar* srcptr, __global uchar* dstptr)
{
    int cols = 640;
    int rows = 480;
    uchar *y_ptr[LINES_CACHED];
    uchar *ycurr_ptr, *dest_ptr, *dst, *src;
    int rd_idx, start_rd_idx, fetch_rd_idx;
    EdmaMgr_Handle evIN = EdmaMgr_alloc(LINES_CACHED);
    local uchar img_lines[LINES_CACHED + 1][MAX_LINE_SIZE];
    /* Loop-invariant, so fetched once instead of once per pixel pair. */
    __constant float* coeffs = c_YUV2RGBCoeffs_420;
    int i, kk;
    int clk_start, clk_end;

    clk_start = __clock();
    if (!evIN) { printf("无法分配 edmaIN1句柄。\n"); return; }
    dest_ptr = (uchar *) dstptr;

    /* Clear every line buffer before any transfer lands in it. */
    for (i = 0; i < (LINES_CACHED + 1); i++) {
        memset((void *) img_lines[i], 0, MAX_LINE_SIZE);
    }

    /* Prologue: start copies of the first source row (cols*2 bytes) into line
     * buffers 1 .. LINES_CACHED-1. Every iteration uses the same source
     * address. */
    for (i = 1; i < LINES_CACHED; i++) {
        EdmaMgr_copy1D1D(evIN, (void *)(srcptr), (void *)(img_lines[i]), cols * 2);
    }
    fetch_rd_idx = cols * 2;   /* byte offset of the next source row to fetch */
    start_rd_idx = 0;

    for (int x = 0; x < rows; x++) {
        /* Wait for the outstanding transfer before touching the buffers. */
        EdmaMgr_wait(evIN);

        /* Build the window of line pointers starting at start_rd_idx; after
         * the loop rd_idx names the buffer just past the window, which is
         * the target of the next prefetch. */
        rd_idx = start_rd_idx;
        for (kk = 0; kk < LINES_CACHED; kk++) {
            y_ptr[kk] = (uchar *) img_lines[rd_idx];
            rd_idx = (rd_idx + 1) & LINES_CACHED;
        }
        start_rd_idx = (start_rd_idx + 1) & LINES_CACHED;

        /* Kick off the fetch of the next source row; it runs while the
         * current row is converted below.
         * NOTE(review): on the final iteration the source offset is
         * rows*cols*2, i.e. the first byte past a 640x480x2 image -- confirm
         * the source allocation tolerates this read. */
        EdmaMgr_copyFast(evIN, (void *)(srcptr + fetch_rd_idx), (void *)(img_lines[rd_idx]));
        fetch_rd_idx += cols * 2;

        /* NOTE(review): index reconstructed as the constant 1 (second line of
         * the window); the posted text was garbled here -- confirm. */
        ycurr_ptr = (uchar *) y_ptr[1];

        for (int y = 0; y < cols / 2; y++) {   /* y walks pixel PAIRS */
            /* One pixel pair = 4 staged source bytes and 6 destination bytes. */
            src = ycurr_ptr + (y << 2);
            dst = dest_ptr + (x * cols * 3 + ((y << 1) * 3));

            int load_src = *((int *) src);
            float vec_src[4] = { load_src & 0xff,
                                 (load_src >> 8) & 0xff,
                                 (load_src >> 16) & 0xff,
                                 (load_src >> 24) & 0xff };

            float U = vec_src[1] - 128;
            float V = vec_src[3] - 128;
            float Y00 = max(0.f, vec_src[0] - 16.f) * coeffs[0];
            float Y01 = max(0.f, vec_src[2] - 16.f) * coeffs[0];

            /* The chroma terms are shared by both pixels of the pair; the
             * +0.5f is folded in here, ahead of the saturating conversion. */
            float RUV = ((coeffs[4] * V) + 0.5f);
            float GUV = ((coeffs[3] * V) + (coeffs[2] * U) + 0.5f);
            float BUV = ((coeffs[1] * U) + 0.5f);

            dst[2] = convert_uchar_sat(Y00 + RUV);
            dst[1] = convert_uchar_sat(Y00 + GUV);
            dst[0] = convert_uchar_sat(Y00 + BUV);
            dst[5] = convert_uchar_sat(Y01 + RUV);
            dst[4] = convert_uchar_sat(Y01 + GUV);
            dst[3] = convert_uchar_sat(Y01 + BUV);
        }
    }

    /* Drain the last prefetch before releasing the channel. */
    EdmaMgr_wait(evIN);
    EdmaMgr_free(evIN);
    clk_end = __clock();
    printf("TIDSP clockdiff=%d\n", clk_end - clk_start);
    /////////////////////////////////// 66420487
}
3.结果显示两个内核的执行时间几乎相同:不使用 EDMA 时为 66549941,使用 EDMA 时为 66420487。
4.我的问题是:为什么使用 EDMA 之后,DSP 加速的性能没有明显提升?
谢谢。