This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

[参考译文] Linux/PROCESSOR-SDK-AM57X:使用 EdmaMgr 的 OpenCL+DSP 加速无效

Guru**** 2538960 points
Other Parts Discussed in Thread: AM5718

请注意,本文内容源自机器翻译,可能存在语法或其它翻译错误,仅供参考。如需获取准确内容,请参阅链接中的英语原文或自行翻译。

https://e2e.ti.com/support/processors-group/processors/f/processors-forum/798807/linux-processor-sdk-am57x-opencl-dsp-acceleration-using-edmamgr-has-no-effect

器件型号:PROCESSOR-SDK-AM57X

工具/软件:Linux

尊敬的 TI:

   我正在 AM5718板上执行 OpenCV+DSP 加速,SDK 版本是 ti-processor-sdk-linux-rt-am57xx-evm-04.01.00.06。

根据 http://software-dl.ti.com/processor-sdk-linux/esd/docs/latest/linux/Foundational_Components_OpenCV.html#creating-opencl-c-kernel-optimized-for-c66-core 中的文档 


/*
 * YUV -> RGB conversion coefficients shared by both kernels below.
 * As used by the kernels: [0] scales (Y - 16), [1] is the U term of blue,
 * [2] the U term of green, [3] the V term of green, [4] the V term of red.
 */
__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f, -0.812999725f, 1.5959997177f };

/*
 * Converts a 640x480 packed YUV 4:2:2 image to 3-byte-per-pixel output in a
 * single work-item, reading and writing global memory directly (no EDMA).
 *
 * srcptr: input image, 2 bytes per pixel (640*2 bytes per row).
 * dstptr: output image, 3 bytes per pixel (640*3 bytes per row), written in
 *         B, G, R byte order.
 *
 * Prints the elapsed __clock() ticks for the whole conversion.
 */
__kernel void YUV422_TO_RGB_OpenCL_1WorkItem(__global uchar* srcptr, __global uchar* dstptr)
{
    __global uchar* src;
    __global uchar* dst;

    int clk_start, clk_end;
    clk_start = __clock();

    /* x walks the 480 rows, y walks the 320 two-pixel groups of each row. */
    for (int x = 0; x < 480; x++) {
        for (int y = 0; y < 320; y++) {
            src = srcptr + (x*640*2 + (y << 2));        /* 4 input bytes per group  */
            dst = dstptr + (x*640*3 + ((y << 1)*3));    /* 6 output bytes per group */

            __constant float* coeffs = c_YUV2RGBCoeffs_420;

            /*
             * Load the whole 4-byte group with one word access. The pointer
             * keeps the __global qualifier so the cast does not change
             * address space. Assumes srcptr is 4-byte aligned -- TODO confirm.
             */
            int load_src = *((__global const int*) src);
            /* Bytes of the word, lowest-order first: Y0, U, Y1, V (per the uses below). */
            float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff };
            float U = vec_src[1] - 128;
            float V = vec_src[3] - 128;
            float Y00 = max(0.f, vec_src[0] - 16.f) * coeffs[0];
            float Y01 = max(0.f, vec_src[2] - 16.f) * coeffs[0];
            /* Chroma contributions; the +0.5f rounds before the saturating convert. */
            float ruv = ((coeffs[4]*V) + 0.5f);
            float guv = ((coeffs[3]*V) + (coeffs[2]*U) + 0.5f);
            float buv = ((coeffs[1]*U) + 0.5f);

            /* First pixel of the group. */
            dst[2] = convert_uchar_sat(Y00 + ruv);
            dst[1] = convert_uchar_sat(Y00 + guv);
            dst[0] = convert_uchar_sat(Y00 + buv);
            /* Second pixel of the group (shares U and V with the first). */
            dst[5] = convert_uchar_sat(Y01 + ruv);
            dst[4] = convert_uchar_sat(Y01 + guv);
            dst[3] = convert_uchar_sat(Y01 + buv);
        }
    }

    clk_end = __clock();
    printf("TIDSP clockdiff=%d\n", clk_end - clk_start);
}
//////////////////////////////// 66549941

2. 使用 EDMA 的 OpenCL 内核:

/*
 * Same 640x480 packed YUV 4:2:2 conversion as the non-EDMA kernel, but each
 * input row is fetched with EdmaMgr into the local img_lines buffer while the
 * previously fetched row is converted.
 *
 * srcptr: input image, 2 bytes per pixel (cols*2 bytes per row).
 * dstptr: output image, 3 bytes per pixel, written in B, G, R byte order.
 *
 * LINES_CACHED and MAX_LINE_SIZE are defined outside this listing.
 * Prints the elapsed __clock() ticks; returns early (after printing a
 * message) if no EDMA handle can be allocated.
 *
 * NOTE(review): only the input is staged through EDMA; every output byte is
 * still stored straight to dstptr and the per-pixel float math is unchanged,
 * which may be why the timing matches the non-EDMA kernel -- confirm by
 * profiling.
 */
__kernel void YUV422_TO_RGB_OpenCL_1WorkItem_EDMA(__global uchar* srcptr, __global uchar* dstptr)
{
    int cols = 640;
    int rows = 480;

    uchar *y_ptr[LINES_CACHED];
    uchar *ycurr_ptr, *dest_ptr, *dst, *src;
    int rd_idx, start_rd_idx, fetch_rd_idx;

    EdmaMgr_Handle evIN = EdmaMgr_alloc(LINES_CACHED);
    /* LINES_CACHED rows available for processing plus one row in flight. */
    local uchar img_lines[LINES_CACHED+1][MAX_LINE_SIZE];

    int i, kk;

    int clk_start, clk_end;
    clk_start = __clock();

    if (!evIN) { printf("无法分配 edmaIN1句柄。\n"); return; }

    dest_ptr = (uchar *) dstptr;

    for (i = 0; i < (LINES_CACHED + 1); i++) {
        memset((void *) img_lines[i], 0, MAX_LINE_SIZE);
    }

    /*
     * Prime the line buffer.
     * NOTE(review): every iteration copies from srcptr itself, so for
     * LINES_CACHED > 2 the same first row is loaded more than once -- verify
     * this is intended.
     */
    for (i = 1; i < LINES_CACHED; i++) {
        EdmaMgr_copy1D1D(evIN, (void *)(srcptr), (void *)(img_lines[i]), cols*2);
    }

    fetch_rd_idx = cols*2;   /* byte offset of the next row to fetch */
    start_rd_idx = 0;

    for (int x = 0; x < rows; x++) {
        /* Wait for the transfer issued earlier before touching the buffer. */
        EdmaMgr_wait(evIN);
        rd_idx = start_rd_idx;
        /*
         * NOTE(review): "& LINES_CACHED" acts as a wrap over the
         * LINES_CACHED + 1 buffer rows only when LINES_CACHED + 1 is a power
         * of two -- confirm the macro value.
         */
        for (kk = 0; kk < LINES_CACHED; kk++) {
            y_ptr[kk] = (uchar *) img_lines[rd_idx];
            rd_idx = (rd_idx + 1) & LINES_CACHED;
        }

        start_rd_idx = (start_rd_idx + 1) & LINES_CACHED;
        /*
         * Kick off the fetch of the next row into the buffer row that is not
         * referenced by y_ptr.
         * NOTE(review): on the last iteration the source offset is
         * rows*cols*2, i.e. one row past the 640x480 image -- verify the
         * input buffer is large enough.
         */
        EdmaMgr_copyFast(evIN, (void*)(srcptr + fetch_rd_idx), (void*)(img_lines[rd_idx]));
        fetch_rd_idx += cols*2;

        ycurr_ptr = (uchar *) y_ptr[1];

        for (int y = 0; y < cols/2; y++) {
            src = ycurr_ptr + (y << 2);                   /* 4 input bytes per group  */
            dst = dest_ptr + (x*640*3 + ((y << 1)*3));    /* 6 output bytes per group */

            __constant float* coeffs = c_YUV2RGBCoeffs_420;

            /* One word load for the 4-byte group; bytes are unpacked lowest-order first. */
            int load_src = *((int*) src);
            float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff };
            float U = vec_src[1] - 128;
            float V = vec_src[3] - 128;
            float Y00 = max(0.f, vec_src[0] - 16.f) * coeffs[0];
            float Y01 = max(0.f, vec_src[2] - 16.f) * coeffs[0];
            /* Chroma contributions; the +0.5f rounds before the saturating convert. */
            float ruv = ((coeffs[4]*V) + 0.5f);
            float guv = ((coeffs[3]*V) + (coeffs[2]*U) + 0.5f);
            float buv = ((coeffs[1]*U) + 0.5f);

            /* First pixel of the group. */
            dst[2] = convert_uchar_sat(Y00 + ruv);
            dst[1] = convert_uchar_sat(Y00 + guv);
            dst[0] = convert_uchar_sat(Y00 + buv);
            /* Second pixel of the group (shares U and V with the first). */
            dst[5] = convert_uchar_sat(Y01 + ruv);
            dst[4] = convert_uchar_sat(Y01 + guv);
            dst[3] = convert_uchar_sat(Y01 + buv);
        }
    }

    /* Drain the last outstanding transfer before releasing the channel. */
    EdmaMgr_wait(evIN);
    EdmaMgr_free(evIN);

    clk_end = __clock();
    printf("TIDSP clockdiff=%d\n", clk_end - clk_start);
}
/////////////////////////////////// 66420487

3. 结果显示,两个内核的执行时间分别为 66549941 和 66420487。

4. 我的问题是:为什么使用 EDMA 后,DSP 加速的性能没有提升?

谢谢。