使用6678核0,其中一段128K点的浮点运算是在L2SRAM上进行的,使用TSCH、TSCL测得运算cycles为4382733(ns),代码如下。L2buff为本地L2SRAM,L2SRAM 512KB全部作为RAM使用,L1D缓存已使能:
/* Cache setup: both the L1P and L1D size setters are given the
 * CACHE_L1_32KCACHE setting. */
CACHE_setL1PSize(CACHE_L1_32KCACHE);
CACHE_setL1DSize(CACHE_L1_32KCACHE);
/* NOTE(review): presumably turns on cacheability for the memory region
 * selected by index 16 -- confirm against the CSL CACHE documentation. */
CACHE_enableCaching(16);
......
/* Zero 0x80000 bytes starting at GL2buff before the transfer. */
memset(GL2buff,0,0x80000);
/* Transfer from pusData (offset by iPhase uint16 elements) into GL2buff.
 * NOTE(review): dstBidx is passed twice in a row; if the helper expects
 * (srcBidx, dstBidx) at those positions the first one may be a typo --
 * confirm against the prototype of edma3_2d_trans. */
edma3_2d_trans((char *)pusData+iPhase*sizeof(uint16),(char *)GL2buff,Acnt,Bcnt,Ccnt,dstBidx,dstBidx,srcCidx,dstCidx);
/* Two views of the same L2buff storage: unsigned 32-bit input and float
 * output, so the loop below converts the data in place. */
uint32 *temp = (uint32*)L2buff;
float *ftemp = (float*)L2buff;
/* Timestamp taken immediately before the loop from the TSCH/TSCL pair. */
t_start = _itoll(TSCH,TSCL);
/* Convert each of the 128*1024 elements to float and scale it by
 * fCBF_S2FCoef, writing the result back to the same index.
 * NOTE(review): the disassembly listed for this loop reloads and stores the
 * loop counter through the stack on every iteration and issues one element
 * per pass with NOPs in between, which looks like an unoptimized build --
 * check the compiler optimization level before suspecting the memory or
 * cache configuration. */
for(j = 0;j < 128*1024;j++)
{
ftemp[j] = (float)temp[j]*fCBF_S2FCoef;
}
/* Timestamp taken immediately after the loop. */
t_stop = _itoll(TSCH,TSCL);
/* Elapsed count with the t_overhead value subtracted. */
t_total = t_stop-t_start-t_overhead;
printf("L277 time cost was %lld\r\n",t_total);
按理论计算,耗时不应该有4.3ms这么长,不知道是不是我哪一点没有设置好?我看了一下反汇编代码,也没看出个所以然:
$C$L5:
0c04ad64: 028029EE LDW.D2T2 *+B15[41],B5
0c04ad68: 0FBF02E6 LDW.D2T2 *+B15[24],B31
0c04ad6c: 03002AEE LDW.D2T2 *+B15[42],B6
0c04ad70: 0F0027EE LDW.D2T2 *+B15[39],B30
0c04ad74: 0C6E NOP 1
0c04ad76: 94DD LDW.D2T2 *B5[B4],B5
0c04ad78: 00006000 NOP 4
0c04ad7c: E4000000 .fphead n, l, W, BU, nobr, nosat, 0100000b
0c04ad80: 0394093A INTSPU.L2 B5,B7
0c04ad84: 00004000 NOP 3
0c04ad88: 021FEE02 MPYSP.M2 B31,B7,B4
0c04ad8c: 00004000 NOP 3
0c04ad90: 021BCAF6 STW.D2T2 B4,*+B6[B30]
253 for(j = 0;j < 128*1024;j++)
0c04ad94: 020027EE LDW.D2T2 *+B15[39],B4
0c04ad98: 6C6E NOP 4
0c04ad9a: 2641 ADD.L2 B4,1,B4
0c04ad9c: E8000000 .fphead n, l, W, BU, nobr, nosat, 1000000b
0c04ada0: 020027FE STW.D2T2 B4,*+B15[39]
0c04ada4: 00208AFA CMPLT.L2 B4,B8,B0
0c04ada8: 2FE2A120 [ B0] BNOP.S1 $C$L5 (PC-60 = 0x0c04ad64),5
272 t_stop = _itoll(TSCH,TSCL);
请大神们不吝赐教,不胜感谢!