Hi,
我在文档“Optimizing Loops on the C66x DSP”中看到一个对循环进行优化的例子:
/*
 * Reference (unoptimized) floating-point loop.
 *
 * For each of the n complex samples in a, stores
 *   abs_a[i]   = (re^2 + im^2) * (1 / sqrt(re^2 + im^2)), i.e. the magnitude,
 *   ejalpha[i] = a[i] multiplied by that same reciprocal magnitude.
 *
 * NOTE(review): for a sample with re == im == 0 the reciprocal is 1/0, so
 * under IEEE-754 arithmetic the three outputs for that index become NaN;
 * presumably callers never pass zero-magnitude samples -- confirm.
 */
void example1_gc(cplxf_t *a, cplxf_t *ejalpha, float *abs_a, int n)
{
    int k = 0;

    while (k < n) {
        /* Squared magnitude of the current sample. */
        float mag_sq = a[k].real * a[k].real + a[k].imag * a[k].imag;
        /* sqrt() works on double; the result is narrowed back to float. */
        float inv_mag = 1.f / (float)sqrt(mag_sq);

        /* Stores are issued in the order: magnitude, real part, imaginary part. */
        abs_a[k] = mag_sq * inv_mag;
        ejalpha[k].real = a[k].real * inv_mag;
        ejalpha[k].imag = a[k].imag * inv_mag;
        k++;
    }
}
然后提供了一个C64x+平台的优化版本:
/*
 * Fixed-point variant of the loop: for every sample it computes
 * re^2 + im^2, approximates 1/sqrt of that value with a table-driven
 * linear segment, and uses the reciprocal to produce both the normalized
 * sample (ejalpha) and the magnitude (abs_a).
 *
 * The _nassert() calls promise the compiler that n is a multiple of 4 and
 * that the three buffers are 8-byte aligned.
 */
_nassert(n % 4 == 0);
_nassert((int) a % 8 == 0);
_nassert((int) ejalpha % 8 == 0);
_nassert((int) abs_a % 8 == 0);
/* Trip count is declared as 4..100 and a multiple of 4; unroll by 2. */
#pragma MUST_ITERATE(4,100, 4);
#pragma UNROLL(2);
for ( i = 0; i < n; i++)
{
    /*
     * Load one sample as a single aligned 32-bit word. The multiplies
     * below take the real part from its upper halfword (_mpyhir) and the
     * imaginary part from its lower halfword (_mpylir).
     */
    temp1 = _amem4(&a[i]);
    /* upper*upper + lower*lower, i.e. re^2 + im^2. */
    a_sqr = _dotp2(temp1, temp1);

    /* 1/sqrt(a_sqr) */
    /*
     * Normalize the argument: count the redundant sign bits, clear bit 0
     * so the shift count is even (the square root of 2^normal is then an
     * exact power of two), and shift left by that amount with saturation.
     */
    normal = _norm(a_sqr);
    normal = normal & 0xFFFFFFFE;
    x_norm = _sshvl(a_sqr, normal);
    /* sqrt(2^normal) == 2^(normal/2): keep half the shift for rescaling. */
    normal = normal >> 1;
    /* Table index: top 8 bits of x_norm, rounded (0x800000 is half of 1<<24). */
    Index = _sshvr(_sadd(x_norm,0x800000),24);
    /*
     * Linear segment: (low halfword of xcbia[Index]) * x_norm, rounded and
     * shifted right by 15 by _mpylir, then rescaled by
     * ShiftValDifp1a[Index] and added (saturating) to x3sa[Index] placed
     * in the upper halfword.
     * NOTE(review): the tables are defined elsewhere; reading xcbia as a
     * per-segment slope and x3sa as a per-segment offset is inferred from
     * how they are used here -- confirm against the table definitions.
     */
    oneOverAbs_a=_mpylir( xcbia[Index], x_norm );
    oneOverAbs_a=_sadd((int)x3sa[Index]<<16,
                       _sshvr(oneOverAbs_a,ShiftValDifp1a[Index]));
    /*
     * Combine the per-entry value ShiftVala[Index] with the halved
     * normalization shift into one shift count for the results below
     * (presumably the exponent of the table output -- confirm).
     */
    normal =15 - ShiftVala[Index] + normal;
    /*
     * Multiply each 16-bit component by the reciprocal, shift left by
     * normal - 1, and add 0x8000 so that the upper halfword taken by
     * _packh2 is rounded rather than truncated.
     */
    ejbeta_re =_sadd(_sshvl(_mpyhir(temp1, oneOverAbs_a), normal - 1), 0x8000);
    ejbeta_im =_sadd(_sshvl(_mpylir(temp1, oneOverAbs_a), normal - 1), 0x8000);
    /* Pack the two upper halfwords (re : im) and store them as one word. */
    _amem4(&ejalpha[i])= _packh2(ejbeta_re, ejbeta_im);
    /*
     * Magnitude = a_sqr * (1/sqrt(a_sqr)): multiply a_sqr by the upper
     * halfword of the reciprocal, add half of the final shift step for
     * rounding, then shift right by 16 - normal using the saturating
     * shift intrinsic _sshvr, as everywhere else in this loop.
     */
    abs_a[i] = _sshvr(_sadd(_mpyhir(oneOverAbs_a, a_sqr), 1<<(15 - normal)),
                      16-normal) ;
}
这个优化版本中很重要的一点,就是用查表法来优化 1/sqrt(x) 的计算。遗憾的是,文档没有详细介绍其实现原理。我看了很久,也没明白这段代码是怎么工作的。有人能帮我解释一下吗?谢谢!