This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

TDA4VEN-Q1: FFTLIB重复调用输出结果异常

Part Number: TDA4VEN-Q1
Other Parts Discussed in Thread: FFTLIB

你好

      我在开发 DSP C7x 时,碰到连续调用 FFTLIB_fft1dBatched_i16sc_c16sc_o16sc_kernel 函数时,第一次 fft 计算结果正常,第二次 fft 计算结果异常;单独进行某一次的 fft(只使用第一次 fft 或者只使用第二次 fft)结果均正确。第一次 fft 输入数组为 128 通道 1024 个采样点,第二次输入数组为 512 通道 128 个采样点,代码如下:

/*
 * Input/output buffers for the two batched FFT passes issued from func().
 * All four are emitted into the ".l2mem" section and aligned to 64 bytes.
 * Each row holds 2 int16_t values per sample point (2048 = 1024 * 2, 256 = 128 * 2).
 */
__attribute__((section(".l2mem"), aligned(64))) int16_t l2_user_array0[128][2048];   // 512 KiB (128 * 2048 * 2 bytes): input of the 128 x 1024-point pass
__attribute__((section(".l2mem"), aligned(64))) int16_t l2_user_array1[128][2048];   // 512 KiB: output of the 128 x 1024-point pass
__attribute__((section(".l2mem"), aligned(64))) int16_t l2_user_array2[512][256];    // 256 KiB (512 * 256 * 2 bytes): input of the 512 x 128-point pass
__attribute__((section(".l2mem"), aligned(64))) int16_t l2_user_array3[512][256];    // 256 KiB: output of the 512 x 128-point pass

/*
 * Issues two batched FFT passes back to back, reusing one parameter struct:
 *   pass 1: 128 channels x 1024 points, l2_user_array0 -> l2_user_array1
 *   pass 2: 512 channels x  128 points, l2_user_array2 -> l2_user_array3
 * The status returned by bsp_dsppro_battch_fft1d() is not inspected.
 */
void func()
{
    battch_fft1d_info_type fft_cfg = {0};
    int16_t *const pass1_in  = &l2_user_array0[0][0];
    int16_t *const pass1_out = &l2_user_array1[0][0];
    int16_t *const pass2_in  = &l2_user_array2[0][0];
    int16_t *const pass2_out = &l2_user_array3[0][0];

    /* Pass 1: 128 channels of 1024 points, shift vector of 5 entries. */
    fft_cfg.data_type  = FFTLIB_INT16;
    fft_cfg.num_points = 1024;
    fft_cfg.channel    = 128;
    fft_cfg.num_shifts = 5;
    bsp_dsppro_battch_fft1d(pass1_in, pass1_out, &fft_cfg);

    /* Pass 2: 512 channels of 128 points, shift vector of 3 entries. */
    fft_cfg.data_type  = FFTLIB_INT16;
    fft_cfg.num_points = 128;
    fft_cfg.channel    = 512;
    fft_cfg.num_shifts = 3;
    bsp_dsppro_battch_fft1d(pass2_in, pass2_out, &fft_cfg);
}

/**
 * Run one batched 1-D FFT on 16-bit samples through FFTLIB.
 *
 * Allocates a twiddle table and a shift vector on the heap, fills the twiddle
 * table with tw_gen(), initialises the FFTLIB parameter block and then runs
 * the batched kernel. Both heap buffers are released before returning.
 *
 * @param input   Source samples; described to FFTLIB as
 *                channel * num_points * 2 int16_t values.
 * @param output  Destination buffer, described to FFTLIB with the same size.
 * @param battch_fft1d_info  Run parameters; num_shifts, channel and num_points
 *                are read, data_type is not used by this function.
 *
 * @return 0 on success;
 *         1 if an argument is NULL, an allocation fails, or FFTLIB init fails;
 *         3 if the FFTLIB kernel reports an error.
 *         (2 was used by an optional checkParams step that is not run here.)
 */
uint8_t bsp_dsppro_battch_fft1d(int16_t *input, int16_t *output, battch_fft1d_info_type *battch_fft1d_info)
{
    uint8_t   l_u8_ret = 0;
    int16_t  *pX = input;
    int16_t  *pY = output;
    int16_t  *pW = NULL;
    uint32_t *pShift = NULL;
    uint8_t  *pblock = FFTLIB_fft1dbatched_i16sc_c16sc_o16sc_pBlock;
    FFTLIB_bufParams1D_t bufParamsData;
    FFTLIB_bufParams1D_t bufParamsShift;
    FFTLIB_bufParams1D_t bufParamsTw;
    FFTLIB_STATUS status_opt = FFTLIB_SUCCESS;
    uint32_t numShifts;
    uint32_t l_u32_channel;
    uint32_t numPoints;
    uint32_t dataMemSize;

    /* Refuse a missing parameter block instead of dereferencing it. */
    if (battch_fft1d_info == NULL)
    {
        l_u8_ret = 1;
        goto cleanup;
    }

    numShifts     = battch_fft1d_info->num_shifts;   /* e.g. 5 for 1024 points, 3 for 128 points */
    l_u32_channel = battch_fft1d_info->channel;      /* number of FFTs in the batch */
    numPoints     = battch_fft1d_info->num_points;   /* FFT length */
    /* Two int16_t values per sample point (presumably interleaved real/imaginary). */
    dataMemSize   = l_u32_channel * numPoints * 2;

    /*
     * NOTE(review): the author of this code reported that a heap-allocated
     * twiddle table produced corrupted twiddle factors on the second call and
     * that pointing pW at a global array avoided the problem. The root cause
     * is not established here; consider a statically allocated, aligned table.
     */
    pW = malloc((size_t)numPoints * 2 * sizeof *pW);

    /*
     * The shift vector was previously handed to FFTLIB without ever being
     * written. Zero it so the run is at least deterministic.
     * NOTE(review): the interface offers no way to supply per-stage shift
     * values; confirm the values FFTLIB needs for this FFT length.
     */
    pShift = calloc(numShifts, sizeof *pShift);

    if ((pX == NULL) || (pY == NULL) || (pW == NULL) || (pShift == NULL))
    {
        DebugP_log("[info]pX is NULL!\r\n");
        l_u8_ret = 1;
        goto cleanup;
    }

    bufParamsData.dim_x      = dataMemSize;
    bufParamsData.data_type  = FFTLIB_INT16;

    bufParamsShift.dim_x     = numShifts;
    bufParamsShift.data_type = FFTLIB_UINT32;

    bufParamsTw.dim_x        = numPoints * 2;
    bufParamsTw.data_type    = FFTLIB_INT16;

    tw_gen(pW, numPoints);

    /* Initialise the batched FFT (fills the parameter block). */
    status_opt = FFTLIB_fft1dBatched_i16sc_c16sc_o16sc_init(pX, &bufParamsData, pW, &bufParamsTw,
                                                            pY, &bufParamsData, pShift, &bufParamsShift,
                                                            numPoints, l_u32_channel, pblock);
    if (status_opt != FFTLIB_SUCCESS)
    {
        l_u8_ret = 1;
        goto cleanup;
    }

    /* Execute the batched FFT. */
    status_opt = FFTLIB_fft1dBatched_i16sc_c16sc_o16sc_kernel(pX, &bufParamsData, pW, &bufParamsTw,
                                                              pY, &bufParamsData, pShift, &bufParamsShift,
                                                              numPoints, l_u32_channel, pblock);
    if (status_opt != FFTLIB_SUCCESS)
    {
        l_u8_ret = 3;
    }

cleanup:
    /* Release the heap buffers; free(NULL) is a no-op, so no guards are needed. */
    free(pW);
    free(pShift);

    return l_u8_ret;
}

是否是我遗漏了什么步骤,导致连续调用不能正常工作,若我想连续计算fft,该如何修改代码?希望ti工程师能帮忙指出问题。

  • 已经收到了您的案例,调查需要些时间,感谢您的耐心等待。

  • 你能告诉我们你正在为 C7x DSP 开发什么吗?这是一个什么项目?

  • 你能分享一个测试代码和测试输入吗。
    您还可以共享链接器脚本来检查内存配置。
    此外,由于提供的代码中不存在battch_fft1d_info_type,我创建了一个结构体。你能检查一下是否正确吗

    /*
     * Parameters for one batched FFT run, as consumed by bsp_dsppro_battch_fft1d().
     * Declared with a typedef so the bare name battch_fft1d_info_type used by the
     * calling code is a valid type in C as well as C++; the struct tag is kept.
     */
    typedef struct battch_fft1d_info_type {
        uint32_t num_shifts;   /* number of entries in the shift vector passed to FFTLIB */
        uint32_t channel;      /* number of FFTs in the batch */
        uint32_t num_points;   /* FFT length in sample points */
        int data_type;         /* FFTLIB data-type code, e.g. FFTLIB_INT16 */
    } battch_fft1d_info_type;

  • 感谢回复,正在开发毫米波雷达,这部分是做信号处理

  • 结构体这样写是正确的,我的链接器脚本如下:

    /* Initialise variables at load time rather than at boot. */
    --ram_model
    /* C heap (.sysmem) size: 0x20000 bytes = 128 KiB. */
    -heap  0x20000
    /* Stack (.stack) size: 0x20000 bytes = 128 KiB. */
    -stack 0x20000
    /* Space reserved for program arguments (.args): 4 KiB. */
    --args 0x1000
    --diag_suppress=10068 /* suppress the "no matching section" diagnostic */
    --cinit_compression=off
    /* Program entry point. */
    -e _c_int00_secure
    
    #define DDR0_ALLOCATED_START  0xAD000000      /* base address all C7X_* offsets below are relative to */
    
    #define C7X_ALLOCATED_START DDR0_ALLOCATED_START
    
    /* Resolved addresses: 0xAD100000, 0xAD100400, 0xAD200000, 0xAD400000, 0xAD600000. */
    #define C7X_RESOURCE_TABLE_BASE (C7X_ALLOCATED_START + 0x00100000)
    #define C7X_IPC_TRACE_BUFFER    (C7X_ALLOCATED_START + 0x00100400)
    #define C7X_BOOT_BASE           (C7X_ALLOCATED_START + 0x00200000)
    #define C7X_VECTOR_BASE         (C7X_ALLOCATED_START + 0x00400000)
    #define C7X_DDR_SPACE_BASE      (C7X_ALLOCATED_START + 0x00600000)
    
    /* Memory map: on-chip L2 SRAM regions followed by the DDR carve-outs. */
    MEMORY
    {
        /*L2SRAM_CINIT (RWX)  : org = 0x7E000000, len = 0x000100*/   /* for 256byte init     c7x_0 = 7E000000, c7x_1 = 7E200000 */
        /*
         * The origin skips the first 0x100 bytes of the 2 MiB window, so the
         * length must shrink by the same amount (0x200000 - 0x100 = 0x1FFF00);
         * otherwise the region ends 0x100 bytes past 0x7E200000.
         */
        L2SRAM (RWX)        : org = 0x7E000100, len = 0x1FFF00    /* 2 MiB minus the 0x100-byte offset */
        L2SRAMAUX   (RWX): org = 0x7F000000, len = 0x040000       /* 256 KiB; J7AEN c7x_0 = 7F000000, c7x_1 = 7F800000 */
    
        /* L2SRAM (RWX):  org = 0x7E000000,                len = 0x200000 */
        DDR0_RESERVED: org = 0x80000000,                len = 0x19800000         /*  Reserved for A53 OS */
        C7X_IPC_D:     org = C7X_ALLOCATED_START,       len = 0x00100000         /*  1MB DDR */
        C7X_BOOT_D:    org = C7X_BOOT_BASE,             len = 0x400              /*  1024B DDR */
        C7X_VECS_D:    org = C7X_VECTOR_BASE,           len = 0x4000             /*  16KB DDR */
        C7X_CIO_MEM:   org = C7X_DDR_SPACE_BASE,        len = 0x1000             /*  4KB */
        /*C7X_DDR_SPACE: org = C7X_DDR_SPACE_BASE+0x1000, len = 0x00BF0000-0x1000*/  /*  11.9MB - 4KB DDR  0x3BE6666-59.9M 0x00BF0000-11.9M*/
        C7X_DDR_SPACE: org = C7X_DDR_SPACE_BASE+0x1000, len = 0x3BE6666-0x1000  /*  59.9MB - 4KB DDR  0x3BE6666-59.9M 0x00BF0000-11.9M*/
        /* For resource table */
        C7X_RT_D:      org = C7X_RESOURCE_TABLE_BASE, len = 0x400         /*  1024B DDR */
        /* IPC trace buffer */
        LINUX_IPC_TRACE_BUFFER: org = C7X_IPC_TRACE_BUFFER, len = 0xFFC00 /* 1023KB DDR */
        /*LOG_SHM_MEM             : ORIGIN = 0xA7000000, LENGTH = 0x40000*/
        /* Shared memory for RTOS NORTOS IPC */
        RTOS_NORTOS_IPC_SHM_MEM: org = 0xA5000000, len = 0x1C00000  /* 0x1C00000 = 28MB DDR */
    
        /* Shared data region (test) */
        /*SHARED_DATA_MEM_CX0 (RWX): org = 0xA0100000, len = 0xF00000 */         /* 2MB, all:15MB DDR */
        SHARED_DATA_MEM_CX0 (RWX): org = 0xA7000000, len = 0x06000000           /* 96MB DDR */
    
        /*SHARED_DATA_MEM_CX0 (RWX): org = 0x880000000, len = 0x06000000*/          /* 2MB, all:15MB DDR */
    }
    
    /*
     * Output section placement. Code, data, stack and heap all live in DDR
     * (C7X_DDR_SPACE); only the explicitly named .l1dmemory/.l2dmemory/.l2mem/.l3mem
     * sections are placed in on-chip L2 SRAM.
     */
    SECTIONS
    {
        boot:
        {
          boot.*<boot.oe71>(.text)
        } load > C7X_BOOT_D
        .vecs       >       C7X_VECS_D
        .secure_vecs    >   C7X_DDR_SPACE ALIGN(0x100000)
        .text:_c_int00_secure > C7X_DDR_SPACE ALIGN(0x200000)
        .text       >       C7X_DDR_SPACE ALIGN(0x100000)
    
        .l1dmemory  (NOLOAD)(NOINIT) : {} > L2SRAM
        .l2dmemory  (NOLOAD)(NOINIT) : {} > L2SRAM
        /* RUN_START/RUN_END below belong to the .bss specification and export its run-time bounds. */
        .bss        >       C7X_DDR_SPACE  /* Zero-initialized data */
        RUN_START(__BSS_START)
        RUN_END(__BSS_END)
    
        .data       >       C7X_DDR_SPACE  /* Initialized data */
    
        .cinit      >       C7X_DDR_SPACE  /* could be part of const */
        .init_array >       C7X_DDR_SPACE  /* C++ initializations */
        .stack      >       C7X_DDR_SPACE ALIGN(0x2000)
        .args       >       C7X_DDR_SPACE
        .cio        >       C7X_CIO_MEM
        .const      >       C7X_DDR_SPACE
        .switch     >       C7X_DDR_SPACE /* For exception handling. */
        .sysmem     >       C7X_DDR_SPACE /* heap: malloc'd buffers therefore reside in DDR, not L2 */
    
        GROUP:              >  C7X_DDR_SPACE
        {
            .data.Mmu_tableArray          : type=NOINIT
            .data.Mmu_tableArraySlot      : type=NOINIT
            .data.Mmu_level1Table         : type=NOINIT
            .data.gMmu_tableArray_NS       : type=NOINIT
            .data.Mmu_tableArraySlot_NS   : type=NOINIT
            .data.Mmu_level1Table_NS      : type=NOINIT
        }
    
        .benchmark_buffer:     > C7X_DDR_SPACE ALIGN (32)
    
        /* This is the resource table used by linux to know where the IPC "VRINGs" are located */
        .resource_table: { __RESOURCE_TABLE = .;} > C7X_RT_D
        /* This IPC log can be viewed via ROV in CCS and when linux is enabled, this log can also be viewed via linux debugfs */
        .bss.debug_mem_trace_buf    : {} palign(128)    > LINUX_IPC_TRACE_BUFFER
        /* this is used when Debug log's to shared memory is enabled, else this is not used */
        /*.bss.log_shared_mem  (NOLOAD) : {} > LOG_SHM_MEM*/
        /* this is used only when IPC RPMessage is enabled */
        .bss.ipc_vring_mem   (NOLOAD) : {} > RTOS_NORTOS_IPC_SHM_MEM
    
        /*.l1mem              (NOLOAD)(NOINIT) : {} > L2SRAM_CINIT*/
        /* User buffers tagged section(".l2mem") / section(".l3mem") go to main / auxiliary L2 SRAM. */
        .l2mem              (NOLOAD)(NOINIT) : {} > L2SRAM
        .l3mem              (NOLOAD)(NOINIT) : {} > L2SRAMAUX
        
        .user_array0     (NOLOAD) : {} > SHARED_DATA_MEM_CX0
        /*.user_array1     (NOLOAD) : {} > SHARED_DATA_MEM_1*/
        /*.user_array2     (NOLOAD) : {} > SHARED_DATA_MEM_2*/
        /*.user_array3     (NOLOAD) : {} > SHARED_DATA_MEM_3*/
        /*.user_array4     (NOLOAD) : {} > SHARED_DATA_MEM_4*/
    }
    

  • 我该如何给你分享测试工程代码,邮件还是附件

  • 您可以在此处发布独立测试代码。我会核实一下。

  • 我定位到问题出在动态分配的 pW 上:使用 malloc 分配 pW 时,其中存储的旋转因子会出现问题,导致计算错误。具体原因我尚在定位,不过将 pW 指向全局数组可以避免该问题,目前已经可以正常计算。