This thread has been locked.

If you have a related question, please click the "Ask a related question" button in the top right corner. The newly created question will be automatically linked to this question.

TMS320DM6467T: 请问如何打开icache/dcache

Part Number: TMS320DM6467T


我们使用了一段代码,但是发现一个问题,代码运行的 cycle 数和原始代码注释中记录的 cycle 数差10倍,比如 deblock_v_chroma_intra 按原始代码注释是 163 cycle ,但我们实际测试是约 1900 cycle。有什么可能的原因吗?我们猜测是不是需要写某条汇编指令开启 icache/dcache ?可以告诉我们开启 icache/dcache 的指令是什么吗?谢谢!

;* ================================================================================================ *;
;*  deblock-a.sa: deblocking for TI C6000 DSP                                                       *;
;*                                                                                                  *;
;*  --------------------------------------------------------------------------------------------    *;
;*  | cpu cost table of deblocking on DM6467 simulator                                         |    *;
;*  |------------------------------------------------------------------------------------------|    *;
;*  | Function         | Optimized cost (cycles) | Pure C cost (cycles)    | Result            |    *;
;*  |                  |-------------------------|-------------------------|-------------------|    *;
;*  |                  | Debug      | Release    | Debug      | Release    | Debug   | Release |    *;
;*  |------------------|------------|------------|------------|------------|---------|---------|    *;
;*  | deblock_v_chroma_intra | 358  |163         | 2708       |218         |   7.6   |   1.3   |    *;
;*  --------------------------------------------------------------------------------------------    *;

;* ======================================================================== *;
;* deblock_v_chroma_intra_ti -- intra (bS == 4) chroma deblocking of one    *;
;*                              horizontal edge, 16 bytes (u/v) wide        *;
;*                                                                          *;
;* p1: | U V U V | U V U V | U V U V | U V U V |  pix - 2*stride            *;
;* p0: | U V U V | U V U V | U V U V | U V U V |  pix - 1*stride            *;
;* q0: | U V U V | U V U V | U V U V | U V U V |  pix                       *;
;* q1: | U V U V | U V U V | U V U V | U V U V |  pix + 1*stride            *;
;* ======================================================================== *;
        .sect ".text:_deblock_v_chroma_intra_ti"
        .global _deblock_v_chroma_intra_ti
_deblock_v_chroma_intra_ti .cproc pix, stride, alpha, beta
            ; Arguments:
            ;   pix    - address of the first byte of the q0 row; the p0 row
            ;            is at pix - stride and the p1 row at pix - 2 * stride
            ;   stride - distance in bytes between two rows.  Rows are stepped
            ;            with a word-scaled index of (stride >> 2), so stride
            ;            must be a multiple of 4 for the rows to be addressed
            ;            exactly.
            ;   alpha  - threshold for |p0 - q0|; only the low byte is used
            ;   beta   - threshold for |p1 - p0| and |q1 - q0|; only the low
            ;            byte is used
            ; Nothing is returned: the p0 and q0 rows are updated in memory.
            ;
            ; For every byte column that satisfies
            ;   |p0 - q0| < alpha && |p1 - p0| < beta && |q1 - q0| < beta
            ; the routine writes
            ;   p0' = (2*p1 + p0 + q1 + 2) >> 2
            ;   q0' = (2*q1 + q0 + p1 + 2) >> 2
            ; Columns that fail the test keep their original p0/q0 values.

            ; Tells the assembly optimizer that no memory access in this
            ; procedure depends on another.  This holds as long as the 4-byte
            ; columns handled by different loop iterations do not overlap
            ; (NOTE(review): assumes |stride| >= 16 -- confirm with callers).
            .no_mdep

            .reg            p1, p0, q1, q0
            .reg            p1_hi, p1_lo, p0_hi, p0_lo
            .reg            q1_hi, q1_lo, q0_hi, q0_lo
            .reg            p0_q0, p1_p0, q1_q0, pq_mk
            .reg            dp, dq, dp_hi, dp_lo, dq_hi, dq_lo
            .reg            pix_4x, pix_rd, pix_wr, ptr_rd
            .reg            k_2, i

            MVKL            0x00020002, k_2
            MVKH            0x00020002, k_2                ; k_2 = rounding term 2 in both halfwords
            SHR             stride,     2,          pix_4x ; pix_4x = stride >> 2 (row step in words)
            SUB             pix,        stride,     pix_wr ; pix_wr = pix - 1 * stride (p0 row)
            SUB             pix_wr,     stride,     pix_rd ; pix_rd = pix - 2 * stride (p1 row)
            MVK             3,          i                  ; i = 3 -> loop body runs 4 times

            ; replicate the low byte of alpha / beta into all four byte lanes
            PACK2           alpha,      alpha,      alpha
            PACK2           beta,       beta,       beta
            PACKL4          alpha,      alpha,      alpha
            PACKL4          beta,       beta,       beta

            ; Each iteration filters one 4-byte column group.  The body is
            ; branch-free: instead of jumping over the filter when no byte
            ; qualifies, the two stores are predicated on the byte mask.
            ; Memory is therefore written in exactly the same cases as with
            ; an early-out branch, but the loop keeps a single back edge,
            ; which is what the assembly optimizer needs in order to
            ; software-pipeline it.
loop_deblock_v_chroma_intra: .trip  4,  4,          4
            ; load 4 bytes of pixels from each row (p1, p0, q0, q1)
            MV              pix_rd,     ptr_rd
            LDNW            *ptr_rd++[pix_4x], p1
            LDNW            *ptr_rd++[pix_4x], p0
            LDNW            *ptr_rd++[pix_4x], q0
            LDNW            *ptr_rd++[pix_4x], q1

            ; per-byte deblocking condition
            SUBABS4         p0,         q0,         p0_q0
            SUBABS4         p1,         p0,         p1_p0
            SUBABS4         q1,         q0,         q1_q0
            CMPLTU4         p0_q0,      alpha,      p0_q0 ; |p0 - q0| < alpha
            CMPLTU4         p1_p0,      beta,       p1_p0 ; |p1 - p0| < beta
            CMPLTU4         q1_q0,      beta,       q1_q0 ; |q1 - q0| < beta
            AND             p1_p0,      q1_q0,      pq_mk
            AND             p0_q0,      pq_mk,      pq_mk ; all three tests passed (one bit per byte)
            XPND4           pq_mk,      pq_mk             ; one bit per byte -> 0xFF / 0x00 per byte

            ; widen p0, q0, p1, q1 from 4 x u8 to 2 + 2 x u16
            UNPKHU4         p0,         p0_hi
            UNPKLU4         p0,         p0_lo
            UNPKHU4         q0,         q0_hi
            UNPKLU4         q0,         q0_lo
            UNPKHU4         p1,         p1_hi
            UNPKLU4         p1,         p1_lo
            UNPKHU4         q1,         q1_hi
            UNPKLU4         q1,         q1_lo

            ; deblock for edge pixel: p0 and q0
            ; p0' = (2*p1 + p0 + q1 + 2) >> 2
            ; q0' = (2*q1 + q0 + p1 + 2) >> 2
            ADD2            p1_hi,      p1_hi,      dp_hi ; 2 * p1
            ADD2            p1_lo,      p1_lo,      dp_lo
            ADD2            q1_hi,      q1_hi,      dq_hi ; 2 * q1
            ADD2            q1_lo,      q1_lo,      dq_lo
            ADD2            dp_hi,      p0_hi,      dp_hi ; + p0
            ADD2            dp_lo,      p0_lo,      dp_lo
            ADD2            dq_hi,      q0_hi,      dq_hi ; + q0
            ADD2            dq_lo,      q0_lo,      dq_lo
            ADD2            dp_hi,      q1_hi,      dp_hi ; + q1
            ADD2            dp_lo,      q1_lo,      dp_lo
            ADD2            dq_hi,      p1_hi,      dq_hi ; + p1
            ADD2            dq_lo,      p1_lo,      dq_lo
            ADD2            dp_hi,      k_2,        dp_hi ; + 2 (rounding)
            ADD2            dp_lo,      k_2,        dp_lo
            ADD2            dq_hi,      k_2,        dq_hi
            ADD2            dq_lo,      k_2,        dq_lo
            ; sums are at most 4 * 255 + 2 = 1022, so the signed halfword
            ; shift cannot see a set sign bit
            SHR2            dp_hi,      2,          dp_hi
            SHR2            dp_lo,      2,          dp_lo
            SHR2            dq_hi,      2,          dq_hi
            SHR2            dq_lo,      2,          dq_lo
            SPACKU4         dp_hi,      dp_lo,      dp    ; back to 4 x u8
            SPACKU4         dq_hi,      dq_lo,      dq

            ; per-byte select: filtered value where the mask is set,
            ; original value elsewhere
            ANDN            p0,         pq_mk,      p0
            ANDN            q0,         pq_mk,      q0
            AND             dp,         pq_mk,      dp
            AND             dq,         pq_mk,      dq
            ADD4            p0,         dp,         p0
            ADD4            q0,         dq,         q0

            ; store 4 bytes of filtered pixels (p0, q0); skipped when no
            ; byte of this column group passed the test (pq_mk == 0)
   [pq_mk]  STNW            p0,         *pix_wr
   [pq_mk]  STNW            q0,         *+pix_wr[pix_4x]

            ADDK            4,          pix_rd               ; pix_rd   += 4
            ADDK            4,          pix_wr               ; pix_wr   += 4
            ; branch back while i != 0, decrementing i: i = 3, 2, 1 loop again
   [i]      BDEC            loop_deblock_v_chroma_intra, i

            .return
            .endproc