[参考译文] SK-AM62A-LP：能否基于 edgeai-gst-apps/app_cpp 同时运行 CPU+NPU？

admin

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

https://e2e.ti.com/support/processors-group/processors/f/processors-forum/1461427/sk-am62a-lp-could-run-cpu-npu-in-the-same-time-based-on-edgeai-gst-apps-app_cpp

器件型号：SK-AM62A-LP

工具与软件：

能否基于 edgeai-gst-apps/app_cpp 同时运行 CPU+NPU？我们尝试删除了 allownodes.txt 中的内容，并尝试在后处理部分添加以下代码：

auto allocator_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);

Ort::Value input_tensor_ = Ort::Value::CreateTensor<float>(allocator_info, input_image_.data(), input_image_.size(), input_shape.data(), input_shape.size());

auto cpu_output = ort_session->Run(Ort::RunOptions{ nullptr }, &input_names[0], &input_tensor_, 1, output_names.data(), 1);

const float* output_cpu = cpu_output[0].GetTensorMutableData<float>();

但这样整体的 FPS 会下降到 5。这是否正常？

9 个月前

0 admin 9 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

你(们)好

建议分别测量各个环节的延迟，以确定是哪一部分导致了速度下降。

此致、

Adam

0 admin 9 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

您好！

在这种情况下、您是否尝试仅在 CPU 上运行模型？ FPS 当然会比 NPU/C7x 慢得多。 C7x 通常比 AM62A 上的4x A53快20-50倍。

我不知道您是如何在这里创建 ort_session 的、但我认为它使用 CPU 执行提供程序而不是 TIDL、如下所示：

https://github.com/TexasInstruments/edgeai-tidl-tools/blob/09347c5390eb95f80754dfbd7cbc7e98029254b9/examples/osrt_cpp/ort/onnx_main.cpp#L358

BR、
Reese

0 admin 9 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

使用 onnxruntime 在 CPU 上运行很容易。我尝试了很多次，但在 NPU 上运行始终失败。

/**
 * Allocate a raw buffer intended to back a tensor.
 *
 * @param size  Number of bytes to allocate.
 * @param accel Non-zero: allocate through TIDLRT_allocSharedMem(64, size)
 *              (the 64 is presumably the alignment in bytes - confirm against
 *              the TIDL-RT documentation). Zero: allocate with malloc().
 *              When DEVICE_AM62 is defined, a non-zero value is rejected and
 *              the process is terminated.
 * @return Pointer to the allocated buffer. Never NULL: on any failure the
 *         function prints a message and terminates the process with a
 *         failure exit status.
 */
void * allocTensorMem(int size, int accel)
{
    void * ptr = nullptr;
    if (accel)
    {
        #ifdef DEVICE_AM62
        LOG_ERROR("TIDL Delgate mode is not allowed on AM62 devices...\n");
        printf("Could not allocate memory for a Tensor of size %d \n ", size);
        // Fatal error: report failure to the parent process (was exit(0)).
        exit(EXIT_FAILURE);
        #else
        ptr = TIDLRT_allocSharedMem(64, size);
        #endif
    }
    else
    {
        ptr = malloc(size);
    }
    if (ptr == nullptr)
    {
        printf("Could not allocate memory for a Tensor of size %d \n ", size);
        // Fatal error: report failure to the parent process (was exit(0)).
        exit(EXIT_FAILURE);
    }
    return ptr;
}
/**************************************************************************************************************************/

    Ort::SessionOptions session_cpuop;
    session_cpuop.SetLogSeverityLevel(3);
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ONNXModel");
    string artifacts_path="/opt/model_zoo/c777/artifacts";
    std::string model_path_cpu = "/opt/model_zoo/c777/model/modified_mpiifacegaze-60.onnx";
    c_api_tidl_options *options = (c_api_tidl_options *)malloc(sizeof(c_api_tidl_options));
    OrtStatus *def_status = OrtSessionsOptionsSetDefault_Tidl(options);
    strcpy(options->artifacts_folder, artifacts_path.c_str());
    OrtStatus *status = OrtSessionOptionsAppendExecutionProvider_Tidl(session_cpuop, options);
    session_cpuop.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
    session_cpuop.SetLogSeverityLevel(3);
    Ort::AllocatorWithDefaultOptions allocator;
    Ort::Session session(env, model_path_cpu.c_str(), session_cpuop);
    std::vector<int64_t> input_shape = {1, 3, 60, 60};
    cv::resize(gaze_img, resized_cpu_image, cv::Size(60, 60));  
    normalize_(resized_cpu_image);
    std::vector<char*> input_names,output_names;
    std::vector<std::string> strings = {"156"};
    std::vector<std::string> strings1 = {"input.1"};
    std::vector<Ort::Value> output_tensors;
    Ort::AllocatorWithDefaultOptions allocator;
    //input_names.push_back("input.1");
    for (int i = 0; i < 1; i++){
        const char* exampleString = strings[i].c_str();
        char* newCharPtr = new char[strlen(exampleString) + 1];
        strcpy(newCharPtr, exampleString);
        output_names.push_back(newCharPtr);
    }
    for (int i = 0; i < 1; i++){
        const char* exampleString1 = strings1[i].c_str();
        char* newCharPtr1 = new char[strlen(exampleString1) + 1];
        strcpy(newCharPtr1, exampleString1);
        input_names.push_back(newCharPtr1);
    }
    auto allocator_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);//Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
    std::vector<Ort::Value> input_tensors;
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(allocator_info, input_image_.data(), input_image_.size(), input_shape.data(), input_shape.size());
    input_tensors.push_back(std::move(input_tensor));
    //Ort::Value input_tensor_ = Ort::Value::CreateTensor<float>(allocator_info, input_image_.data(), input_image_.size(), input_shape.data(), input_shape.size());
    auto run_options = Ort::RunOptions();
    run_options.SetRunLogVerbosityLevel(2);
    run_options.SetRunLogSeverityLevel(3);
    auto cpu_output = session.Run(run_options, input_names.data(), input_tensors.data(), 1, output_names.data(), 1);
    Ort::IoBinding binding(session);
    binding.BindInput(input_names[0], input_tensors[0]);
    for(int idx =0; idx < 1; idx++){
            auto node_dims = cpu_output[idx].GetTypeInfo().GetTensorTypeAndShapeInfo().GetShape();
            size_t tensor_size = 1;
            for(int j = node_dims.size()-1; j >= 0; j--)
                tensor_size *= node_dims[j];
            ONNXTensorElementDataType tensor_type  = cpu_output[idx].GetTypeInfo().GetTensorTypeAndShapeInfo().GetElementType();              
            if(tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT)
                tensor_size *= sizeof(float);
            else if(tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8)
                tensor_size *= sizeof(uint8_t);
            else if(tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64)
                tensor_size *= sizeof(int64_t);              
            else{
            std::cout << "Un Supported output tensor_type\n";
                exit(0);}  
            void * outData = allocTensorMem(tensor_size, 1);
            auto output_tensor = Ort::Value::CreateTensor(allocator_info, (void *)outData, tensor_size, node_dims.data(), node_dims.size(),tensor_type);
            output_tensors.push_back(std::move(output_tensor));
            binding.BindOutput(output_names[idx], output_tensors[idx]);
        }
        session.Run(run_options, binding);
        std::cout<<test<<std::endl;
        float *inDdata = output_tensors.at(0).GetTensorMutableData<float>();
        std::cout<<inDdata<<std::endl;
/**************************************************************************************************************************/

0 admin 8 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

您好！

我在上面重新调整了您的代码格式，使其在 e2e 界面中更易阅读——使用代码块（“插入”->“代码”）会很有帮助。

Unknown 说：
但所有 fps 都将降至 5。这是否正常？

这是指模型中在 CPU 上运行的那部分吗？我无法判断这是否符合预期，这取决于您的网络的复杂程度。如上所述，CPU 通常比 NPU 慢 20-50 倍。

Unknown 说：
我们尝试删除 allownodes.txt 的内容并尝试将这些代码添加到后处理部分。

我不确定自己是否理解。你似乎尝试从 allowedNode.txt 中删除后处理层、以便 TIDL 将其委派给 Arm 内核。我想这可能可行、但在编译模型时处理此问题会更好

使用 deny_list 和 max_num_subgraphs 来控制哪些层被加速、哪些层不被加速。
- 例如、将 max_num_subgraphs 设置为1、并拒绝要停止使用 TIDL 的图层
  - -->这些图层之后的所有内容也应被拒绝、因为 TIDL 不会创建附加子图。

这是因为您将模型的一部分放在 NPU 上运行、其余部分放在 CPU 上运行吗？

Unknown 说：
使用 onnxruntime 在 CPU 上运行很容易。我试过很多次，但在 NPU 上失败了。

您能更具体一点吗？失败的是什么？您能否提供错误消息？

BR、
Reese

0 admin 7 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

[quote userid="360457" url="~/support/processors-group/processors/f/processors-forum/1461427/sk-am62a-lp-could-run-cpu-npu-in-the-same-time-based-on-edgeai-gst-apps-app_cpp/5664583#5664583"]

王小军说：
这是很容易使用 onnxruntime CPU 。我尝试了很多, 但在 npu 失败

您能更具体一点吗？失败的是什么？您能否提供错误消息？

[/quote]

它仍在 CPU 上运行

0 admin 7 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

没有任何报错，单个模型的推理部分耗时约 20 ms，但 CPU 占用率却离奇地达到 340%，没办法确定推理是否真的运行在 NPU 上。

对于 void * outData = allocTensorMem(tensor_size, 1); 这一行，不管第二个参数是 1（即 ptr = TIDLRT_allocSharedMem(64, size);）

还是 0（即 ptr = malloc(size);），CPU 占用率都是 340%。

0 admin 7 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

同时，每次都只能运行 128 次：for 循环在 128 次以内能正常结束，但如果超过 128 次，或者改用 while(1)，运行到第 128 次时程序就直接结束了。

0 admin 7 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

如果我在 warmup 之后实现类似摄像头循环读取图片再处理的逻辑，各次的推理时间（infer time）并不一致。

0 admin 7 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

您好！

我只能对大部分中文文本提供有限的帮助。

我确实清楚地看到推理时间波动很大，并且似乎呈现出固定的规律（15 ms、27 ms）。我过去曾见过，在 Linux 中为 Arm 内核启用频率调节（frequency scaling）时会出现这种现象。

$cat /sys/devices/system/cpu/cpufreq/policy0/scaling_available_governors
ondemand userspace performance

$cat /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
performance #should be either userspace or performance

我想您可以在代码的多个部分看到类似的行为。是这样吗？

0 admin 7 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

0 admin 7 个月前

TI__Guru**** 2396115 points

请注意，本文内容源自机器翻译，可能存在语法或其它翻译错误，仅供参考。如需获取准确内容，请参阅链接中的英语原文或自行翻译。

Ort::SessionOptions session_cpuop;
session_cpuop.SetLogSeverityLevel(3);
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ONNXModel");
string artifacts_path="/home/c666-phase1/artifacts";
std::string model_path_cpu = "/home/c666-phase1/model/modified_modified_test_phase1.onnx";
c_api_tidl_options *options = (c_api_tidl_options *)malloc(sizeof(c_api_tidl_options));
OrtStatus *def_status = OrtSessionsOptionsSetDefault_Tidl(options);
strcpy(options->artifacts_folder, artifacts_path.c_str());
OrtStatus *status = OrtSessionOptionsAppendExecutionProvider_Tidl(session_cpuop, options);
session_cpuop.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
Ort::Session session(env, model_path_cpu.c_str(), session_cpuop);
auto run_options = Ort::RunOptions();
run_options.SetRunLogVerbosityLevel(2);
run_options.SetRunLogSeverityLevel(3);



std::vector<int64_t> input_shape = {1, 3, 128, 128};
cv::Mat resized_cpu_image;
cv::Mat gaze_img = cv::imread("/home/an.jpg");
std::vector<char*> input_names,output_names;
std::vector<std::string> strings = {"outputall"}; // output
std::vector<std::string> strings1 = {"images"}; // input
std::vector<Ort::Value> output_tensors;
Ort::AllocatorWithDefaultOptions allocator;
for (int i = 0; i < 1; i++){
const char* exampleString = strings[i].c_str();
char* newCharPtr = new char[strlen(exampleString) + 1];
strcpy(newCharPtr, exampleString);
output_names.push_back(newCharPtr);
}
for (int i = 0; i < 1; i++){
const char* exampleString1 = strings1[i].c_str();
char* newCharPtr1 = new char[strlen(exampleString1) + 1];
strcpy(newCharPtr1, exampleString1);
input_names.push_back(newCharPtr1);
}
auto allocator_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
std::vector<Ort::Value> input_tensors;

gaze_img = cv::imread("/home/3.jpg");

cv::resize(gaze_img, resized_cpu_image, cv::Size(128, 128));
normalize_(resized_cpu_image);

Ort::Value input_tensor = Ort::Value::CreateTensor<float>(allocator_info, input_image_.data(), input_image_.size(), input_shape.data(), input_shape.size());
input_tensors.push_back(std::move(input_tensor));
input_shape.size());
//warmup
auto cpu_output = session.Run(run_options, input_names.data(), input_tensors.data(), 1, output_names.data(), 1);
Ort::IoBinding binding(session);
binding.BindInput(input_names[0], input_tensors[0]);
for(int idx =0; idx < 1; idx++){
auto node_dims = cpu_output[idx].GetTypeInfo().GetTensorTypeAndShapeInfo().GetShape();
size_t tensor_size = 1;
for(int j = node_dims.size()-1; j >= 0; j--)
tensor_size *= node_dims[j];
ONNXTensorElementDataType tensor_type = cpu_output[idx].GetTypeInfo().GetTensorTypeAndShapeInfo().GetElementType();
if(tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT)
tensor_size *= sizeof(float);
else if(tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8)
tensor_size *= sizeof(uint8_t);
else if(tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64)
tensor_size *= sizeof(int64_t);
else{
std::cout << "Un Supported output tensor_type\n";
exit(0);}
//cout<<tensor_size<<endl; 940
void * outData = allocTensorMem(tensor_size, 1);
auto output_tensor = Ort::Value::CreateTensor(allocator_info, (void *)outData, tensor_size, node_dims.data(), node_dims.size(),tensor_type);
output_tensors.push_back(std::move(output_tensor));
binding.BindOutput(output_names[idx], output_tensors[idx]);
}
cv::VideoCapture cap(3);
for (int i = 0; i < 1000; i++){

cap >> gaze_img;

cv::resize(gaze_img, resized_cpu_image, cv::Size(128, 128));
normalize_(resized_cpu_image);
clock_t stavg = clock();
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(allocator_info, input_image_.data(), input_image_.size(), input_shape.data(), input_shape.size());
input_tensors.push_back(std::move(input_tensor));
binding.BindInput(input_names[0], input_tensors[0]);
session.Run(run_options, binding);
clock_t endavg = clock();
printf("infer time:%f ms\n", (double)(endavg - stavg)*1000 / CLOCKS_PER_SEC);
float *inDdata = output_tensors.at(0).GetTensorMutableData<float>();

}

}