首页 > 解决方案 > 通过多线程使用 GPU 委托 v2 api 初始化多个 tflite 模型

问题描述

我尝试通过多线程同时使用 GPU delegate v2 API 初始化多个 tflite 模型,但发现多线程并行初始化多个模型与单线程依次初始化多个模型所需的时间大致相同,谁能解释一下为什么?

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <thread>

#include <pthread.h>

#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/optional_debug_tools.h"
#include "tensorflow/lite/delegates/gpu/delegate.h"

static int
modify_graph_with_delegate (tflite_interpreter_t *p, tflite_createopt_t *opt)
{
    TfLiteDelegate *delegate = NULL;

#if defined (USE_GPU_DELEGATEV2)
    const TfLiteGpuDelegateOptionsV2 options = {
        .is_precision_loss_allowed = 1, // FP16
        .inference_preference = TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER,
        .inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY,
        .inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
        .inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO,
    };
    delegate = TfLiteGpuDelegateV2Create(&options);
#endif

    if (!delegate)
        return 0;

    if (p->interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk)
    {
        DBG_LOGE ("ERR: %s(%d)\n", __FILE__, __LINE__);
        return -1;
    }

    return 0;
}


int
tflite_create_interpreter_from_file (tflite_interpreter_t *p, const char *model_path)
{
    p->model = FlatBufferModel::BuildFromFile (model_path);
    if (!p->model)
    {
        DBG_LOGE ("ERR: %s(%d)\n", __FILE__, __LINE__);
        return -1;
    }

    InterpreterBuilder(*(p->model), p->resolver)(&(p->interpreter));
    if (!p->interpreter)
    {
        DBG_LOGE ("ERR: %s(%d)\n", __FILE__, __LINE__);
        return -1;
    }

    int num_threads = std::thread::hardware_concurrency();
    char *env_tflite_num_threads = getenv ("FORCE_TFLITE_NUM_THREADS");
    if (env_tflite_num_threads)
    {
        num_threads = atoi (env_tflite_num_threads);
        DBG_LOGI ("@@@@@@ FORCE_TFLITE_NUM_THREADS=%d\n", num_threads);
    }
    DBG_LOG ("@@@@@@ TFLITE_NUM_THREADS=%d\n", num_threads);
    p->interpreter->SetNumThreads(num_threads);

    if (modify_graph_with_delegate (p, NULL) < 0)
    {
        DBG_LOGE ("ERR: %s(%d)\n", __FILE__, __LINE__);
        //return -1;
    }

    if (p->interpreter->AllocateTensors() != kTfLiteOk)
    {
        DBG_LOGE ("ERR: %s(%d)\n", __FILE__, __LINE__);
        return -1;
    }

    return 0;
}

void *
init_tflite_face_detect(void *model_buf_t)
{
    const char *model_buf = (const char *)model_buf_t;
    /* Face detect */
    int iret = tflite_create_interpreter_from_file(&s_detect_interpreter, model_buf);

    tflite_get_tensor_by_name (&s_detect_interpreter, 0,
            "normalized_input_image_tensor", &s_detect_tensor_input);  //    input
    tflite_get_tensor_by_name (&s_detect_interpreter, 1,
            "TFLite_Detection_PostProcess",  &s_detect_tensor_bboxes);  //   bboxes
    tflite_get_tensor_by_name (&s_detect_interpreter, 1,
            "TFLite_Detection_PostProcess:1",  &s_detect_tensor_classes);  //  classes
    tflite_get_tensor_by_name (&s_detect_interpreter, 1,
            "TFLite_Detection_PostProcess:2", &s_detect_tensor_scores); //   score of faces
    tflite_get_tensor_by_name (&s_detect_interpreter, 1,
            "TFLite_Detection_PostProcess:3", &s_detect_tensor_number); //  number of faces

    return 0;

int 
init_multhread(const char *FD_Model_Path, const char *IQA_Model_Path){
    clock_t start,end;
    start = clock();
    pthread_create(&pid, NULL, init_tflite_face_detect, (void *)FD_Model_Path);
    pthread_create(&pid2, NULL, init_tflite_face_iqa, (void *)IQA_Model_Path);
    pthread_join(pid, NULL);
    pthread_join(pid2, NULL);
    end=clock();
    double endtime=(double)(end-start)/CLOCKS_PER_SEC;
    printf("======>multithread cost time:%f" % endtime);

}

int
init_sigle_thread(const char *FD_Model_Path, const char *IQA_Model_Path){
    clock_t start,end;
    start = clock();
    init_tflite_face_detect(FD_Model_Path);
    init_tflite_face_iqa(IQA_Model_Path);
    end=clock();
    double endtime=(double)(end-start)/CLOCKS_PER_SEC;
    printf("======>single thread cost time:%f" % endtime);
}

int
main(){
    const char * FD_Model_Path = "fd_xxx.tflite";
    const char * IQA_Model_Path = "iqa_xxx.tflite";
    init_multhread(const char *FD_Model_Path, const char *IQA_Model_Path);
    init_sigle_thread(const char *FD_Model_Path, const char *IQA_Model_Path);
}

======>多线程耗时:1.8s
======>单线程耗时:1.9s

标签: multithreading, tensorflow, delegates, initialization, tensorflow-lite

解决方案


推荐阅读