Why is the decoding speedup in my test so small? And why is the API different from the one used in the official demo?
Environment: 20-core CPU, CUDA 10.2, single T4 GPU.
```python
import requests
import cv2
import numpy as np
from urllib import parse


def get_image(image_url):
    if not image_url:
        return None
    try:
        image_url = parse.unquote(image_url)
        response = requests.get(image_url)
        if response.status_code != 200:
            print("get image failed!!!!!!")
            return " "
        return response.content
    except Exception as e:
        print(e)
        raise


def test_load_img(image, count):
    start = cv2.getTickCount()
    for num in range(0, count):
        np_image = np.frombuffer(bytearray(image), np.uint8)
        cv_image = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
    end1 = cv2.getTickCount()
    print("load img1 base line = %s" % ((end1 - start) / cv2.getTickFrequency()))
    return cv_image


def test_load_img_nvjpeg(image, count):
    from nvjpeg import NvJpeg
    nj = NvJpeg()
    start = cv2.getTickCount()
    for num in range(0, count):
        np_image = np.asarray(bytearray(image), dtype="uint8")
        # cv_image = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
        cv_image = nj.decode(np_image)
    end1 = cv2.getTickCount()
    print("load img nvjpeg base line = %s" % ((end1 - start) / cv2.getTickFrequency()))
    return cv_image


if __name__ == "__main__":
    image = get_image("http://cdn.weipaitang.com/img/20200313rli2rh7p-jdgj-7vyi-91qd-584099256046-W3024H4032")
    if image != "":
        count = 100
        test_load_img_nvjpeg(image, count)
        cv_image = test_load_img(image, count)
```
Results (100 decodes of the same image): OpenCV 14 s, pynvjpeg 12.6 s (GPU utilization was clearly visible during the run, and no errors were reported).
The official decoding demo, however, is written like this. Why is its API completely different from pynvjpeg's?
```cpp
int decode_images(const FileData &img_data, const std::vector<size_t> &img_len,
                  std::vector<nvjpegImage_t> &out, decode_params_t &params,
                  double &time) {
  CHECK_CUDA(cudaStreamSynchronize(params.stream));
  cudaEvent_t startEvent = NULL, stopEvent = NULL;
  float loopTime = 0;

  CHECK_CUDA(cudaEventCreate(&startEvent, cudaEventBlockingSync));
  CHECK_CUDA(cudaEventCreate(&stopEvent, cudaEventBlockingSync));

  std::vector<const unsigned char*> batched_bitstreams;
  std::vector<size_t> batched_bitstreams_size;
  std::vector<nvjpegImage_t> batched_output;

  // bit-streams that batched decode cannot handle
  std::vector<const unsigned char*> otherdecode_bitstreams;
  std::vector<size_t> otherdecode_bitstreams_size;
  std::vector<nvjpegImage_t> otherdecode_output;

  // if (params.hw_decode_available) {
  //   for (int i = 0; i < params.batch_size; i++) {
  //     // extract bitstream meta data to figure out whether a bit-stream can be decoded
  //     nvjpegJpegStreamParseHeader(params.nvjpeg_handle, (const unsigned char *)img_data[i].data(), img_len[i], params.jpeg_streams[0]);
  //     int isSupported = -1;
  //     nvjpegDecodeBatchedSupported(params.nvjpeg_handle, params.jpeg_streams[0], &isSupported);
  //     if (isSupported == 0) {
  //       batched_bitstreams.push_back((const unsigned char *)img_data[i].data());
  //       batched_bitstreams_size.push_back(img_len[i]);
  //       batched_output.push_back(out[i]);
  //     } else {
  //       otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data());
  //       otherdecode_bitstreams_size.push_back(img_len[i]);
  //       otherdecode_output.push_back(out[i]);
  //     }
  //   }
  // } else {
    for (int i = 0; i < params.batch_size; i++) {
      otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data());
      otherdecode_bitstreams_size.push_back(img_len[i]);
      otherdecode_output.push_back(out[i]);
    }
  // }
CHECK_CUDA(cudaEventRecord(startEvent, params.stream));
if(batched_bitstreams.size() > 0)
{
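  // Batched path: every bitstream that nvjpegDecodeBatchedSupported accepted
  // is decoded in a single nvjpegDecodeBatched call. With the support check
  // above commented out, batched_bitstreams stays empty and this branch is
  // skipped.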
CHECK_NVJPEG(
nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state,
batched_bitstreams.size(), 1, params.fmt));
CHECK_NVJPEG(nvjpegDecodeBatched(
params.nvjpeg_handle, params.nvjpeg_state, batched_bitstreams.data(),
batched_bitstreams_size.data(), batched_output.data(), params.stream));
}
if(otherdecode_bitstreams.size() > 0)
{
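  // Decoupled path: each JPEG is parsed, the host (CPU) stage of the decode
  // runs in nvjpegDecodeJpegHost, the intermediate data is copied to the GPU
  // with nvjpegDecodeJpegTransferToDevice, and the GPU stage finishes in
  // nvjpegDecodeJpegDevice. Two pinned buffers are alternated so host work on
  // the next image can overlap device work on the current one.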
CHECK_NVJPEG(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer));
int buffer_index = 0;
CHECK_NVJPEG(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, params.fmt));
for (int i = 0; i < params.batch_size; i++) {
CHECK_NVJPEG(
nvjpegJpegStreamParse(params.nvjpeg_handle, otherdecode_bitstreams[i], otherdecode_bitstreams_size[i],
0, 0, params.jpeg_streams[buffer_index]));
CHECK_NVJPEG(nvjpegStateAttachPinnedBuffer(params.nvjpeg_decoupled_state,
params.pinned_buffers[buffer_index]));
CHECK_NVJPEG(nvjpegDecodeJpegHost(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
params.nvjpeg_decode_params, params.jpeg_streams[buffer_index]));
CHECK_CUDA(cudaStreamSynchronize(params.stream));
CHECK_NVJPEG(nvjpegDecodeJpegTransferToDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
params.jpeg_streams[buffer_index], params.stream));
buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode to avoid an extra sync
CHECK_NVJPEG(nvjpegDecodeJpegDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
&otherdecode_output[i], params.stream));
}
}
CHECK_CUDA(cudaEventRecord(stopEvent, params.stream));
CHECK_CUDA(cudaEventSynchronize(stopEvent));
CHECK_CUDA(cudaEventElapsedTime(&loopTime, startEvent, stopEvent));
  time = static_cast<double>(loopTime);

  return EXIT_SUCCESS;
}
```
- Why is the decoding speedup so small?

GPU hardware decoding is faster than the CPU at the encode/decode step itself, but it adds host-to-device and device-to-host memory copies, so for a single image the end-to-end time stays close to the CPU's, especially for small images. Because a GPU has far more cores than a CPU, feeding it from multiple threads is the way to get a clearly better result; see the sketch below.
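A minimal sketch of that multi-threaded variant, for illustration only: the function names, the thread-local decoder, and the worker count are all hypothetical, and it assumes that one `NvJpeg` instance per thread is safe to use and that the decode call releases the GIL (if not, a process pool is the fallback).

```python
# Hypothetical multi-threaded benchmark (not part of pynvjpeg); assumes one
# NvJpeg instance per thread is safe and that decode releases the GIL.
import threading
from concurrent.futures import ThreadPoolExecutor

import cv2
import numpy as np
from nvjpeg import NvJpeg

_tls = threading.local()


def _decoder():
    # Create one decoder per worker thread, lazily on first use.
    if not hasattr(_tls, "nj"):
        _tls.nj = NvJpeg()
    return _tls.nj


def decode_one(jpeg_bytes):
    np_image = np.frombuffer(jpeg_bytes, np.uint8)
    return _decoder().decode(np_image)


def test_load_img_nvjpeg_mt(image, count, workers=4):
    start = cv2.getTickCount()
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # Decode the same JPEG `count` times across `workers` threads so that
        # host<->device copies of one image overlap with decoding of another.
        list(pool.map(decode_one, [image] * count))
    end = cv2.getTickCount()
    print("load img nvjpeg %d threads = %s"
          % (workers, (end - start) / cv2.getTickFrequency()))
```

It could be called in place of `test_load_img_nvjpeg(image, count)` in the script above to compare against the serial numbers.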
- The API is different from the official demo

PyNvJpeg aims to expose nvJPEG's encode/decode functionality behind an OpenCV-compatible interface, rather than mirroring the low-level batched/decoupled API used in the official sample.
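For illustration, a minimal sketch of what that OpenCV-compatible interface looks like in use; the input type accepted by `decode` and the exact `encode` signature are assumptions here and should be checked against the pynvjpeg README.

```python
# Illustrative sketch only: pynvjpeg used as an OpenCV-style codec. The input
# type accepted by decode and the encode signature are assumptions.
import cv2
import numpy as np
from nvjpeg import NvJpeg

nj = NvJpeg()

with open("test.jpg", "rb") as f:   # any local JPEG; the path is illustrative
    data = f.read()

buf = np.frombuffer(data, np.uint8)
cpu_img = cv2.imdecode(buf, cv2.IMREAD_COLOR)   # CPU path
gpu_img = nj.decode(buf)                        # GPU path, assumed to return the
                                                # same BGR HxWx3 uint8 array

print(cpu_img.shape, gpu_img.shape)

jpeg_bytes = nj.encode(gpu_img)   # assumed counterpart of cv2.imencode(".jpg", img)
```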