Why is the decoding speedup in my test so small? And why is the API different from the one used in the official demo?
Environment: 20-core CPU, CUDA 10.2, single T4 GPU.
```python
import requests
import cv2
import numpy as np
from urllib import parse


def get_image(image_url):
    if not image_url:
        return None
    try:
        image_url = parse.unquote(image_url)
        response = requests.get(image_url)
        if response.status_code != 200:
            print("get image failed!!!!!!")
            return " "
        return response.content
    except Exception as e:
        print(e)
        raise


def test_load_img(image, count):
    start = cv2.getTickCount()
    for num in range(0, count):
        np_image = np.frombuffer(bytearray(image), np.uint8)
        cv_image = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
    end1 = cv2.getTickCount()
    print("load img1 base line = %s" % ((end1 - start) / cv2.getTickFrequency()))
    return cv_image


def test_load_img_nvjpeg(image, count):
    from nvjpeg import NvJpeg
    nj = NvJpeg()
    start = cv2.getTickCount()
    for num in range(0, count):
        np_image = np.asarray(bytearray(image), dtype="uint8")
        # cv_image = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
        cv_image = nj.decode(np_image)
    end1 = cv2.getTickCount()
    print("load img nvjpeg base line = %s" % ((end1 - start) / cv2.getTickFrequency()))
    return cv_image


if __name__ == "__main__":
    image = get_image("http://cdn.weipaitang.com/img/20200313rli2rh7p-jdgj-7vyi-91qd-584099256046-W3024H4032")
    if image != "":
        count = 100
        test_load_img_nvjpeg(image, count)
        cv_image = test_load_img(image, count)
```
Results (100 decodes of the same image): OpenCV 14 s, pynvjpeg 12.6 s (GPU utilization was clearly visible during the run, and no errors were reported).
The official decoding demo, however, is written like this. Why is its API completely different from pynvjpeg's?
```cpp
int decode_images(const FileData &img_data, const std::vector<size_t> &img_len,
                  std::vector<nvjpegImage_t> &out, decode_params_t &params,
                  double &time) {
  CHECK_CUDA(cudaStreamSynchronize(params.stream));
  cudaEvent_t startEvent = NULL, stopEvent = NULL;
  float loopTime = 0;

  CHECK_CUDA(cudaEventCreate(&startEvent, cudaEventBlockingSync));
  CHECK_CUDA(cudaEventCreate(&stopEvent, cudaEventBlockingSync));

  std::vector<const unsigned char*> batched_bitstreams;
  std::vector<size_t> batched_bitstreams_size;
  std::vector<nvjpegImage_t> batched_output;

  // bit-streams that batched decode cannot handle
  std::vector<const unsigned char*> otherdecode_bitstreams;
  std::vector<size_t> otherdecode_bitstreams_size;
  std::vector<nvjpegImage_t> otherdecode_output;

  // if (params.hw_decode_available) {
  //   for (int i = 0; i < params.batch_size; i++) {
  //     // extract bitstream meta data to figure out whether a bit-stream can be decoded
  //     nvjpegJpegStreamParseHeader(params.nvjpeg_handle, (const unsigned char *)img_data[i].data(), img_len[i], params.jpeg_streams[0]);
  //     int isSupported = -1;
  //     nvjpegDecodeBatchedSupported(params.nvjpeg_handle, params.jpeg_streams[0], &isSupported);
  //     if (isSupported == 0) {
  //       batched_bitstreams.push_back((const unsigned char *)img_data[i].data());
  //       batched_bitstreams_size.push_back(img_len[i]);
  //       batched_output.push_back(out[i]);
  //     } else {
  //       otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data());
  //       otherdecode_bitstreams_size.push_back(img_len[i]);
  //       otherdecode_output.push_back(out[i]);
  //     }
  //   }
  // } else {
    for (int i = 0; i < params.batch_size; i++) {
      otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data());
      otherdecode_bitstreams_size.push_back(img_len[i]);
      otherdecode_output.push_back(out[i]);
    }
  // }
CHECK_CUDA(cudaEventRecord(startEvent, params.stream));
if(batched_bitstreams.size() > 0)
{
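  // Batched path: every bitstream that nvjpegDecodeBatchedSupported accepted
  // is decoded in a single nvjpegDecodeBatched call. With the support check
  // above commented out, batched_bitstreams stays empty and this branch is
  // skipped.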
CHECK_NVJPEG(
nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state,
batched_bitstreams.size(), 1, params.fmt));
CHECK_NVJPEG(nvjpegDecodeBatched(
params.nvjpeg_handle, params.nvjpeg_state, batched_bitstreams.data(),
batched_bitstreams_size.data(), batched_output.data(), params.stream));
}
if(otherdecode_bitstreams.size() > 0)
{
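  // Decoupled path: each JPEG is parsed, the host (CPU) stage of the decode
  // runs in nvjpegDecodeJpegHost, the intermediate data is copied to the GPU
  // with nvjpegDecodeJpegTransferToDevice, and the GPU stage finishes in
  // nvjpegDecodeJpegDevice. Two pinned buffers are alternated so host work on
  // the next image can overlap device work on the current one.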
CHECK_NVJPEG(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer));
int buffer_index = 0;
CHECK_NVJPEG(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, params.fmt));
for (int i = 0; i < params.batch_size; i++) {
CHECK_NVJPEG(
nvjpegJpegStreamParse(params.nvjpeg_handle, otherdecode_bitstreams[i], otherdecode_bitstreams_size[i],
0, 0, params.jpeg_streams[buffer_index]));
CHECK_NVJPEG(nvjpegStateAttachPinnedBuffer(params.nvjpeg_decoupled_state,
params.pinned_buffers[buffer_index]));
CHECK_NVJPEG(nvjpegDecodeJpegHost(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
params.nvjpeg_decode_params, params.jpeg_streams[buffer_index]));
CHECK_CUDA(cudaStreamSynchronize(params.stream));
CHECK_NVJPEG(nvjpegDecodeJpegTransferToDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
params.jpeg_streams[buffer_index], params.stream));
buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode to avoid an extra sync
CHECK_NVJPEG(nvjpegDecodeJpegDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
&otherdecode_output[i], params.stream));
}
}
CHECK_CUDA(cudaEventRecord(stopEvent, params.stream));
CHECK_CUDA(cudaEventSynchronize(stopEvent));
CHECK_CUDA(cudaEventElapsedTime(&loopTime, startEvent, stopEvent));
  time = static_cast<double>(loopTime);

  return EXIT_SUCCESS;
}
```
- Why is the decoding speedup so small?

GPU hardware decoding is faster than the CPU at the encode/decode step itself, but it adds host-to-device and device-to-host memory copies, so for a single image the end-to-end time stays close to the CPU's, especially for small images. Because a GPU has far more cores than a CPU, feeding it from multiple threads is the way to get a clearly better result; see the sketch below.
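A minimal sketch of that multi-threaded variant, for illustration only: the function names, the thread-local decoder, and the worker count are all hypothetical, and it assumes that one `NvJpeg` instance per thread is safe to use and that the decode call releases the GIL (if not, a process pool is the fallback).

```python
# Hypothetical multi-threaded benchmark (not part of pynvjpeg); assumes one
# NvJpeg instance per thread is safe and that decode releases the GIL.
import threading
from concurrent.futures import ThreadPoolExecutor

import cv2
import numpy as np
from nvjpeg import NvJpeg

_tls = threading.local()


def _decoder():
    # Create one decoder per worker thread, lazily on first use.
    if not hasattr(_tls, "nj"):
        _tls.nj = NvJpeg()
    return _tls.nj


def decode_one(jpeg_bytes):
    np_image = np.frombuffer(jpeg_bytes, np.uint8)
    return _decoder().decode(np_image)


def test_load_img_nvjpeg_mt(image, count, workers=4):
    start = cv2.getTickCount()
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # Decode the same JPEG `count` times across `workers` threads so that
        # host<->device copies of one image overlap with decoding of another.
        list(pool.map(decode_one, [image] * count))
    end = cv2.getTickCount()
    print("load img nvjpeg %d threads = %s"
          % (workers, (end - start) / cv2.getTickFrequency()))
```

It could be called in place of `test_load_img_nvjpeg(image, count)` in the script above to compare against the serial numbers.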
- The API is different from the official demo

PyNvJpeg aims to expose nvJPEG's encode/decode functionality behind an OpenCV-compatible interface, rather than mirroring the low-level batched/decoupled API used in the official sample.
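For illustration, a minimal sketch of what that OpenCV-compatible interface looks like in use; the input type accepted by `decode` and the exact `encode` signature are assumptions here and should be checked against the pynvjpeg README.

```python
# Illustrative sketch only: pynvjpeg used as an OpenCV-style codec. The input
# type accepted by decode and the encode signature are assumptions.
import cv2
import numpy as np
from nvjpeg import NvJpeg

nj = NvJpeg()

with open("test.jpg", "rb") as f:   # any local JPEG; the path is illustrative
    data = f.read()

buf = np.frombuffer(data, np.uint8)
cpu_img = cv2.imdecode(buf, cv2.IMREAD_COLOR)   # CPU path
gpu_img = nj.decode(buf)                        # GPU path, assumed to return the
                                                # same BGR HxWx3 uint8 array

print(cpu_img.shape, gpu_img.shape)

jpeg_bytes = nj.encode(gpu_img)   # assumed counterpart of cv2.imencode(".jpg", img)
```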