
LogicError: cuMemcpyHtoDAsync failed: invalid argument

Open ishandutta0098 opened this issue 4 years ago • 2 comments

I am trying to run a TensorRT inference script for a BERT model, but I get a LogicError while copying the inputs from the host to the device. Here is the relevant part of the code:

Code

    # Load the BERT-Base Engine
    with open(trt_model_path, "rb") as f, \
        trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, \
        engine.create_execution_context() as context:

        # We always use batch size 1.
        input_shape = (1, MAX_LEN)
        print(f'input_shape: {input_shape}')    # prints (1, 256)
        
        input_nbytes = trt.volume(input_shape) * trt.float32.itemsize
        print(f'input_nbytes: {input_nbytes}')  # prints 1024


        # Allocate device memory for inputs.
        d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]

        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()

        # Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)
        # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.
        for binding in range(3):
            context.set_binding_shape(binding, input_shape)
        assert context.all_binding_shapes_specified

        # Allocate output buffer by querying the size from the context. This may be different for different input shapes.
        h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)
        d_output = cuda.mem_alloc(h_output.nbytes)

        print("\nRunning Inference...")

        _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
            "NetworkOutput",
            ["start_logits", "end_logits", "feature_index"])
        networkOutputs = []

        eval_time_elapsed = 0

        for bi, d in tqdm(enumerate(test_data_loader), total=len(test_data_loader)):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]

            # Copy inputs
            ids = cuda.register_host_memory((np.ascontiguousarray(ids)).ravel())
            token_type_ids = cuda.register_host_memory((np.ascontiguousarray(token_type_ids)).ravel())
            mask = cuda.register_host_memory((np.ascontiguousarray(mask)).ravel())

            eval_start_time = time.time()
            cuda.memcpy_htod_async(d_inputs[0], ids, stream)
            cuda.memcpy_htod_async(d_inputs[1], token_type_ids, stream)
            cuda.memcpy_htod_async(d_inputs[2], mask, stream)

            # Run inference
            context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
            # Synchronize the stream
            stream.synchronize()
            eval_time_elapsed += (time.time() - eval_start_time)

            # Transfer predictions back from GPU
            cuda.memcpy_dtoh_async(h_output, d_output, stream)
            stream.synchronize()

        for index, batch in enumerate(h_output):
            # Data post-processing
            networkOutputs.append(_NetworkOutput(
                start_logits = np.array(batch.squeeze()[:, 0]),
                end_logits = np.array(batch.squeeze()[:, 1]),
                feature_index = feature_index
                ))

        eval_time_elapsed /= len(features)

Error

--------------------------------------------------------------------------
LogicError                                Traceback (most recent call last)
<ipython-input-151-e7543b92825d> in <module>
      1 run_trt_inference(
      2         data_path = "/workspace/data/jigsaw-multilingual/input/jigsaw-data/test.csv",
----> 3         trt_model_path = "mBERT_fp16.engine"
      4     )

<ipython-input-150-fb4fbcaff6fa> in run_trt_inference(data_path, trt_model_path)
    136 
    137             eval_start_time = time.time()
--> 138             cuda.memcpy_htod_async(d_inputs[0], ids, stream)
    139             cuda.memcpy_htod_async(d_inputs[1], token_type_ids, stream)
    140             cuda.memcpy_htod_async(d_inputs[2], mask, stream)

LogicError: cuMemcpyHtoDAsync failed: invalid argument

My Observations

As far as I can tell, this error occurs because the sizes of d_inputs[0] and ids are inconsistent, so the inputs cannot be copied from host memory to the device. But I do not understand why this is happening or how to fix it.
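A quick way to confirm the size mismatch (a minimal sketch, assuming the token ids come out of the PyTorch DataLoader as int64, which is the default for integer tensors):

```python
import numpy as np

MAX_LEN = 256  # from the script above
input_shape = (1, MAX_LEN)

# The allocation in the script assumes float32 (4 bytes per element):
alloc_nbytes = int(np.prod(input_shape)) * np.dtype(np.float32).itemsize
print(alloc_nbytes)  # 1024

# BERT token ids from a PyTorch DataLoader are typically int64 (8 bytes per element):
ids = np.zeros(input_shape, dtype=np.int64).ravel()
print(ids.nbytes)  # 2048
```

The async copy tries to write 2048 bytes into a 1024-byte device buffer, which the CUDA driver rejects as an invalid argument.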

ishandutta0098 avatar Jun 16 '21 12:06 ishandutta0098

Check dmesg. Your kernel may have crashed due to invalid memory access and killed the context.

inducer avatar Jun 16 '21 14:06 inducer

The error is fixed; the correction was in the calculation of input_nbytes. It should be
input_nbytes = 4 * trt.volume(input_shape) * trt.float32.itemsize
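An alternative that avoids the hard-coded factor (a sketch, assuming the ids are int64) is to size the buffer from the input dtype. For (1, 256) int64 ids this gives 2048 bytes; the fix above over-allocates (4096 bytes), which is harmless, while the original 1024 bytes was too small for the copy:

```python
import numpy as np

MAX_LEN = 256
input_shape = (1, MAX_LEN)

# Derive the byte count from the dtype the DataLoader actually produces
# (assumption: int64 token ids), instead of a hard-coded multiplier:
input_dtype = np.int64
input_nbytes = int(np.prod(input_shape)) * np.dtype(input_dtype).itemsize
print(input_nbytes)  # 2048
```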

But @inducer, I face another error while trying to obtain the networkOutputs. Since I am working on a classification problem, this part of the code should be different, as I don't need start_logits or end_logits:

for index, batch in enumerate(h_output):
    # Data post-processing
    networkOutputs.append(_NetworkOutput(
        start_logits = np.array(batch.squeeze()[:, 0]),
        end_logits = np.array(batch.squeeze()[:, 1]),
        feature_index = feature_index
    ))

How do I obtain the predictions then?
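For a classification head there are no start/end logits to slice; the output buffer is just per-class logits. A minimal post-processing sketch (hypothetical helper, assuming the output binding has shape (batch, num_classes) as reported by context.get_binding_shape(3)):

```python
import numpy as np

def postprocess_classification(h_output):
    """Turn raw logits from the engine into class probabilities and labels.

    Assumes h_output has shape (batch, num_classes); adjust if the engine's
    output binding reports a different shape.
    """
    logits = np.asarray(h_output, dtype=np.float32)
    # Numerically stable softmax over the class axis
    shifted = logits - logits.max(axis=-1, keepdims=True)
    probs = np.exp(shifted) / np.exp(shifted).sum(axis=-1, keepdims=True)
    preds = probs.argmax(axis=-1)
    return probs, preds

# Example with a fake (1, 2) logits buffer:
probs, preds = postprocess_classification(np.array([[0.2, 1.5]]))
print(preds)  # [1]
```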

ishandutta0098 avatar Jun 17 '21 05:06 ishandutta0098