Question
I have TensorRT inference code (in Python). I want to run this code in ROS, but I get the error below when trying to allocate the buffers:
LogicError: explicit_context_dependent failed: invalid device context - no currently active context?
The code works fine outside the ROS package. A ROS node publishes an image and the code below receives that image and runs inference on it. The inference code is:
#!/usr/bin/env python
# Revision $Id$
import rospy
from std_msgs.msg import String
from cv_bridge import CvBridge
import cv2
import os
import numpy as np
import argparse
import torch
from torch.autograd import Variable
from torchvision import transforms
import torch.nn.functional as F
import torch._utils
from PIL import Image
from sensor_msgs.msg import Image as ImageMsg
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import random
import sys
import common
import shutil
from itertools import chain
TRT_LOGGER = trt.Logger()
# cuda.init()
class ModelData(object):
    def __init__(self):
        self.MODEL_PATH = "./MobileNet_v2_Final.onnx"  ## model converted from PyTorch to ONNX
        self.batch_size = 1
        self.num_classes = 3
        self.engine = build_int8_engine(self.MODEL_PATH, self.batch_size)
        self.context = self.engine.create_execution_context()

        ### ROS PART
        self.bridge_ROS = CvBridge()
        self.loop_rate = rospy.Rate(1)
        self.pub = rospy.Publisher('Image_Label', String, queue_size=1)
        print('INIT Successfully')
    def callback(self, msg):
        rospy.loginfo('Image received...')
        cv_image = self.bridge_ROS.imgmsg_to_cv2(msg, desired_encoding="passthrough")
        inputs, outputs, bindings, stream = common.allocate_buffers(self.engine)
        [output] = common.do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=self.batch_size)
    def listener(self):
        rospy.Subscriber("chatter", ImageMsg, self.callback)
        while not rospy.is_shutdown():
            rospy.loginfo('Getting image...')
            self.loop_rate.sleep()
def build_int8_engine(model_file, batch_size=32):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_batch_size = batch_size
        builder.max_workspace_size = common.GiB(1)
        # Parse the ONNX model file and build the engine.
        with open(model_file, 'rb') as model:
            parser.parse(model.read())
        return builder.build_cuda_engine(network)
if __name__ == '__main__':
    rospy.init_node("listener", anonymous=True)
    infer = ModelData()
    infer.listener()
The error is raised at stream = cuda.Stream() in the helper module below (imported as common):
#!/usr/bin/env python
# Revision $Id$
from itertools import chain
import argparse
import os
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    ctx.pop()
    del ctx
    return inputs, outputs, bindings, stream
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # [cuda.memcpy_htod(inp.device, inp.host) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # context.execute(batch_size=batch_size, bindings=bindings)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # [cuda.memcpy_dtoh(out.host, out.device) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
More info:
TensorRT: 6.1.5
Python: 2.7
rosversion: 1.14.3
rosdistro: melodic
Answer 1:
You need to explicitly create a CUDA device and load a CUDA context in the worker thread, i.e. your callback function, instead of relying on import pycuda.autoinit in the main thread, as follows:
import pycuda.driver as cuda
import threading

def callback():
    cuda.init()
    device = cuda.Device(0)  # enter your GPU id here
    ctx = device.make_context()

    allocate_buffers()  # load CUDA buffers or any other CUDA or TensorRT operations

    ctx.pop()  # very important

if __name__ == "__main__":
    worker_thread = threading.Thread(target=callback)
    worker_thread.start()
    worker_thread.join()
Note: do not forget to remove import pycuda.autoinit from both modules.
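For the ROS node in the question, a common variation of this fix (an assumption on my part, not part of the original answer) is to create the context once when the node starts and push/pop it around the TensorRT calls inside the callback, instead of recreating everything per message. In the sketch below, the attribute name self.cuda_ctx and the hard-coded model path are illustrative; build_int8_engine, allocate_buffers and do_inference refer to the functions already shown above:
import rospy
import pycuda.driver as cuda   # pycuda.autoinit is deliberately NOT imported
import common

class ModelData(object):
    def __init__(self):
        cuda.init()
        self.cuda_ctx = cuda.Device(0).make_context()  # create and push a context on the main thread
        # Build the engine while this context is current (build_int8_engine as in the question).
        self.engine = build_int8_engine("./MobileNet_v2_Final.onnx", 1)
        self.context = self.engine.create_execution_context()
        self.batch_size = 1
        self.cuda_ctx.pop()  # release it so the callback thread can push it later

    def callback(self, msg):
        self.cuda_ctx.push()  # make the context current on the ROS callback thread
        try:
            inputs, outputs, bindings, stream = common.allocate_buffers(self.engine)
            [output] = common.do_inference(self.context, bindings=bindings, inputs=inputs,
                                           outputs=outputs, stream=stream, batch_size=self.batch_size)
        finally:
            self.cuda_ctx.pop()  # always pop so the context does not leak onto the thread
The try/finally ensures the context is popped from the callback thread even if inference raises, so later callbacks do not see a stale context stack.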
This is also discussed in a question here
Source: https://stackoverflow.com/questions/60372729/get-logicerror-explicit-context-dependent-failed-invalid-device-context-no