
Why doesn't my Kubernetes node recognize the GPU after successfully installing my drivers and Containerd?

Open xiaoxiaoboyyds opened this issue 1 year ago • 16 comments

The template below is mostly useful for bug reports and support questions. Feel free to remove anything which doesn't apply to you and add more information where it makes sense.

Important Note: NVIDIA AI Enterprise customers can get support from NVIDIA Enterprise Support; please open a case there.

1. Quick Debug Information

  • OS/Version (e.g. RHEL8.6, Ubuntu22.04): Ubuntu 22.04
  • Kernel Version: 5.15.0-107-generic (NVIDIA driver 390.157)
  • Container Runtime Type/Version (e.g. Containerd, CRI-O, Docker): Containerd
  • K8s Flavor/Version (e.g. K8s, OCP, Rancher, GKE, EKS): K8s

2. Issue or feature description

Why doesn't my Kubernetes node recognize the GPU after successfully installing my drivers and Containerd? This is the content of /etc/containerd/config.toml:

disabled_plugins = []
imports = []
oom_score = 0
plugin_dir = ""
required_plugins = []
root = "/var/lib/containerd"
state = "/run/containerd"
temp = ""
version = 2

[cgroup]
  path = ""

[debug]
  address = ""
  format = ""
  gid = 0
  level = ""
  uid = 0

[grpc]
  address = "/run/containerd/containerd.sock"
  gid = 0
  max_recv_message_size = 16777216
  max_send_message_size = 16777216
  tcp_address = ""
  tcp_tls_ca = ""
  tcp_tls_cert = ""
  tcp_tls_key = ""
  uid = 0

[metrics]
  address = ""
  grpc_histogram = false

[plugins]

  [plugins."io.containerd.gc.v1.scheduler"]
    deletion_threshold = 0
    mutation_threshold = 100
    pause_threshold = 0.02
    schedule_delay = "0s"
    startup_delay = "100ms"

  [plugins."io.containerd.grpc.v1.cri"]
    cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"]
    device_ownership_from_security_context = false
    disable_apparmor = false
    disable_cgroup = false
    disable_hugetlb_controller = true
    disable_proc_mount = false
    disable_tcp_service = true
    drain_exec_sync_io_timeout = "0s"
    enable_cdi = false
    enable_selinux = false
    enable_tls_streaming = false
    enable_unprivileged_icmp = false
    enable_unprivileged_ports = false
    ignore_image_defined_volumes = false
    image_pull_progress_timeout = "1m0s"
    max_concurrent_downloads = 3
    max_container_log_line_size = 16384
    netns_mounts_under_state_dir = false
    restrict_oom_score_adj = false
    sandbox_image = "registry.aliyuncs.com/google_containers/pause:3.8"
    selinux_category_range = 1024
    stats_collect_period = 10
    stream_idle_timeout = "4h0m0s"
    stream_server_address = "127.0.0.1"
    stream_server_port = "0"
    systemd_cgroup = false
    tolerate_missing_hugetlb_controller = true
    unset_seccomp_profile = ""

    [plugins."io.containerd.grpc.v1.cri".cni]
      bin_dir = "/opt/cni/bin"
      conf_dir = "/etc/cni/net.d"
      conf_template = ""
      ip_pref = ""
      max_conf_num = 1
      setup_serially = false

    [plugins."io.containerd.grpc.v1.cri".containerd]
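      # note: the "runc" default on the next line is why GPU libraries are never
      # injected; the replies below either change it to "nvidia" or add a RuntimeClass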
      default_runtime_name = "runc"
      disable_snapshot_annotations = true
      discard_unpacked_layers = false
      ignore_blockio_not_enabled_errors = false
      ignore_rdt_not_enabled_errors = false
      no_pivot = false
      snapshotter = "overlayfs"

      [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
        base_runtime_spec = ""
        cni_conf_dir = ""
        cni_max_conf_num = 0
        container_annotations = []
        pod_annotations = []
        privileged_without_host_devices = false
        privileged_without_host_devices_all_devices_allowed = false
        runtime_engine = ""
        runtime_path = ""
        runtime_root = ""
        runtime_type = ""
        sandbox_mode = ""
        snapshotter = ""

        [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options]

      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          base_runtime_spec = ""
          cni_conf_dir = ""
          cni_max_conf_num = 0
          container_annotations = []
          pod_annotations = []
          privileged_without_host_devices = false
          privileged_without_host_devices_all_devices_allowed = false
          runtime_engine = ""
          runtime_path = ""
          runtime_root = ""
          runtime_type = "io.containerd.runc.v2"
          sandbox_mode = "podsandbox"
          snapshotter = ""

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            BinaryName = "/usr/bin/nvidia-container-runtime"
            CriuImagePath = ""
            CriuPath = ""
            CriuWorkPath = ""
            IoGid = 0
            IoUid = 0
            NoNewKeyring = false
            NoPivotRoot = false
            Root = ""
            ShimCgroup = ""

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
          base_runtime_spec = ""
          cni_conf_dir = ""
          cni_max_conf_num = 0
          container_annotations = []
          pod_annotations = []
          privileged_without_host_devices = false
          privileged_without_host_devices_all_devices_allowed = false
          runtime_engine = ""
          runtime_path = ""
          runtime_root = ""
          runtime_type = "io.containerd.runc.v2"
          sandbox_mode = "podsandbox"
          snapshotter = ""

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
            BinaryName = ""
            CriuImagePath = ""
            CriuPath = ""
            CriuWorkPath = ""
            IoGid = 0
            IoUid = 0
            NoNewKeyring = false
            NoPivotRoot = false
            Root = ""
            ShimCgroup = ""

      [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
        base_runtime_spec = ""
        cni_conf_dir = ""
        cni_max_conf_num = 0
        container_annotations = []
        pod_annotations = []
        privileged_without_host_devices = false
        privileged_without_host_devices_all_devices_allowed = false
        runtime_engine = ""
        runtime_path = ""
        runtime_root = ""
        runtime_type = ""
        sandbox_mode = ""
        snapshotter = ""

        [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options]

    [plugins."io.containerd.grpc.v1.cri".image_decryption]
      key_model = "node"

    [plugins."io.containerd.grpc.v1.cri".registry]
      config_path = "/etc/containerd/certs.d"

      [plugins."io.containerd.grpc.v1.cri".registry.auths]

      [plugins."io.containerd.grpc.v1.cri".registry.configs]

      [plugins."io.containerd.grpc.v1.cri".registry.headers]

      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]

    [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
      tls_cert_file = ""
      tls_key_file = ""

  [plugins."io.containerd.internal.v1.opt"]
    path = "/opt/containerd"

  [plugins."io.containerd.internal.v1.restart"]
    interval = "10s"

  [plugins."io.containerd.internal.v1.tracing"]
    sampling_ratio = 1.0
    service_name = "containerd"

  [plugins."io.containerd.metadata.v1.bolt"]
    content_sharing_policy = "shared"

  [plugins."io.containerd.monitor.v1.cgroups"]
    no_prometheus = false

  [plugins."io.containerd.nri.v1.nri"]
    disable = true
    disable_connections = false
    plugin_config_path = "/etc/nri/conf.d"
    plugin_path = "/opt/nri/plugins"
    plugin_registration_timeout = "5s"
    plugin_request_timeout = "2s"
    socket_path = "/var/run/nri/nri.sock"

  [plugins."io.containerd.runtime.v1.linux"]
    no_shim = false
    runtime = "runc"
    runtime_root = ""
    shim = "containerd-shim"
    shim_debug = false

  [plugins."io.containerd.runtime.v2.task"]
    platforms = ["linux/amd64"]
    sched_core = false

  [plugins."io.containerd.service.v1.diff-service"]
    default = ["walking"]

  [plugins."io.containerd.service.v1.tasks-service"]
    blockio_config_file = ""
    rdt_config_file = ""

  [plugins."io.containerd.snapshotter.v1.aufs"]
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.btrfs"]
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.devmapper"]
    async_remove = false
    base_image_size = ""
    discard_blocks = false
    fs_options = ""
    fs_type = ""
    pool_name = ""
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.native"]
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.overlayfs"]
    root_path = ""
    upperdir_label = false

  [plugins."io.containerd.snapshotter.v1.zfs"]
    root_path = ""

  [plugins."io.containerd.tracing.processor.v1.otlp"]
    endpoint = ""
    insecure = false
    protocol = ""

  [plugins."io.containerd.transfer.v1.local"]
    config_path = ""
    max_concurrent_downloads = 3
    max_concurrent_uploaded_layers = 3

    [[plugins."io.containerd.transfer.v1.local".unpack_config]]
      differ = ""
      platform = "linux/amd64"
      snapshotter = "overlayfs"

[proxy_plugins]

[stream_processors]

  [stream_processors."io.containerd.ocicrypt.decoder.v1.tar"]
    accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"]
    args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
    env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
    path = "ctd-decoder"
    returns = "application/vnd.oci.image.layer.v1.tar"

  [stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"]
    accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"]
    args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
    env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
    path = "ctd-decoder"
    returns = "application/vnd.oci.image.layer.v1.tar+gzip"

[timeouts]
  "io.containerd.timeout.bolt.open" = "0s"
  "io.containerd.timeout.metrics.shimstats" = "2s"
  "io.containerd.timeout.shim.cleanup" = "5s"
  "io.containerd.timeout.shim.load" = "5s"
  "io.containerd.timeout.shim.shutdown" = "3s"
  "io.containerd.timeout.task.state" = "2s"

[ttrpc]
  address = ""
  gid = 0
  uid = 0

This is the output of nvidia-smi:

Wed Jun 26 11:10:27 2024
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.157                Driver Version: 390.157                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  Off  | 00000000:00:08.0 Off |                    0 |
| N/A   31C    P0    21W / 300W |      0MiB / 32510MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

But kubectl describe nodes edgenode-4a9b shows no nvidia.com/gpu resource in Capacity or Allocatable:

Name:               edgenode-4a9b
Roles:              agent,edge
Labels:             beta.kubernetes.io/arch=amd64
                    beta.kubernetes.io/os=linux
                    kubernetes.io/arch=amd64
                    kubernetes.io/hostname=edgenode-4a9b
                    kubernetes.io/os=linux
                    node-role.kubernetes.io/agent=
                    node-role.kubernetes.io/edge=
Annotations:        node.alpha.kubernetes.io/ttl: 0
                    volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp:  Wed, 26 Jun 2024 10:48:04 +0800
Taints:             node-role.kubernetes.io/edge:NoSchedule
Unschedulable:      false
Lease:
  HolderIdentity:  edgenode-4a9b
  AcquireTime:     <unset>
  RenewTime:       Wed, 26 Jun 2024 10:48:25 +0800
Conditions:
  Type             Status  LastHeartbeatTime                 LastTransitionTime                Reason                       Message
  ----             ------  -----------------                 ------------------                ------                       -------
  MemoryPressure   False   Wed, 26 Jun 2024 10:48:04 +0800   Wed, 26 Jun 2024 10:48:03 +0800   KubeletHasSufficientMemory   kubelet has sufficient memory available
  DiskPressure     False   Wed, 26 Jun 2024 10:48:04 +0800   Wed, 26 Jun 2024 10:48:03 +0800   KubeletHasNoDiskPressure     kubelet has no disk pressure
  PIDPressure      False   Wed, 26 Jun 2024 10:48:04 +0800   Wed, 26 Jun 2024 10:48:03 +0800   KubeletHasSufficientPID      kubelet has sufficient PID available
  Ready            True    Wed, 26 Jun 2024 10:48:04 +0800   Wed, 26 Jun 2024 10:48:03 +0800   EdgeReady                    edge is posting ready status. AppArmor enabled
Addresses:
  InternalIP:  10.206.16.8
  Hostname:    edgenode-4a9b
Capacity:
  cpu:                10
  ephemeral-storage:  1056753416Ki
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             40284132Ki
  pods:               110
Allocatable:
  cpu:                10
  ephemeral-storage:  973903946574
  hugepages-1Gi:      0
  hugepages-2Mi:      0
  memory:             40181732Ki
  pods:               110
System Info:
  Machine ID:                 f969dca6f9284efcbd52866be3f39259
  System UUID:                f969dca6-f928-4efc-bd52-866be3f39259
  Boot ID:                    d91faf3a-fe03-4d3c-9873-44cba386a20c
  Kernel Version:             5.15.0-107-generic
  OS Image:                   Ubuntu 22.04 LTS
  Operating System:           linux
  Architecture:               amd64
  Container Runtime Version:  containerd://1.7.12
  Kubelet Version:            v1.23.15-kubeedge-v1.13.0
  Kube-Proxy Version:         v0.0.0-master+$Format:%H$
PodCIDR:                      192.168.7.0/24
PodCIDRs:                     192.168.7.0/24
Non-terminated Pods:          (0 in total)
  Namespace                   Name    CPU Requests  CPU Limits  Memory Requests  Memory Limits  Age
  ---------                   ----    ------------  ----------  ---------------  -------------  ---
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource           Requests  Limits
  --------           --------  ------
  cpu                0 (0%)    0 (0%)
  memory             0 (0%)    0 (0%)
  ephemeral-storage  0 (0%)    0 (0%)
  hugepages-1Gi      0 (0%)    0 (0%)
  hugepages-2Mi      0 (0%)    0 (0%)
Events:              <none>

xiaoxiaoboyyds avatar Jun 26 '24 03:06 xiaoxiaoboyyds

First, the value of default_runtime_name in containerd should be nvidia. After setting it, follow the documentation to enable GPU support in Kubernetes.

Just one command.

$ kubectl create -f  https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml

Remember to restart containerd and the kubelet.
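
For reference, a minimal sketch of that change in /etc/containerd/config.toml (the table name matches the config pasted above; only this one line differs):

[plugins."io.containerd.grpc.v1.cri".containerd]
  default_runtime_name = "nvidia"

Then restart both services:

sudo systemctl restart containerd
sudo systemctl restart kubelet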

ZYWNB666 avatar Jun 30 '24 09:06 ZYWNB666

Same error with a Tesla card running a Rancher deployment:

root@nvidia-device-plugin-daemonset-nvbr7:/# nvidia-device-plugin 
2024/07/21 15:56:05 Starting FS watcher.
2024/07/21 15:56:05 Starting OS watcher.
2024/07/21 15:56:05 Starting Plugins.
2024/07/21 15:56:05 Loading configuration.
2024/07/21 15:56:05 Updating config with default resource matching patterns.
2024/07/21 15:56:05 
Running with config:
{
  "version": "v1",
  "flags": {
    "migStrategy": "none",
    "failOnInitError": false,
    "nvidiaDriverRoot": "/",
    "gdsEnabled": false,
    "mofedEnabled": false,
    "plugin": {
      "passDeviceSpecs": false,
      "deviceListStrategy": "envvar",
      "deviceIDStrategy": "uuid"
    }
  },
  "resources": {
    "gpus": [
      {
        "pattern": "*",
        "name": "nvidia.com/gpu"
      }
    ]
  },
  "sharing": {
    "timeSlicing": {}
  }
}
2024/07/21 15:56:05 Retreiving plugins.
2024/07/21 15:56:05 Detected non-NVML platform: could not load NVML: libnvidia-ml.so.1: cannot open shared object file: No such file or directory
2024/07/21 15:56:05 Detected non-Tegra platform: /sys/devices/soc0/family file not found
2024/07/21 15:56:05 Incompatible platform detected
2024/07/21 15:56:05 If this is a GPU node, did you configure the NVIDIA Container Toolkit?
2024/07/21 15:56:05 You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites
2024/07/21 15:56:05 You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start
2024/07/21 15:56:05 If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes
2024/07/21 15:56:05 No devices found. Waiting indefinitely.
  • nvidia-smi works as expected

  • I am using the above containerd toml file with the default_runtime_name set to nvidia
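
Given the "could not load NVML: libnvidia-ml.so.1" line above, one sanity check worth running on the node itself is nvidia-container-cli, which ships with the NVIDIA Container Toolkit (a hedged check, not part of the plugin):

# on the GPU node, outside any container
nvidia-container-cli --load-kmods info
# this should print the driver version and list the Tesla card;
# an error here means libnvidia-container cannot see the driver,
# so nothing will be injected into containers either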

EDIT:

After reviewing more documentation and fixing some issues in my config file, I get a different error:

}
I0721 17:00:46.190858      46 main.go:317] Retrieving plugins.
E0721 17:00:46.191062      46 factory.go:87] Incompatible strategy detected auto
E0721 17:00:46.191076      46 factory.go:88] If this is a GPU node, did you configure the NVIDIA Container Toolkit?
E0721 17:00:46.191084      46 factory.go:89] You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites
E0721 17:00:46.191093      46 factory.go:90] You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start
E0721 17:00:46.191101      46 factory.go:91] If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes
I0721 17:00:46.191112      46 main.go:346] No devices found. Waiting indefinitely.

andrewmeyer avatar Jul 21 '24 16:07 andrewmeyer

Has this problem been solved?

MasonXon avatar Sep 27 '24 05:09 MasonXon

Can someone share any findings on this issue? I spent the entire last weekend trying to get this working, but I can't seem to make it work.

harshsavasil avatar Oct 07 '24 07:10 harshsavasil

I patched the nvdp-nvidia-device-plugin daemonset with the following command:

kubectl -n nvidia-device-plugin patch ds nvdp-nvidia-device-plugin \
  --type='json' \
  -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--device-discovery-strategy=tegra"]}]'

This is equivalent to manually setting the device discovery strategy to tegra. My GPU is a 4090.
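
To confirm the argument landed (a quick check, assuming the same namespace and daemonset name as above):

kubectl -n nvidia-device-plugin get ds nvdp-nvidia-device-plugin \
  -o jsonpath='{.spec.template.spec.containers[0].args}'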

MasonXon avatar Oct 07 '24 08:10 MasonXon

Thanks @MasonXon ! It worked.

harshsavasil avatar Oct 07 '24 09:10 harshsavasil

I had to set the default_runtime_name to nvidia, like @ZYWNB666 recommended. nvidia-ctk runtime configure --runtime=containerd added all the runtime configs for the CTK, but did not change that line.

After manually editing /etc/containerd/config.toml, restarting containerd via systemctl, and restarting the daemonset pod, it worked!

mattlgy avatar Oct 10 '24 18:10 mattlgy

Adding --set-as-default would have given you what you want:

nvidia-ctk runtime configure --runtime=containerd --set-as-default
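
A quick way to verify the flag did what was expected (assuming the default config path):

sudo systemctl restart containerd
grep default_runtime_name /etc/containerd/config.toml
# expect: default_runtime_name = "nvidia"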

klueska avatar Oct 10 '24 21:10 klueska

I patched the nvdp-nvidia-device-plugin daemonset with the following command:

kubectl -n nvidia-device-plugin patch ds nvdp-nvidia-device-plugin \
  --type='json' \
  -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--device-discovery-strategy=tegra"]}]'

This is equivalent to manually setting the device discovery strategy to tegra. My GPU is a 4090.

This worked for me, but GPUs show up on all my nodes, even ones without GPUs.

sipvoip avatar Dec 05 '24 18:12 sipvoip

Maybe --device-discovery-strategy=nvml is better.

MasonXon avatar Dec 06 '24 03:12 MasonXon

In my case, I'm using k3s, and its containerd configuration is not under /etc/containerd/ but under /var/lib/rancher/k3s/agent/etc/containerd/. The k3s docs on Advanced Options - Alternative Container Runtime Support mention the --default-runtime argument to the k3s agent command to set the default runtime.

Since I use a configuration file, I just added default-runtime: nvidia to /etc/rancher/k3s/config.yaml and restarted the k3s-agent systemd service (and containerd, just to be sure). No other modification to the containerd configuration was necessary.
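
For reference, a minimal sketch of that setup (file path and key from the comment above; the service name can differ between k3s server and agent installs):

# /etc/rancher/k3s/config.yaml
default-runtime: nvidia

# then restart the agent so k3s regenerates its containerd config
sudo systemctl restart k3s-agent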

I had to delete the nvidia-device-plugin-XXXXX pod that was in CrashLoopBackOff, and then the extended resource nvidia.com/gpu was correctly applied to my node.

SylChamber avatar Mar 03 '25 17:03 SylChamber

I patched the nvdp-nvidia-device-plugin daemonset with the following command:

kubectl -n nvidia-device-plugin patch ds nvdp-nvidia-device-plugin \
  --type='json' \
  -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--device-discovery-strategy=tegra"]}]'

This is equivalent to manually setting the device discovery strategy to tegra. My GPU is a 4090.

Awesome!

yczZz avatar Sep 23 '25 09:09 yczZz

I still have some questions:

  1. If I do not set the value of default_runtime_name in containerd to nvidia, a pod cannot request a GPU.

  2. If I set the value of default_runtime_name in containerd to nvidia, will pods that only use the CPU also run under NVIDIA's runtime? Will this affect pods that use the GPU?

I think this is a rather serious problem.

ZYWNB666 avatar Sep 24 '25 08:09 ZYWNB666


It seems I have solved my own problem. If you installed the k8s-device-plugin from a YAML file, you need to create a RuntimeClass manually.

If default_runtime_name is not set to nvidia, runc is used by default, and the GPU's dependencies (driver libraries and device nodes) are never injected into the container. However, setting it to nvidia routes CPU-only workloads through the NVIDIA runtime as well.

So just create a new RuntimeClass, and for any deployment that needs the GPU, add spec.runtimeClassName: nvidia to its YAML.

The following is how to create the RuntimeClass:

apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia

This is an example Pod YAML:

apiVersion: v1
kind: Pod
metadata:
  name: gpu-test
spec:
  runtimeClassName: nvidia
  restartPolicy: Never
  nodeSelector:
    kubernetes.io/hostname: aigc-k8s-testnode
  containers:
  - name: gpu-test
    image: nvidia/cuda:12.3.2-base-ubuntu20.04
    command: ["sleep", "infinity"]
    resources:
      limits:
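        # nvidia.com/t4 is a renamed GPU resource specific to this cluster's
        # device-plugin config; the plugin's default resource name is nvidia.com/gpu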
        nvidia.com/t4: 1

ZYWNB666 avatar Sep 24 '25 09:09 ZYWNB666


Sorry, but I didn't quite catch your answer. Since you already set the default runtime class to nvidia to make the nvidia device plugin work, the default Kubernetes runtime class is now nvidia, so you should explicitly set runc as the runtimeClass for those CPU-only pods. Am I right?

Kevinkevin189 avatar Oct 18 '25 16:10 Kevinkevin189


Sorry, but I didn't quite catch your answer. Since you already set the default runtime class to nvidia to make the nvidia device plugin work, the default Kubernetes runtime class is now nvidia, so you should explicitly set runc as the runtimeClass for those CPU-only pods. Am I right?

Yes, that's correct. The reason is that I'm not sure whether running a CPU-heavy pod under NVIDIA's runtime would have any impact on the GPU.
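
For completeness, a minimal sketch of that explicit runc RuntimeClass (mirroring the nvidia one above; it assumes the containerd runtime entry is named runc, as in the config at the top of this thread):

apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: runc
handler: runc

CPU-only pods can then set spec.runtimeClassName: runc explicitly.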

ZYWNB666 avatar Oct 18 '25 16:10 ZYWNB666