nvidia-docker icon indicating copy to clipboard operation
nvidia-docker copied to clipboard

Cannot kill container running with failed GPU

Open cswinter opened this issue 4 years ago • 0 comments

1. Issue or feature description

I have a container running with nvidia-docker. The GPU linked to the container entered some error state that caused it to drop off the bus. The container is still alive, and docker kill simply hangs forever and fails to kill the container. The same is true of calling into the docker API directly.

2. Steps to reproduce the issue

  1. Run a container linked to a GPU
  2. Make the GPU drop off the bus

3. Information to attach (optional if deemed irrelevant)

  • [x] Some nvidia-container information: nvidia-container-cli -k -d /dev/tty info
I1005 15:16:32.476111 2560705 nvc.c:372] initializing library context (version=1.4.0, build=704a698b7a0ceec07a48e56c37365c741718c2df)
I1005 15:16:32.476155 2560705 nvc.c:346] using root /
I1005 15:16:32.476163 2560705 nvc.c:347] using ldcache /etc/ld.so.cache
I1005 15:16:32.476168 2560705 nvc.c:348] using unprivileged user 1000:1000
I1005 15:16:32.476191 2560705 nvc.c:389] attempting to load dxcore to see if we are running under Windows Subsystem for Linux (WSL)
I1005 15:16:32.476302 2560705 nvc.c:391] dxcore initialization failed, continuing assuming a non-WSL environment
W1005 15:16:32.478854 2560706 nvc.c:269] failed to set inheritable capabilities
W1005 15:16:32.478914 2560706 nvc.c:270] skipping kernel modules load due to failure
I1005 15:16:32.479156 2560707 driver.c:101] starting driver service
I1005 15:16:32.481821 2560705 nvc_info.c:676] requesting driver information with ''
I1005 15:16:32.482808 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvoptix.so.465.19.01
I1005 15:16:32.482854 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-tls.so.465.19.01
I1005 15:16:32.482884 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-rtcore.so.465.19.01
I1005 15:16:32.482915 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.465.19.01
I1005 15:16:32.482955 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-opticalflow.so.465.19.01
I1005 15:16:32.482992 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-opencl.so.465.19.01
I1005 15:16:32.483025 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-ngx.so.465.19.01
I1005 15:16:32.483054 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.465.19.01
I1005 15:16:32.483091 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-ifr.so.465.19.01
I1005 15:16:32.483134 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-glvkspirv.so.465.19.01
I1005 15:16:32.483161 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-glsi.so.465.19.01
I1005 15:16:32.483188 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-glcore.so.465.19.01
I1005 15:16:32.483215 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-fbc.so.465.19.01
I1005 15:16:32.483250 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-encode.so.465.19.01
I1005 15:16:32.483285 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-eglcore.so.465.19.01
I1005 15:16:32.483312 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-compiler.so.465.19.01
I1005 15:16:32.483342 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-cfg.so.465.19.01
I1005 15:16:32.483382 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-cbl.so.465.19.01
I1005 15:16:32.483412 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvidia-allocator.so.465.19.01
I1005 15:16:32.483451 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libnvcuvid.so.465.19.01
I1005 15:16:32.483664 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libcuda.so.465.19.01
I1005 15:16:32.483791 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libGLX_nvidia.so.465.19.01
I1005 15:16:32.483818 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libGLESv2_nvidia.so.465.19.01
I1005 15:16:32.483843 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libGLESv1_CM_nvidia.so.465.19.01
I1005 15:16:32.483870 2560705 nvc_info.c:169] selecting /usr/lib/x86_64-linux-gnu/libEGL_nvidia.so.465.19.01
I1005 15:16:32.483910 2560705 nvc_info.c:171] skipping /usr/lib/i386-linux-gnu/libnvidia-ptxjitcompiler.so.390.116
I1005 15:16:32.483937 2560705 nvc_info.c:171] skipping /usr/lib/i386-linux-gnu/libnvidia-opencl.so.390.116
I1005 15:16:32.483963 2560705 nvc_info.c:171] skipping /usr/lib/i386-linux-gnu/libnvidia-ml.so.390.116
I1005 15:16:32.483988 2560705 nvc_info.c:171] skipping /usr/lib/i386-linux-gnu/libnvidia-fbc.so.390.116
I1005 15:16:32.484012 2560705 nvc_info.c:171] skipping /usr/lib/i386-linux-gnu/libnvidia-fatbinaryloader.so.390.116
I1005 15:16:32.484035 2560705 nvc_info.c:171] skipping /usr/lib/i386-linux-gnu/libnvidia-encode.so.390.116
I1005 15:16:32.484059 2560705 nvc_info.c:171] skipping /usr/lib/i386-linux-gnu/libnvidia-compiler.so.390.116
I1005 15:16:32.484084 2560705 nvc_info.c:171] skipping /usr/lib/i386-linux-gnu/libnvcuvid.so.390.116
I1005 15:16:32.484119 2560705 nvc_info.c:171] skipping /usr/lib/i386-linux-gnu/libcuda.so.390.116
W1005 15:16:32.484136 2560705 nvc_info.c:350] missing library libnvidia-nscq.so
W1005 15:16:32.484141 2560705 nvc_info.c:350] missing library libnvidia-fatbinaryloader.so
W1005 15:16:32.484148 2560705 nvc_info.c:350] missing library libvdpau_nvidia.so
W1005 15:16:32.484155 2560705 nvc_info.c:354] missing compat32 library libnvidia-ml.so
W1005 15:16:32.484161 2560705 nvc_info.c:354] missing compat32 library libnvidia-cfg.so
W1005 15:16:32.484168 2560705 nvc_info.c:354] missing compat32 library libnvidia-nscq.so
W1005 15:16:32.484175 2560705 nvc_info.c:354] missing compat32 library libcuda.so
W1005 15:16:32.484181 2560705 nvc_info.c:354] missing compat32 library libnvidia-opencl.so
W1005 15:16:32.484186 2560705 nvc_info.c:354] missing compat32 library libnvidia-ptxjitcompiler.so
W1005 15:16:32.484191 2560705 nvc_info.c:354] missing compat32 library libnvidia-fatbinaryloader.so
W1005 15:16:32.484196 2560705 nvc_info.c:354] missing compat32 library libnvidia-allocator.so
W1005 15:16:32.484201 2560705 nvc_info.c:354] missing compat32 library libnvidia-compiler.so
W1005 15:16:32.484205 2560705 nvc_info.c:354] missing compat32 library libnvidia-ngx.so
W1005 15:16:32.484213 2560705 nvc_info.c:354] missing compat32 library libvdpau_nvidia.so
W1005 15:16:32.484220 2560705 nvc_info.c:354] missing compat32 library libnvidia-encode.so
W1005 15:16:32.484225 2560705 nvc_info.c:354] missing compat32 library libnvidia-opticalflow.so
W1005 15:16:32.484231 2560705 nvc_info.c:354] missing compat32 library libnvcuvid.so
W1005 15:16:32.484236 2560705 nvc_info.c:354] missing compat32 library libnvidia-eglcore.so
W1005 15:16:32.484245 2560705 nvc_info.c:354] missing compat32 library libnvidia-glcore.so
W1005 15:16:32.484249 2560705 nvc_info.c:354] missing compat32 library libnvidia-tls.so
W1005 15:16:32.484256 2560705 nvc_info.c:354] missing compat32 library libnvidia-glsi.so
W1005 15:16:32.484261 2560705 nvc_info.c:354] missing compat32 library libnvidia-fbc.so
W1005 15:16:32.484268 2560705 nvc_info.c:354] missing compat32 library libnvidia-ifr.so
W1005 15:16:32.484273 2560705 nvc_info.c:354] missing compat32 library libnvidia-rtcore.so
W1005 15:16:32.484279 2560705 nvc_info.c:354] missing compat32 library libnvoptix.so
W1005 15:16:32.484287 2560705 nvc_info.c:354] missing compat32 library libGLX_nvidia.so
W1005 15:16:32.484291 2560705 nvc_info.c:354] missing compat32 library libEGL_nvidia.so
W1005 15:16:32.484299 2560705 nvc_info.c:354] missing compat32 library libGLESv2_nvidia.so
W1005 15:16:32.484304 2560705 nvc_info.c:354] missing compat32 library libGLESv1_CM_nvidia.so
W1005 15:16:32.484311 2560705 nvc_info.c:354] missing compat32 library libnvidia-glvkspirv.so
W1005 15:16:32.484315 2560705 nvc_info.c:354] missing compat32 library libnvidia-cbl.so
I1005 15:16:32.484709 2560705 nvc_info.c:276] selecting /usr/bin/nvidia-smi
I1005 15:16:32.484725 2560705 nvc_info.c:276] selecting /usr/bin/nvidia-debugdump
I1005 15:16:32.484740 2560705 nvc_info.c:276] selecting /usr/bin/nvidia-persistenced
I1005 15:16:32.484763 2560705 nvc_info.c:276] selecting /usr/bin/nvidia-cuda-mps-control
I1005 15:16:32.484775 2560705 nvc_info.c:276] selecting /usr/bin/nvidia-cuda-mps-server
W1005 15:16:32.484835 2560705 nvc_info.c:376] missing binary nv-fabricmanager
I1005 15:16:32.484853 2560705 nvc_info.c:438] listing device /dev/nvidiactl
I1005 15:16:32.484857 2560705 nvc_info.c:438] listing device /dev/nvidia-uvm
I1005 15:16:32.484864 2560705 nvc_info.c:438] listing device /dev/nvidia-uvm-tools
I1005 15:16:32.484869 2560705 nvc_info.c:438] listing device /dev/nvidia-modeset
I1005 15:16:32.484893 2560705 nvc_info.c:317] listing ipc /run/nvidia-persistenced/socket
W1005 15:16:32.484911 2560705 nvc_info.c:321] missing ipc /var/run/nvidia-fabricmanager/socket
W1005 15:16:32.484922 2560705 nvc_info.c:321] missing ipc /tmp/nvidia-mps
I1005 15:16:32.484926 2560705 nvc_info.c:733] requesting device information with ''
nvidia-container-cli: detection error: nvml error: unknown error
I1005 15:16:32.485234 2560705 nvc.c:423] shutting down library context
I1005 15:16:32.486002 2560707 driver.c:163] terminating driver service
I1005 15:16:32.486418 2560705 driver.c:203] driver service terminated successfully
  • [x] Kernel version from uname -a

Linux Nexus 5.4.0-86-generic #97-Ubuntu SMP Fri Sep 17 19:19:40 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux

  • [x] Any relevant kernel output lines from dmesg
[1084456.074554] NVRM: GPU at PCI:0000:09:00: GPU-26fba6b2-9e52-006e-68db-4ce18382a5ea
[1084456.074559] NVRM: GPU Board Serial Number:
[1084456.074562] NVRM: Xid (PCI:0000:09:00): 79, pid=0, GPU has fallen off the bus.
[1084456.074566] NVRM: GPU 0000:09:00.0: GPU has fallen off the bus.
[1084456.074567] NVRM: GPU 0000:09:00.0: GPU is on Board .
[1084456.074591] NVRM: A GPU crash dump has been created. If possible, please run
                 NVRM: nvidia-bug-report.sh as root to collect this data before
                 NVRM: the NVIDIA kernel module is unloaded.
[1084456.746765] nvidia-gpu 0000:09:00.3: Refused to change power state, currently in D3
[1084456.807851] xhci_hcd 0000:09:00.2: Refused to change power state, currently in D3
[1084456.886514] xhci_hcd 0000:09:00.2: Refused to change power state, currently in D3
[1084456.886523] xhci_hcd 0000:09:00.2: Controller not ready at resume -19
[1084456.886524] xhci_hcd 0000:09:00.2: PCI post-resume error -19!
[1084456.886527] xhci_hcd 0000:09:00.2: HC died; cleaning up
[1084457.868993] nvidia-gpu 0000:09:00.3: i2c timeout error ffffffff
[1084457.868999] ucsi_ccg 1-0008: i2c_transfer failed -110
[1086487.613065] pcieport 0000:00:01.1: AER: Corrected error received: 0000:00:00.0
[1086487.613071] pcieport 0000:00:01.1: AER: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Receiver ID)
[1086487.613074] pcieport 0000:00:01.1: AER:   device [1022:1453] error status/mask=00000040/00006000
[1086487.613075] pcieport 0000:00:01.1: AER:    [ 6] BadTLP
[1086886.297985] br-c0ec5dd4a4d1: port 2(veth372490a) entered disabled state
[1086886.298302] veth62eab79: renamed from eth0
[1086886.378764] br-c0ec5dd4a4d1: port 2(veth372490a) entered disabled state
[1086886.424116] device veth372490a left promiscuous mode
[1086886.424122] br-c0ec5dd4a4d1: port 2(veth372490a) entered disabled state
[1088629.432059] br-c0ec5dd4a4d1: port 4(vetha0beed6) entered disabled state
[1088629.432189] veth6805632: renamed from eth0
[1088629.488366] br-c0ec5dd4a4d1: port 4(vetha0beed6) entered disabled state
[1088629.511279] device vetha0beed6 left promiscuous mode
[1088629.511282] br-c0ec5dd4a4d1: port 4(vetha0beed6) entered disabled state
[1089057.661775] pcieport 0000:00:01.1: AER: Corrected error received: 0000:00:00.0
[1089057.661783] pcieport 0000:00:01.1: AER: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Receiver ID)
[1089057.661788] pcieport 0000:00:01.1: AER:   device [1022:1453] error status/mask=00000040/00006000
[1089057.661791] pcieport 0000:00:01.1: AER:    [ 6] BadTLP
[1091659.501703] br-c0ec5dd4a4d1: port 6(veth9e3fed1) entered disabled state
[1091659.501884] vethfb23d27: renamed from eth0
[1091659.565201] br-c0ec5dd4a4d1: port 6(veth9e3fed1) entered disabled state
[1091659.585728] device veth9e3fed1 left promiscuous mode
[1091659.585736] br-c0ec5dd4a4d1: port 6(veth9e3fed1) entered disabled state
[1091865.568618] br-c0ec5dd4a4d1: port 8(vethd911b0c) entered disabled state
[1091865.568908] veth0dc21df: renamed from eth0
[1091865.618992] br-c0ec5dd4a4d1: port 8(vethd911b0c) entered disabled state
[1091865.637512] device vethd911b0c left promiscuous mode
[1091865.637516] br-c0ec5dd4a4d1: port 8(vethd911b0c) entered disabled state
[1094086.994722] pcieport 0000:00:01.1: AER: Corrected error received: 0000:00:00.0
[1094086.994727] pcieport 0000:00:01.1: AER: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Receiver ID)
[1094086.994732] pcieport 0000:00:01.1: AER:   device [1022:1453] error status/mask=00000040/00006000
[1094086.994734] pcieport 0000:00:01.1: AER:    [ 6] BadTLP
[1096065.301987] pcieport 0000:00:01.1: AER: Corrected error received: 0000:00:00.0
[1096065.301994] pcieport 0000:00:01.1: AER: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID)
[1096065.301999] pcieport 0000:00:01.1: AER:   device [1022:1453] error status/mask=00001000/00006000
[1096065.302002] pcieport 0000:00:01.1: AER:    [12] Timeout
[1100712.135957] pcieport 0000:00:01.1: AER: Corrected error received: 0000:00:00.0
[1100712.135963] pcieport 0000:00:01.1: AER: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID)
[1100712.135968] pcieport 0000:00:01.1: AER:   device [1022:1453] error status/mask=00001000/00006000
[1100712.135971] pcieport 0000:00:01.1: AER:    [12] Timeout
[1100747.816310] pcieport 0000:00:01.1: AER: Corrected error received: 0000:00:00.0
[1100747.816316] pcieport 0000:00:01.1: AER: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID)
[1100747.816321] pcieport 0000:00:01.1: AER:   device [1022:1453] error status/mask=00001000/00006000
[1100747.816324] pcieport 0000:00:01.1: AER:    [12] Timeout
[1104707.662123] pcieport 0000:00:01.1: AER: Corrected error received: 0000:00:00.0
[1104707.662129] pcieport 0000:00:01.1: AER: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID)
[1104707.662133] pcieport 0000:00:01.1: AER:   device [1022:1453] error status/mask=00001000/00006000
[1104707.662136] pcieport 0000:00:01.1: AER:    [12] Timeout
[1106160.329977] pcieport 0000:00:01.1: AER: Corrected error received: 0000:00:00.0
[1106160.329983] pcieport 0000:00:01.1: AER: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Receiver ID)
[1106160.329987] pcieport 0000:00:01.1: AER:   device [1022:1453] error status/mask=00000080/00006000
[1106160.329990] pcieport 0000:00:01.1: AER:    [ 7] BadDLLP
[1109655.621167] pcieport 0000:00:01.1: AER: Corrected error received: 0000:00:00.0
[1109655.621173] pcieport 0000:00:01.1: AER: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID)
[1109655.621178] pcieport 0000:00:01.1: AER:   device [1022:1453] error status/mask=00001000/00006000
[1109655.621181] pcieport 0000:00:01.1: AER:    [12] Timeout
  • [x] Driver information from nvidia-smi -a

nvidia-smi -a -i 0

Unable to determine the device handle for GPU 0000:09:00.0: Unknown Error

nvidia-smi -a -i 1

==============NVSMI LOG==============

Timestamp                                 : Tue Oct  5 08:18:21 2021
Driver Version                            : 465.19.01
CUDA Version                              : 11.3

Attached GPUs                             : 3
GPU 00000000:0A:00.0
    Product Name                          : NVIDIA GeForce RTX 2080 Ti
    Product Brand                         : GeForce
    Display Mode                          : Disabled
    Display Active                        : Disabled
    Persistence Mode                      : Enabled
    MIG Mode
        Current                           : N/A
        Pending                           : N/A
    Accounting Mode                       : Disabled
    Accounting Mode Buffer Size           : 4000
    Driver Model
        Current                           : N/A
        Pending                           : N/A
    Serial Number                         : N/A
    GPU UUID                              : GPU-65a4f66e-21ab-9843-1dfd-36cde5b6417f
    Minor Number                          : 1
    VBIOS Version                         : 90.02.17.00.8E
    MultiGPU Board                        : No
    Board ID                              : 0xa00
    GPU Part Number                       : N/A
    Inforom Version
        Image Version                     : G001.0000.02.04
        OEM Object                        : 1.1
        ECC Object                        : N/A
        Power Management Object           : N/A
    GPU Operation Mode
        Current                           : N/A
        Pending                           : N/A
    GPU Virtualization Mode
        Virtualization Mode               : None
        Host VGPU Mode                    : N/A
    IBMNPU
        Relaxed Ordering Mode             : N/A
    PCI
        Bus                               : 0x0A
        Device                            : 0x00
        Domain                            : 0x0000
        Device Id                         : 0x1E0410DE
        Bus Id                            : 00000000:0A:00.0
        Sub System Id                     : 0x22813842
        GPU Link Info
            PCIe Generation
                Max                       : 3
                Current                   : 1
            Link Width
                Max                       : 16x
                Current                   : 16x
        Bridge Chip
            Type                          : N/A
            Firmware                      : N/A
        Replays Since Reset               : 0
        Replay Number Rollovers           : 0
        Tx Throughput                     : 0 KB/s
        Rx Throughput                     : 0 KB/s
    Fan Speed                             : 0 %
    Performance State                     : P8
    Clocks Throttle Reasons
        Idle                              : Active
        Applications Clocks Setting       : Not Active
        SW Power Cap                      : Not Active
        HW Slowdown                       : Not Active
            HW Thermal Slowdown           : Not Active
            HW Power Brake Slowdown       : Not Active
        Sync Boost                        : Not Active
        SW Thermal Slowdown               : Not Active
        Display Clock Setting             : Not Active
    FB Memory Usage
        Total                             : 11019 MiB
        Used                              : 10 MiB
        Free                              : 11009 MiB
    BAR1 Memory Usage
        Total                             : 256 MiB
        Used                              : 4 MiB
        Free                              : 252 MiB
    Compute Mode                          : Default
    Utilization
        Gpu                               : 0 %
        Memory                            : 0 %
        Encoder                           : 0 %
        Decoder                           : 0 %
    Encoder Stats
        Active Sessions                   : 0
        Average FPS                       : 0
        Average Latency                   : 0
    FBC Stats
        Active Sessions                   : 0
        Average FPS                       : 0
        Average Latency                   : 0
    Ecc Mode
        Current                           : N/A
        Pending                           : N/A
    ECC Errors
        Volatile
            SRAM Correctable              : N/A
            SRAM Uncorrectable            : N/A
            DRAM Correctable              : N/A
            DRAM Uncorrectable            : N/A
        Aggregate
            SRAM Correctable              : N/A
            SRAM Uncorrectable            : N/A
            DRAM Correctable              : N/A
            DRAM Uncorrectable            : N/A
    Retired Pages
        Single Bit ECC                    : N/A
        Double Bit ECC                    : N/A
        Pending Page Blacklist            : N/A
    Remapped Rows                         : N/A
    Temperature
        GPU Current Temp                  : 35 C
        GPU Shutdown Temp                 : 94 C
        GPU Slowdown Temp                 : 91 C
        GPU Max Operating Temp            : 89 C
        GPU Target Temperature            : 84 C
        Memory Current Temp               : N/A
        Memory Max Operating Temp         : N/A
    Power Readings
        Power Management                  : Supported
        Power Draw                        : 2.35 W
        Power Limit                       : 250.00 W
        Default Power Limit               : 250.00 W
        Enforced Power Limit              : 250.00 W
        Min Power Limit                   : 100.00 W
        Max Power Limit                   : 280.00 W
    Clocks
        Graphics                          : 300 MHz
        SM                                : 300 MHz
        Memory                            : 405 MHz
        Video                             : 540 MHz
    Applications Clocks
        Graphics                          : N/A
        Memory                            : N/A
    Default Applications Clocks
        Graphics                          : N/A
        Memory                            : N/A
    Max Clocks
        Graphics                          : 2100 MHz
        SM                                : 2100 MHz
        Memory                            : 7000 MHz
        Video                             : 1950 MHz
    Max Customer Boost Clocks
        Graphics                          : N/A
    Clock Policy
        Auto Boost                        : N/A
        Auto Boost Default                : N/A
    Processes
        GPU instance ID                   : N/A
        Compute instance ID               : N/A
        Process ID                        : 2330
            Type                          : G
            Name                          : /usr/lib/xorg/Xorg
            Used GPU Memory               : 4 MiB
        GPU instance ID                   : N/A
        Compute instance ID               : N/A
        Process ID                        : 931071
            Type                          : G
            Name                          : /usr/lib/xorg/Xorg
            Used GPU Memory               : 4 MiB
  • [x] Docker version from docker version
Client: Docker Engine - Community
 Version:           20.10.6
 API version:       1.41
 Go version:        go1.13.15
 Git commit:        370c289
 Built:             Fri Apr  9 22:46:01 2021
 OS/Arch:           linux/amd64
 Context:           default
 Experimental:      true

Server: Docker Engine - Community
 Engine:
  Version:          20.10.6
  API version:      1.41 (minimum version 1.12)
  Go version:       go1.13.15
  Git commit:       8728dd2
  Built:            Fri Apr  9 22:44:13 2021
  OS/Arch:          linux/amd64
  Experimental:     false
 containerd:
  Version:          1.4.4
  GitCommit:        05f951a3781f4f2c1911b05e61c160e9c30eaa8e
 runc:
  Version:          1.0.0-rc93
  GitCommit:        12644e614e25b05da6fd08a38ffa0cfe1903fdec
 docker-init:
  Version:          0.19.0
  GitCommit:        de40ad0
  • [x] NVIDIA packages version from dpkg -l '*nvidia*' or rpm -qa '*nvidia*'
Desired=Unknown/Install/Remove/Purge/Hold
| Status=Not/Inst/Conf-files/Unpacked/halF-conf/Half-inst/trig-aWait/Trig-pend
|/ Err?=(none)/Reinst-required (Status,Err: uppercase=bad)
||/ Name                             Version                     Architecture Description
+++-================================-===========================-============-=========================================================
un  libgldispatch0-nvidia            <none>                      <none>       (no description available)
ii  libnvidia-cfg1-465:amd64         465.19.01-0ubuntu1          amd64        NVIDIA binary OpenGL/GLX configuration library
un  libnvidia-cfg1-any               <none>                      <none>       (no description available)
un  libnvidia-common                 <none>                      <none>       (no description available)
ii  libnvidia-common-465             465.19.01-0ubuntu1          all          Shared files used by the NVIDIA libraries
rc  libnvidia-compute-390:i386       390.116-0ubuntu0.18.04.1    i386         NVIDIA libcompute package
rc  libnvidia-compute-418:amd64      430.64-0ubuntu0~gpu18.04.1  amd64        Transitional package for libnvidia-compute-430
un  libnvidia-compute-430            <none>                      <none>       (no description available)
rc  libnvidia-compute-450:amd64      450.102.04-0ubuntu0.18.04.1 amd64        NVIDIA libcompute package
ii  libnvidia-compute-465:amd64      465.19.01-0ubuntu1          amd64        NVIDIA libcompute package
ii  libnvidia-container-tools        1.4.0-1                     amd64        NVIDIA container runtime library (command-line tools)
ii  libnvidia-container1:amd64       1.4.0-1                     amd64        NVIDIA container runtime library
un  libnvidia-decode                 <none>                      <none>       (no description available)
ii  libnvidia-decode-465:amd64       465.19.01-0ubuntu1          amd64        NVIDIA Video Decoding runtime libraries
un  libnvidia-encode                 <none>                      <none>       (no description available)
ii  libnvidia-encode-465:amd64       465.19.01-0ubuntu1          amd64        NVENC Video Encoding runtime library
un  libnvidia-extra                  <none>                      <none>       (no description available)
ii  libnvidia-extra-465:amd64        465.19.01-0ubuntu1          amd64        Extra libraries for the NVIDIA driver
un  libnvidia-fbc1                   <none>                      <none>       (no description available)
ii  libnvidia-fbc1-465:amd64         465.19.01-0ubuntu1          amd64        NVIDIA OpenGL-based Framebuffer Capture runtime library
un  libnvidia-gl                     <none>                      <none>       (no description available)
ii  libnvidia-gl-465:amd64           465.19.01-0ubuntu1          amd64        NVIDIA OpenGL/GLX/EGL/GLES GLVND libraries and Vulkan ICD
un  libnvidia-ifr1                   <none>                      <none>       (no description available)
ii  libnvidia-ifr1-465:amd64         465.19.01-0ubuntu1          amd64        NVIDIA OpenGL-based Inband Frame Readback runtime library
un  libnvidia-ml1                    <none>                      <none>       (no description available)
un  nvidia-384                       <none>                      <none>       (no description available)
un  nvidia-390                       <none>                      <none>       (no description available)
un  nvidia-common                    <none>                      <none>       (no description available)
ii  nvidia-compute-utils-465         465.19.01-0ubuntu1          amd64        NVIDIA compute utilities
ii  nvidia-container-runtime         3.5.0-1                     amd64        NVIDIA container runtime
un  nvidia-container-runtime-hook    <none>                      <none>       (no description available)
ii  nvidia-container-toolkit         1.5.0-1                     amd64        NVIDIA container runtime hook
ii  nvidia-dkms-465                  465.19.01-0ubuntu1          amd64        NVIDIA DKMS package
un  nvidia-dkms-kernel               <none>                      <none>       (no description available)
un  nvidia-docker                    <none>                      <none>       (no description available)
ii  nvidia-docker2                   2.6.0-1                     all          nvidia-docker CLI wrapper
ii  nvidia-driver-465                465.19.01-0ubuntu1          amd64        NVIDIA driver metapackage
un  nvidia-driver-binary             <none>                      <none>       (no description available)
un  nvidia-kernel-common             <none>                      <none>       (no description available)
ii  nvidia-kernel-common-465         465.19.01-0ubuntu1          amd64        Shared files used with the kernel module
un  nvidia-kernel-source             <none>                      <none>       (no description available)
ii  nvidia-kernel-source-465         465.19.01-0ubuntu1          amd64        NVIDIA kernel source package
un  nvidia-legacy-304xx-vdpau-driver <none>                      <none>       (no description available)
un  nvidia-legacy-340xx-vdpau-driver <none>                      <none>       (no description available)
un  nvidia-libopencl1-dev            <none>                      <none>       (no description available)
ii  nvidia-modprobe                  465.19.01-0ubuntu1          amd64        Load the NVIDIA kernel driver and create device files
un  nvidia-opencl-icd                <none>                      <none>       (no description available)
un  nvidia-persistenced              <none>                      <none>       (no description available)
ii  nvidia-prime                     0.8.16~0.20.04.1            all          Tools to enable NVIDIA's Prime
ii  nvidia-settings                  470.57.01-0ubuntu0.20.04.1  amd64        Tool for configuring the NVIDIA graphics driver
un  nvidia-settings-binary           <none>                      <none>       (no description available)
un  nvidia-smi                       <none>                      <none>       (no description available)
un  nvidia-utils                     <none>                      <none>       (no description available)
ii  nvidia-utils-465                 465.19.01-0ubuntu1          amd64        NVIDIA driver support binaries
un  nvidia-vdpau-driver              <none>                      <none>       (no description available)
ii  xserver-xorg-video-nvidia-465    465.19.01-0ubuntu1          amd64        NVIDIA binary Xorg driver
  • [x] NVIDIA container library version from nvidia-container-cli -V
version: 1.4.0
build date: 2021-04-24T14:25+00:00
build revision: 704a698b7a0ceec07a48e56c37365c741718c2df
build compiler: x86_64-linux-gnu-gcc-7 7.5.0
build platform: x86_64
build flags: -D_GNU_SOURCE -D_FORTIFY_SOURCE=2 -DNDEBUG -std=gnu11 -O2 -g -fdata-sections -ffunction-sections -fstack-protector -fno-strict-aliasing -fvisibility=hidden -Wall -Wextra -Wcast-align -Wpointer-arith -Wmissing-prototypes -Wnonnull -Wwrite-strings -Wlogical-op -Wformat=2 -Wmissing-format-attribute -Winit-self -Wshadow -Wstrict-prototypes -Wunreachable-code -Wconversion -Wsign-conversion -Wno-unknown-warning-option -Wno-format-extra-args -Wno-gnu-alignof-expression -Wl,-zrelro -Wl,-znow -Wl,-zdefs -Wl,--gc-sections
  • [ ] NVIDIA container library logs (see troubleshooting)
  • [x] Docker command, image and tag used

Output of docker inspect for the container. The image is based on nvcr.io/nvidia/pytorch:21.03-py3.

[
    {
        "Id": "957d21a0cba38ebee1020b5e5b451456d80617816ee1e7d4f5d578793ecfc7de",
        "Created": "2021-10-05T07:49:53.797531246Z",
        "Path": "/usr/local/bin/nvidia_entrypoint.sh",
        "Args": [
            "python",
            "fine_tune_step.py",
            "--local=False"
        ],
        "State": {
            "Status": "running",
            "Running": true,
            "Paused": false,
            "Restarting": false,
            "OOMKilled": false,
            "Dead": false,
            "Pid": 2230946,
            "ExitCode": 0,
            "Error": "",
            "StartedAt": "2021-10-05T07:49:54.568132913Z",
            "FinishedAt": "0001-01-01T00:00:00Z"
        },
        "Image": "sha256:f181827ebc7ebedf053ff890ed7369e2d1e41d4bd6dba80008b51065e666bda7",
        "ResolvConfPath": "/var/lib/docker/containers/957d21a0cba38ebee1020b5e5b451456d80617816ee1e7d4f5d578793ecfc7de/resolv.conf",
        "HostnamePath": "/var/lib/docker/containers/957d21a0cba38ebee1020b5e5b451456d80617816ee1e7d4f5d578793ecfc7de/hostname",
        "HostsPath": "/var/lib/docker/containers/957d21a0cba38ebee1020b5e5b451456d80617816ee1e7d4f5d578793ecfc7de/hosts",
        "LogPath": "/var/lib/docker/containers/957d21a0cba38ebee1020b5e5b451456d80617816ee1e7d4f5d578793ecfc7de/957d21a0cba38ebee1020b5e5b451456d80617816ee1e7d4f5d578793ecfc7de-json.log",
        "Name": "/xprun.7375562f0b84409f86024c62a1520eb2.main-0",
        "RestartCount": 0,
        "Driver": "overlay2",
        "Platform": "linux",
        "MountLabel": "",
        "ProcessLabel": "",
        "AppArmorProfile": "docker-default",
        "ExecIDs": null,
        "HostConfig": {
            "Binds": [
                "<REDACTED>"
            ],
            "ContainerIDFile": "",
            "LogConfig": {
                "Type": "json-file",
                "Config": {}
            },
            "NetworkMode": "xprun.shared_network",
            "PortBindings": {},
            "RestartPolicy": {
                "Name": "no",
                "MaximumRetryCount": 0
            },
            "AutoRemove": false,
            "VolumeDriver": "",
            "VolumesFrom": null,
            "CapAdd": null,
            "CapDrop": null,
            "CgroupnsMode": "host",
            "Dns": [],
            "DnsOptions": [],
            "DnsSearch": [],
            "ExtraHosts": null,
            "GroupAdd": null,
            "IpcMode": "private",
            "Cgroup": "",
            "Links": null,
            "OomScoreAdj": 0,
            "PidMode": "",
            "Privileged": false,
            "PublishAllPorts": false,
            "ReadonlyRootfs": false,
            "SecurityOpt": null,
            "UTSMode": "",
            "UsernsMode": "",
            "ShmSize": 67108864,
            "Runtime": "runc",
            "ConsoleSize": [
                0,
                0
            ],
            "Isolation": "",
            "CpuShares": 0,
            "Memory": 0,
            "NanoCpus": 0,
            "CgroupParent": "",
            "BlkioWeight": 0,
            "BlkioWeightDevice": [],
            "BlkioDeviceReadBps": null,
            "BlkioDeviceWriteBps": null,
            "BlkioDeviceReadIOps": null,
            "BlkioDeviceWriteIOps": null,
            "CpuPeriod": 0,
            "CpuQuota": 0,
            "CpuRealtimePeriod": 0,
            "CpuRealtimeRuntime": 0,
            "CpusetCpus": "",
            "CpusetMems": "",
            "Devices": [],
            "DeviceCgroupRules": null,
            "DeviceRequests": [
                {
                    "Driver": "",
                    "Count": 0,
                    "DeviceIDs": [
                        "0"
                    ],
                    "Capabilities": [
                        [
                            "gpu"
                        ]
                    ],
                    "Options": {}
                }
            ],
            "KernelMemory": 0,
            "KernelMemoryTCP": 0,
            "MemoryReservation": 0,
            "MemorySwap": 0,
            "MemorySwappiness": null,
            "OomKillDisable": false,
            "PidsLimit": null,
            "Ulimits": null,
            "CpuCount": 0,
            "CpuPercent": 0,
            "IOMaximumIOps": 0,
            "IOMaximumBandwidth": 0,
            "MaskedPaths": [
                "/proc/asound",
                "/proc/acpi",
                "/proc/kcore",
                "/proc/keys",
                "/proc/latency_stats",
                "/proc/timer_list",
                "/proc/timer_stats",
                "/proc/sched_debug",
                "/proc/scsi",
                "/sys/firmware"
            ],
            "ReadonlyPaths": [
                "/proc/bus",
                "/proc/fs",
                "/proc/irq",
                "/proc/sys",
                "/proc/sysrq-trigger"
            ]
        },
        "GraphDriver": {
            "Data": {
                "LowerDir": "/var/lib/docker/overlay2/6e7d6311a0d62485d5acf6e1917c765af7893d784a8f9771453f79a268bdb4f9-init/diff:/var/lib/docker/overlay2/5819376ffcbd175aa05b420b5b600a3aae0d41393a8ccda8b4a2fcf47c976f8b/diff:/var/lib/docker/overlay2/4b2f2c106888b3a2f87fa67fcd721f3eabe29610e48d83b948f58e0b8bf204ac/diff:/var/lib/docker/overlay2/13182d744181d3c22978107837991916fb3c01668db0dfe0a37fbbb4dca73cbf/diff:/var/lib/docker/overlay2/76f7ef2cbe3b183c85a850464837ba31aa248ad0ef811a5fe7bf909bb559723d/diff:/var/lib/docker/overlay2/605bd8d23c19c27771b9f93dabcaba4fccd27fed9a2a2b02a9dba69107b811d1/diff:/var/lib/docker/overlay2/c478a7669dd04bb346b2b6e500abbe23a9ef7719295269c45cb318e7f1e0db13/diff:/var/lib/docker/overlay2/77606d4a9115bb5bf8ac4c6c29f5ad46783363ce801da62d84666d8228b8503b/diff:/var/lib/docker/overlay2/a54cf4f6d1069d8fbabdb64ffa0712adc8b015422b2738b16b3ed9d6867ee14b/diff:/var/lib/docker/overlay2/1876bd7038d458a8933e6025015cf94fdb1e8cfe8e259145790ea7af5687f0f8/diff:/var/lib/docker/overlay2/b13eed5fea5f246facc38476c50bd620b72f56fb411f6668b35ae66da5367ec7/diff:/var/lib/docker/overlay2/89c2a01137487e5af4eabd263bab484aeb50433ad3a640f79a2c1d22f587c653/diff:/var/lib/docker/overlay2/c39394ccfcfb4898d339f560edc03634dbb5988f43526cffbf71a730f838b66f/diff:/var/lib/docker/overlay2/d497703527314f4ad9910998648f9cb7396600ec85159ad1288dbdd721de9edb/diff:/var/lib/docker/overlay2/140022f42fd75ca49556c9716dbac47e1a8344325025fb40eb88ea1bcc2bfb1a/diff:/var/lib/docker/overlay2/c939584fe61ca86fea89d9026e24f15b8497f47ae7772ac6e10dd0ce2c7da5d7/diff:/var/lib/docker/overlay2/a3711f1a6aa63075a598afccdead1e02da579e9a5feb04ab8de5b6ca6074b384/diff:/var/lib/docker/overlay2/3b3f0c59c38410c51a3fadc14f6206f6fa9d5523ccbbbdebfcba9eaf115efd1d/diff:/var/lib/docker/overlay2/32cb0f88346a99b047262fb4e5090afb6741322f028de3dcf1adb02597f5df04/diff:/var/lib/docker/overlay2/317b5e40d9b38a396231c682afdd4eec22930a5e8c9af06d1178216bbf0e2280/diff:/var/lib/docker/overlay2/829d67a87d73fc92b65de0e2c2b36d29b31d9b984
9dee845fad94a1308651970/diff:/var/lib/docker/overlay2/ac9b9381c4f7928826467198044b309dee6a5ca8bcc01eaf93734247b5b77190/diff:/var/lib/docker/overlay2/1848e60786659b383c164f2605c5d7166ffd7bd30f9b94f0d2a72188cd565bed/diff:/var/lib/docker/overlay2/29ac62636cf989d3693481b8d290cefd0176448ef6c66a90c2f3ff34d9768c9b/diff:/var/lib/docker/overlay2/2a0ce43d5282c882e79605da3ac219d1e5ed38b767804d463bcd72d2333decd1/diff:/var/lib/docker/overlay2/e1386c6f98e6270aecbe83d4f97bd56e0431c965ffa173d7f7d8a7c8e500e616/diff:/var/lib/docker/overlay2/e82b78226eb4af73b3bdc2d0f14bbfcbca964bed49ba502d5220f623a903cdba/diff:/var/lib/docker/overlay2/b55f797db7c9d94667e26db3a7b9168834907f8fc12e557a49727754952a4b74/diff:/var/lib/docker/overlay2/3c2d39aceaa118c786d8a255d9bc9bf8034efdd5b2c829c0515e0847fc81b470/diff:/var/lib/docker/overlay2/101780446db783454057727757fb8adb2dd4eacee36dad0debd51f671af43ef9/diff:/var/lib/docker/overlay2/249c7988e07ee63e832fe8a180473a430d1a6746e9c027f742fbb5683453073e/diff:/var/lib/docker/overlay2/df2d105cb8a620ed1928e603cc133d72f5b575b10b11ba8d042d8cab7fbb5b32/diff:/var/lib/docker/overlay2/7053f02fb03c07042c774f44e2d533e59a97bcfe051a0ad17a7264b836839db4/diff:/var/lib/docker/overlay2/8def9c2e62240b377727379bd0f372835cb7d7c34c4bcc87e2f462f3143d6c6b/diff:/var/lib/docker/overlay2/e1bd444ad3650fc209603f09f19555bfb290693867c0b5d065eb5db37462e842/diff:/var/lib/docker/overlay2/8b71a94b0888c4addb9dfd66ed052a87c664472094d690d90f2f2572cd5ab1f3/diff:/var/lib/docker/overlay2/069ca13fc4edaff07bd548dddee38c370b2f990c61e3b3be24ad3473b3c812c8/diff:/var/lib/docker/overlay2/b7f9451ba5d1725186885ae8ef2f52b69d298faa29b845c4e4d8a116eb22f5fb/diff:/var/lib/docker/overlay2/fd5f974595bb9d0029d24fbf8c9f07e3733bce26e26eac5475a0c4ab6e854084/diff:/var/lib/docker/overlay2/532482a57839062c2047083b039b85f6319d2ad9d5c2af7f07af14c96777308e/diff:/var/lib/docker/overlay2/279f4dde7b9855bbba3a8548ef5b8b58c2d1d4444dfacc51dcc34a715901da9c/diff:/var/lib/docker/overlay2/a8edb47beb68cbcdca077d5f3642e887ba790b45aed407
9f6d2163a88fa2004b/diff:/var/lib/docker/overlay2/f18c830ee40c843d7e354708b690654f1619c9b0be3f88ee588d1d28b895cbbc/diff:/var/lib/docker/overlay2/9e4dc0378f9a363c64343abdab944cf25c54d561ecdf392330f5084c25672293/diff:/var/lib/docker/overlay2/0acb192bc3e43e77ce4496553f8351257c259f2c5c8eae029f76ba82d6368fa2/diff:/var/lib/docker/overlay2/69608d804f3812b3c97297a93b5458ee7012346fd5b46c3c05c791f2d86be9d4/diff:/var/lib/docker/overlay2/1fe774e60c5014443325fdf9cf7597078ce09314a95ad395fdfc1317605e5d22/diff:/var/lib/docker/overlay2/6ee1c1194155f4a0a4b0f19ca8f0190594065ccaf9425753bb35ec902ae2268f/diff:/var/lib/docker/overlay2/998236703c0c89a69ee1c3083523af23d255c9fca355b413319be83518ff913e/diff:/var/lib/docker/overlay2/b9a9a8f7e058cf1c77727a02e15ebaa98630e75e2516f7639807a1059ee81b42/diff:/var/lib/docker/overlay2/becf6a7d5305b2ca446259ee2845b363b6f8d3fb801e8beaaaefea1ffd390f81/diff:/var/lib/docker/overlay2/2093f1a53f77f34522bf9c499c05a910cfcce07f4f5cc8f94082d8578e482593/diff:/var/lib/docker/overlay2/4ed1cbe01e7b2e6f8879f47e3918eab4d4f153bea46b55e164540014f2785841/diff:/var/lib/docker/overlay2/8b16232379ec4b65956e0b8e4bca41eb15509153545c2b75c01b86aa2b8e8439/diff:/var/lib/docker/overlay2/15db1d5353cf265ecfe9953894cceea2776215b42764ca380aad0980fe662b37/diff:/var/lib/docker/overlay2/17be0c5c6461c23354f06c8f119fdcae76639ac851e10e1594b4ced1d730e480/diff:/var/lib/docker/overlay2/fc06476921eeb57b02ff5ab236784a9b03ef61b4de80d4af94e77d7a7b26375c/diff:/var/lib/docker/overlay2/f04cf47c0a53c51d2c1ae5d235ceaf8d7b0caab5022f0824c71bbed720d861ff/diff:/var/lib/docker/overlay2/2beca2c21c44c7e0f7e94a622346d0886b1cc32307692f9bc8c8f2752f223c72/diff:/var/lib/docker/overlay2/69085137b5d5f0fd1942e49ea923ef64a917cffa6eef60b0f84f92eff3e9ba86/diff:/var/lib/docker/overlay2/9ecbfe92aca7a64c0cc852ea85575ea9640d6636ce3d07646ff793bfa3cba629/diff:/var/lib/docker/overlay2/8f767afbb5d24b2d55fd2f9f0271ddbc8423406ff66a7642a0e948eec71c3fc4/diff",
                "MergedDir": "/var/lib/docker/overlay2/6e7d6311a0d62485d5acf6e1917c765af7893d784a8f9771453f79a268bdb4f9/merged",
                "UpperDir": "/var/lib/docker/overlay2/6e7d6311a0d62485d5acf6e1917c765af7893d784a8f9771453f79a268bdb4f9/diff",
                "WorkDir": "/var/lib/docker/overlay2/6e7d6311a0d62485d5acf6e1917c765af7893d784a8f9771453f79a268bdb4f9/work"
            },
            "Name": "overlay2"
        },
        "Mounts": [
            "<REDACTED>"
        ],
        "Config": {
            "Hostname": "957d21a0cba3",
            "Domainname": "",
            "User": "",
            "AttachStdin": false,
            "AttachStdout": false,
            "AttachStderr": false,
            "ExposedPorts": {
                "6006/tcp": {},
                "8888/tcp": {}
            },
            "Tty": false,
            "OpenStdin": false,
            "StdinOnce": false,
            "Env": [
                "PATH=/opt/conda/bin:/opt/cmake-3.14.6-Linux-x86_64/bin/:/usr/local/mpi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/tensorrt/bin",
                "CUDA_VERSION=11.2.1.007",
                "CUDA_DRIVER_VERSION=460.32.03",
                "CUDA_CACHE_DISABLE=1",
                "_CUDA_COMPAT_PATH=/usr/local/cuda/compat",
                "ENV=/etc/shinit_v2",
                "BASH_ENV=/etc/bash.bashrc",
                "NVIDIA_REQUIRE_CUDA=cuda>=9.0",
                "NCCL_VERSION=2.8.4",
                "CUBLAS_VERSION=11.4.1.1026",
                "CUFFT_VERSION=10.4.0.135",
                "CURAND_VERSION=10.2.3.135",
                "CUSPARSE_VERSION=11.4.0.135",
                "CUSOLVER_VERSION=11.1.0.135",
                "NPP_VERSION=11.3.2.139",
                "NVJPEG_VERSION=11.4.0.135",
                "CUDNN_VERSION=8.1.1.33",
                "TRT_VERSION=7.2.2.3+cuda11.1.0.024",
                "TRTOSS_VERSION=21.03",
                "NSIGHT_SYSTEMS_VERSION=2020.4.3.7",
                "NSIGHT_COMPUTE_VERSION=2020.3.1.3",
                "DALI_VERSION=0.31.0",
                "DALI_BUILD=2054952",
                "DLPROF_VERSION=21.03",
                "LD_LIBRARY_PATH=/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64",
                "NVIDIA_VISIBLE_DEVICES=all",
                "NVIDIA_DRIVER_CAPABILITIES=compute,utility,video",
                "MOFED_VERSION=5.1-2.3.7",
                "OPENUCX_VERSION=1.9.0",
                "OPENMPI_VERSION=4.0.5",
                "LIBRARY_PATH=/usr/local/cuda/lib64/stubs:",
                "PYTORCH_BUILD_VERSION=1.9.0a0+df837d0",
                "PYTORCH_VERSION=1.9.0a0+df837d0",
                "PYTORCH_BUILD_NUMBER=0",
                "NVIDIA_PYTORCH_VERSION=21.03",
                "NVM_DIR=/usr/local/nvm",
                "JUPYTER_PORT=8888",
                "TENSORBOARD_PORT=6006",
                "TORCH_CUDA_ARCH_LIST=5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX",
                "COCOAPI_VERSION=2.0+nv0.4.0",
                "PYTHONIOENCODING=utf-8",
                "LC_ALL=C.UTF-8",
                "NVIDIA_BUILD_ID=21060478"
            ],
            "Cmd": [
                "python",
                "fine_tune_step.py",
                "--local=False"
            ],
            "Image": "xprun.main:631293ab8ab9943075a2a66c140eedf774583bc97accfadcfd16dae0eb1b3236",
            "Volumes": null,
            "WorkingDir": "/root/NNgamma",
            "Entrypoint": [
                "/usr/local/bin/nvidia_entrypoint.sh"
            ],
            "OnBuild": null,
            "Labels": {
                "com.nvidia.build.id": "21060478",
                "com.nvidia.build.ref": "b8c8e4e4a0105e697e84364fac28ae7d8024e4e5",
                "com.nvidia.cublas.version": "11.4.1.1026",
                "com.nvidia.cuda.version": "9.0",
                "com.nvidia.cudnn.version": "8.1.1.33",
                "com.nvidia.cufft.version": "10.4.0.135",
                "com.nvidia.curand.version": "10.2.3.135",
                "com.nvidia.cusolver.version": "11.1.0.135",
                "com.nvidia.cusparse.version": "11.4.0.135",
                "com.nvidia.nccl.version": "2.8.4",
                "com.nvidia.npp.version": "11.3.2.139",
                "com.nvidia.nsightcompute.version": "2020.3.1.3",
                "com.nvidia.nsightsystems.version": "2020.4.3.7",
                "com.nvidia.nvjpeg.version": "11.4.0.135",
                "com.nvidia.pytorch.version": "1.9.0a0+df837d0",
                "com.nvidia.tensorrt.version": "7.2.2.3+cuda11.1.0.024",
                "com.nvidia.tensorrtoss.version": "21.03",
                "com.nvidia.volumes.needed": "nvidia_driver"
            }
        },
        "NetworkSettings": {
            "Bridge": "",
            "SandboxID": "9b680677e35263117b3195ff6992c4e179c46e16d540db280756ce721f5884cb",
            "HairpinMode": false,
            "LinkLocalIPv6Address": "",
            "LinkLocalIPv6PrefixLen": 0,
            "Ports": {
                "6006/tcp": null,
                "8888/tcp": null
            },
            "SandboxKey": "/var/run/docker/netns/9b680677e352",
            "SecondaryIPAddresses": null,
            "SecondaryIPv6Addresses": null,
            "EndpointID": "",
            "Gateway": "",
            "GlobalIPv6Address": "",
            "GlobalIPv6PrefixLen": 0,
            "IPAddress": "",
            "IPPrefixLen": 0,
            "IPv6Gateway": "",
            "MacAddress": "",
            "Networks": {
                "xprun.shared_network": {
                    "IPAMConfig": null,
                    "Links": null,
                    "Aliases": [
                        "957d21a0cba3"
                    ],
                    "NetworkID": "c0ec5dd4a4d1918dcb2392b3ada1b5236a07198fa950d1a99e583c9fd9b4d2a6",
                    "EndpointID": "73076eb431e1484b1bff6d3e775b61b8e3da75f3ec25d8724de0157ef426d742",
                    "Gateway": "172.18.0.1",
                    "IPAddress": "172.18.0.2",
                    "IPPrefixLen": 16,
                    "IPv6Gateway": "",
                    "GlobalIPv6Address": "",
                    "GlobalIPv6PrefixLen": 0,
                    "MacAddress": "02:42:ac:12:00:02",
                    "DriverOpts": null
                }
            }
        }
    }
]

cswinter avatar Oct 05 '21 15:10 cswinter