Skip to content
Merged
13 changes: 11 additions & 2 deletions Dockerfile.sdk
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ ARG JAVA_BINDINGS_MAVEN_VERSION=3.8.4
ARG JAVA_BINDINGS_JAVACPP_PRESETS_TAG=1.5.8
ARG TRITON_PERF_ANALYZER_BUILD=1
# DCGM version to install for Model Analyzer
ARG DCGM_VERSION=3.3.6
ARG DCGM_VERSION=4.4.0

ARG NVIDIA_TRITON_SERVER_SDK_VERSION=unknown
ARG NVIDIA_BUILD_ID=unknown
Expand Down Expand Up @@ -265,7 +265,16 @@ RUN if [ "$TRITON_ENABLE_GPU" = "ON" ]; then \
curl -o /tmp/cuda-keyring.deb \
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/$arch/cuda-keyring_1.1-1_all.deb \
&& apt install /tmp/cuda-keyring.deb && rm /tmp/cuda-keyring.deb && \
apt-get update && apt-get install -y datacenter-gpu-manager=1:${DCGM_VERSION}; \
if [ "${arch}" = "x86_64" ]; then \
curl -kL https://kitmaker-web.nvidia.com/kitpicks/dcgm-rel-dcgm-4-4/${DCGM_VERSION}/001/local_installers/dcgm-local-repo-ubuntu2404-${DCGM_VERSION}_1.0-1_amd64.deb -o /tmp/dcgm-local.deb; \
Comment thread
yinggeh marked this conversation as resolved.
Outdated
else \
curl -kL https://kitmaker-web.nvidia.com/kitpicks/dcgm-rel-dcgm-4-4/${DCGM_VERSION}/001/local_installers/dcgm-local-repo-ubuntu2404-${DCGM_VERSION}_1.0-1_arm64.deb -o /tmp/dcgm-local.deb; \
fi \
&& apt install /tmp/dcgm-local.deb \
&& rm /tmp/dcgm-local.deb \
&& cp /var/dcgm-local-repo-ubuntu2404-${DCGM_VERSION}/dcgm-local-*-keyring.gpg /usr/share/keyrings/ \
&& apt-get update \
&& apt install --yes datacenter-gpu-manager-4-dev; \
Comment thread
yinggeh marked this conversation as resolved.
Outdated
fi

# Build expects "python" executable (not python3).
Expand Down
36 changes: 27 additions & 9 deletions build.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
"ort_version": "1.22.0",
"ort_openvino_version": "2025.2.0",
"standalone_openvino_version": "2025.2.0",
"dcgm_version": "3.3.6",
"dcgm_version": "4.4.0",
Comment thread
yinggeh marked this conversation as resolved.
Outdated
"vllm_version": "0.9.0.1",
"rhel_py_version": "3.12.3",
}
Expand Down Expand Up @@ -858,19 +858,29 @@ def install_dcgm_libraries(dcgm_version, target_machine):
# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\
&& dnf clean expire-cache \\
&& dnf install -y datacenter-gpu-manager-{}
&& curl -kL https://kitmaker-web.nvidia.com/kitpicks/dcgm-rel-dcgm-4-4/{}/001/local_installers/dcgm-local-repo-rhel8-{}-1.0-1.aarch64.rpm -o /tmp/dcgm-local.rpm \\
&& dnf install /tmp/dcgm-local.rpm \\
&& rm /tmp/dcgm-local.rpm \\
&& cp /var/dcgm-local-repo-rhel8-{}/dcgm-local-*-keyring.gpg /usr/share/keyrings/ \\
&& dnf clean expire-cache \\
&& dnf install --assumeyes datacenter-gpu-manager-4-devel
""".format(
dcgm_version, dcgm_version
dcgm_version, dcgm_version, dcgm_version, dcgm_version
)
else:
return """
ENV DCGM_VERSION {}
# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\
&& dnf clean expire-cache \\
&& dnf install -y datacenter-gpu-manager-{}
&& curl -kL https://kitmaker-web.nvidia.com/kitpicks/dcgm-rel-dcgm-4-4/{}/001/local_installers/dcgm-local-repo-rhel8-{}-1.0-1.x86_64.rpm -o /tmp/dcgm-local.rpm \\
&& dnf install /tmp/dcgm-local.rpm \\
&& rm /tmp/dcgm-local.rpm \\
&& cp /var/dcgm-local-repo-rhel8-{}/dcgm-local-*-keyring.gpg /usr/share/keyrings/ \\
&& dnf clean expire-cache \\
&& dnf install --assumeyes datacenter-gpu-manager-4-devel
""".format(
dcgm_version, dcgm_version
dcgm_version, dcgm_version, dcgm_version, dcgm_version
)
else:
if target_machine == "aarch64":
Expand All @@ -881,10 +891,14 @@ def install_dcgm_libraries(dcgm_version, target_machine):
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \\
&& apt install /tmp/cuda-keyring.deb \\
&& rm /tmp/cuda-keyring.deb \\
&& curl -kL https://kitmaker-web.nvidia.com/kitpicks/dcgm-rel-dcgm-4-4/{}/001/local_installers/dcgm-local-repo-ubuntu2404-{}_1.0-1_arm64.deb -o /tmp/dcgm-local.deb \\
&& apt install /tmp/dcgm-local.deb \\
&& rm /tmp/dcgm-local.deb \\
&& cp /var/dcgm-local-repo-ubuntu2404-{}/dcgm-local-*-keyring.gpg /usr/share/keyrings/ \\
&& apt-get update \\
Comment thread
yinggeh marked this conversation as resolved.
Outdated
&& apt-get install -y datacenter-gpu-manager=1:{}
&& apt install --yes datacenter-gpu-manager-4-dev
""".format(
dcgm_version, dcgm_version
dcgm_version, dcgm_version, dcgm_version, dcgm_version
)
else:
return """
Expand All @@ -894,10 +908,14 @@ def install_dcgm_libraries(dcgm_version, target_machine):
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb \\
&& apt install /tmp/cuda-keyring.deb \\
&& rm /tmp/cuda-keyring.deb \\
&& curl -kL https://kitmaker-web.nvidia.com/kitpicks/dcgm-rel-dcgm-4-4/{}/001/local_installers/dcgm-local-repo-ubuntu2404-{}_1.0-1_amd64.deb -o /tmp/dcgm-local.deb \\
&& apt install /tmp/dcgm-local.deb \\
&& rm /tmp/dcgm-local.deb \\
&& cp /var/dcgm-local-repo-ubuntu2404-{}/dcgm-local-*-keyring.gpg /usr/share/keyrings/ \\
&& apt-get update \\
&& apt-get install -y datacenter-gpu-manager=1:{}
&& apt install --yes datacenter-gpu-manager-4-dev
""".format(
dcgm_version, dcgm_version
dcgm_version, dcgm_version, dcgm_version, dcgm_version
)


Expand Down
2 changes: 1 addition & 1 deletion compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def create_argmap(images, skip_pull):
dcgm_ver = re.search("DCGM_VERSION=([\S]{4,}) ", vars)
dcgm_version = ""
if dcgm_ver is None:
dcgm_version = "3.3.6"
dcgm_version = "4.4.0"
log(
"WARNING: DCGM version not found from image, installing the earlierst version {}".format(
dcgm_version
Expand Down
Loading