Skip to content

Commit 4c4344f

Browse files
committed
Attempt nvidia-device-plugin app fixes
1 parent 07ea2d0 commit 4c4344f

4 files changed

Lines changed: 23 additions & 7 deletions

File tree

helm/nvidia-device-plugin/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ name: nvidia-device-plugin
33
description: NVIDIA Device Plugin for Kubernetes
44
type: application
55
version: 0.1.0
6-
appVersion: "v0.17.1"
6+
appVersion: "v0.18.0"

helm/nvidia-device-plugin/templates/daemonset.yaml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ spec:
2525
- image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
2626
imagePullPolicy: {{ .Values.image.pullPolicy }}
2727
name: nvidia-device-plugin-ctr
28-
args: ["--fail-on-init-error=false"]
28+
command: ["nvidia-device-plugin"]
2929
securityContext:
3030
{{- toYaml .Values.securityContext | nindent 10 }}
3131
resources:
@@ -35,8 +35,13 @@ spec:
3535
env:
3636
- name: PASS_DEVICE_SPECS
3737
value: "true"
38+
{{- if typeIs "bool" .Values.failOnInitError }}
39+
- name: FAIL_ON_INIT_ERROR
40+
value: {{ .Values.failOnInitError | quote }}
41+
{{- else }}
3842
- name: FAIL_ON_INIT_ERROR
3943
value: "false"
44+
{{- end }}
4045
{{- if .Values.env }}
4146
{{- toYaml .Values.env | nindent 8 }}
4247
{{- else }}
@@ -50,5 +55,4 @@ spec:
5055
value: "compute,utility"
5156
{{- end }}
5257
volumes:
53-
{{- toYaml .Values.volumes | nindent 8 }}
54-
hostNetwork: true
58+
{{- toYaml .Values.volumes | nindent 8 }}

helm/nvidia-device-plugin/values.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ image:
33
tag: v0.18.0
44
pullPolicy: IfNotPresent
55

6+
failOnInitError: false
7+
68
nodeSelector:
79
kubernetes.io/arch: amd64
810
gpu: "true"
@@ -33,6 +35,15 @@ env:
3335
value: "all"
3436
- name: NVIDIA_DRIVER_CAPABILITIES
3537
value: "compute,utility"
38+
# Set NVIDIA driver root to help locate driver libraries
39+
# The default is "/", which should work with privileged containers
40+
- name: NVIDIA_DRIVER_ROOT
41+
value: "/"
42+
# Device discovery strategy: "auto" (default), "nvml", or "tegra"
43+
# Use "auto" to let the plugin automatically detect the best strategy
44+
# Only use "nvml" if you know NVML is available and auto-detection fails
45+
# - name: DEVICE_DISCOVERY_STRATEGY
46+
# value: "auto"
3647

3748
volumeMounts:
3849
- name: device-plugin

helm/ollama/values.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
# Custom values for ollama-helm with GPU support
21
# Based on official otwld/ollama-helm chart with custom configurations
32

43
replicaCount: 1
@@ -172,9 +171,11 @@ persistentVolume:
172171
enabled: true
173172
accessModes:
174173
- ReadWriteOnce
175-
annotations: {}
174+
annotations:
175+
# Set the number of replicas for this Longhorn volume by overriding the storage class default
176+
longhorn.io/number-of-replicas: "1"
176177
existingClaim: ""
177-
size: 500Gi
178+
size: 200Gi
178179
storageClass: "longhorn"
179180
volumeMode: ""
180181
subPath: ""

0 commit comments

Comments
 (0)