---
# llama.cpp HTTP server (Vulkan image) scheduled onto nodes labelled gpu=amd.
# An init container downloads the GGUF files onto the shared PVC before the
# server container starts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-server-gpu
  namespace: llama
spec:
  replicas: 1
  # Stop the old pod before creating its replacement. Nothing in this pod spec
  # reserves the GPU, so the default RollingUpdate would briefly run two
  # servers against the same device and the same model volume.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: llama-server-gpu
  template:
    metadata:
      labels:
        app: llama-server-gpu
      annotations:
        # Annotation-based scrape hints; the PodMonitor at the end of this
        # file targets the same port and path.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      nodeSelector:
        gpu: amd
      initContainers:
        # Fetches every file the server is started with (--model and --mmproj)
        # into /models, skipping files that are already present on the PVC.
        - name: download-model
          image: python:3.11-slim
          env:
            - name: HF_HOME
              value: /models/.hf
            - name: MODEL_REPO
              value: "byteshape/Qwen3.6-35B-A3B-GGUF"
            - name: MODEL_FILE
              value: "Qwen3.6-35B-A3B-IQ3_S-3.00bpw.gguf"
            # Must match the file name passed to --mmproj below.
            # NOTE(review): assumes the projector file is published in
            # MODEL_REPO under this name - confirm.
            - name: MMPROJ_FILE
              value: "mmproj-bf16.gguf"
            # optional, only if you need gated/private models
            # - name: HUGGING_FACE_HUB_TOKEN
            #   valueFrom:
            #     secretKeyRef:
            #       name: hf-token
            #       key: token
          command:
            - /bin/sh
            - -c
            - |
              set -eux

              # Skip the pip install entirely when nothing needs downloading.
              if [ -f "/models/${MODEL_FILE}" ] && [ -f "/models/${MMPROJ_FILE}" ]; then
                echo "All model files already exist in /models, skipping download"
                exit 0
              fi

              echo "Installing Hugging Face Hub downloader"
              # From 0.23 on, local_dir downloads are always written as real
              # files, so the deprecated local_dir_use_symlinks argument is
              # not needed.
              pip install --no-cache-dir "huggingface_hub>=0.23"

              echo "Downloading missing files from ${MODEL_REPO}"
              python - <<'PY'
              import os
              from huggingface_hub import hf_hub_download

              repo_id = os.environ["MODEL_REPO"]
              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

              for filename in (os.environ["MODEL_FILE"], os.environ["MMPROJ_FILE"]):
                  target = os.path.join("/models", filename)
                  if os.path.isfile(target):
                      print(f"Already present: {target}")
                      continue
                  path = hf_hub_download(
                      repo_id=repo_id,
                      filename=filename,
                      local_dir="/models",
                      token=token,
                  )
                  print(f"Downloaded to: {path}")
              PY

              ls -lah /models
          volumeMounts:
            - name: models
              mountPath: /models
      containers:
        - name: llama
          image: ghcr.io/ggml-org/llama.cpp:server-vulkan
          args:
            - "--model"
            - "/models/Qwen3.6-35B-A3B-IQ3_S-3.00bpw.gguf"
            - "--mmproj"
            - "/models/mmproj-bf16.gguf"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8080"
            - "--n-gpu-layers"
            - "999"
            # Exposes the /metrics endpoint scraped by Prometheus.
            - "--metrics"
            # performance tuning
            - "--ctx-size"
            - "24576"
            - "--parallel"
            - "2"
            # KV cache quantization
            - "--cache-type-k"
            - "q8_0"
            - "--cache-type-v"
            - "q8_0"
          ports:
            - name: http
              containerPort: 8080
          # llama-server answers /health with 503 while the model is loading
          # and 200 once it can serve requests; gate Service traffic on that.
          # Readiness only (no liveness/startup probe), so a slow model load
          # never gets the container killed.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 5
            failureThreshold: 3
          # NOTE(review): presumably needed to open the hostPath-mounted
          # /dev/dri device nodes - confirm whether a narrower setup works.
          securityContext:
            privileged: true
          volumeMounts:
            - name: models
              mountPath: /models
            - name: dri
              mountPath: /dev/dri
          resources:
            requests:
              cpu: "2"
              memory: "4Gi"
            limits:
              cpu: "2"
              memory: "4Gi"
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-gpu-models-pvc
        # Host GPU render/card device nodes.
        - name: dri
          hostPath:
            path: /dev/dri
            type: Directory
---
# In-cluster endpoint for the llama.cpp HTTP API.
apiVersion: v1
kind: Service
metadata:
  name: llama-server-gpu
  namespace: llama
spec:
  selector:
    app: llama-server-gpu
  ports:
    - name: http
      port: 8080
      # Resolves to the container port named "http" (8080).
      targetPort: http
  type: ClusterIP
---
# Prometheus Operator scrape configuration for the server's /metrics endpoint.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: llama-server-gpu
  namespace: llama
  labels:
    app: llama-server-gpu
spec:
  namespaceSelector:
    matchNames:
      - llama
  selector:
    matchLabels:
      app: llama-server-gpu
  podMetricsEndpoints:
    - port: http
      path: /metrics
      interval: 15s