---
# llama.cpp HTTP server for Qwen3-Coder-30B-A3B (GGUF), plus the Service and
# PodMonitor that expose it inside the cluster and to Prometheus.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-server-cpu
  namespace: llama
spec:
  # Zero replicas: no pod runs until the Deployment is scaled up.
  replicas: 0
  # Recreate stops the old pod before starting the new one, so two pods never
  # run side by side during a rollout.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: llama-server-cpu
  template:
    metadata:
      labels:
        app: llama-server-cpu
      annotations:
        # NOTE(review): the PodMonitor at the bottom of this file selects the
        # same pods. If the cluster's Prometheus also honours these
        # annotations, /metrics may be scraped twice - confirm.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      # NOTE(review): the workload is named "-cpu" but is pinned to nodes
      # labelled gpu=amd - confirm this is intentional.
      nodeSelector:
        gpu: amd
      initContainers:
        # Fetches the GGUF file into the shared volume before the server
        # starts; a no-op when the file is already present.
        - name: download-model
          image: python:3.11-slim
          env:
            - name: HF_HOME
              value: /models/.hf
            - name: MODEL_REPO
              value: "byteshape/Qwen3-Coder-30B-A3B-Instruct-GGUF"
            # Must match the file name passed to --model in the llama
            # container below.
            - name: MODEL_FILE
              value: "Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
          command:
            - /bin/sh
            - -c
            - |
              set -eux
              MODEL_PATH="/models/${MODEL_FILE}"
              if [ -f "${MODEL_PATH}" ]; then
                echo "Model already exists at ${MODEL_PATH}, skipping download"
                exit 0
              fi

              echo "Installing Hugging Face Hub downloader"
              pip install --no-cache-dir huggingface_hub

              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
              python - <<'PY'
              import os
              from huggingface_hub import hf_hub_download

              repo_id = os.environ["MODEL_REPO"]
              filename = os.environ["MODEL_FILE"]
              # Optional: unset for public repositories.
              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

              # local_dir_use_symlinks is intentionally not passed: it is
              # deprecated and ignored by current huggingface_hub releases,
              # and pip installs the latest release here, which may reject
              # the argument. local_dir alone writes a regular file.
              path = hf_hub_download(
                  repo_id=repo_id,
                  filename=filename,
                  local_dir="/models",
                  token=token,
              )
              print(f"Downloaded to: {path}")
              PY

              ls -lah /models
          volumeMounts:
            - name: models
              mountPath: /models
      containers:
        - name: llama
          # NOTE(review): ":server" is a mutable tag; consider pinning a
          # specific build or digest for reproducible rollouts.
          image: ghcr.io/ggml-org/llama.cpp:server
          args:
            - "--model"
            - "/models/Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8080"
            - "--metrics"
            - "--ctx-size"
            - "32768"
            - "--parallel"
            - "1"
            - "--cache-type-k"
            - "q8_0"
            - "--cache-type-v"
            - "q8_0"
          ports:
            - name: http
              containerPort: 8080
          # Keep the pod out of the Service endpoints until the model has
          # finished loading: llama-server's /health is expected to return a
          # non-2xx status while loading and 200 once it can serve requests.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            failureThreshold: 3
          volumeMounts:
            - name: models
              mountPath: /models
          resources:
            requests:
              cpu: "8"
              memory: "24Gi"
            limits:
              cpu: "12"
              memory: "24Gi"
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-cpu-models-pvc
---
# In-cluster endpoint for the llama.cpp server pods.
apiVersion: v1
kind: Service
metadata:
  name: llama-server-cpu
  namespace: llama
spec:
  selector:
    app: llama-server-cpu
  ports:
    - name: http
      port: 8080
      # Resolves to the container port named "http" in the Deployment.
      targetPort: http
  type: ClusterIP
---
# Prometheus Operator scrape configuration for the server's /metrics endpoint
# (enabled by the --metrics flag in the Deployment).
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: llama-server-cpu
  namespace: llama
  labels:
    app: llama-server-cpu
spec:
  namespaceSelector:
    matchNames:
      - llama
  selector:
    matchLabels:
      app: llama-server-cpu
  podMetricsEndpoints:
    - port: http
      path: /metrics
      interval: 15s