# llama.cpp HTTP inference server for a Qwen3-Coder 30B GGUF model.
# An init container puts the model file on a PVC-backed volume; the main
# container then serves it on port 8080 with the /metrics endpoint enabled.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-server-cpu
  namespace: llama
spec:
  # Ships scaled to zero: no pod runs until the replica count is raised.
  replicas: 0
  # Recreate stops the old pod before a replacement is started, so two pods
  # never run (or mount the models claim) at the same time during a rollout.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: llama-server-cpu
  template:
    metadata:
      labels:
        app: llama-server-cpu
      # Annotation-based scrape hints. A PodMonitor in this file targets the
      # same pods; NOTE(review): if the cluster's Prometheus honours both
      # mechanisms the endpoint is scraped twice — confirm which one is used.
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      # NOTE(review): a deployment named "-cpu" is pinned to nodes labelled
      # gpu=amd — confirm this placement is intentional.
      nodeSelector:
        gpu: amd

      initContainers:
        # Fetches the GGUF file from the Hugging Face Hub into /models unless
        # it is already present on the volume.
        - name: download-model
          image: python:3.11-slim
          env:
            # Keep the Hugging Face cache on the models volume.
            - name: HF_HOME
              value: /models/.hf
            - name: MODEL_REPO
              value: "byteshape/Qwen3-Coder-30B-A3B-Instruct-GGUF"
            # Must stay in sync with the --model path of the llama container.
            - name: MODEL_FILE
              value: "Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
          command:
            - /bin/sh
            - -c
            - |
              set -eux

              MODEL_PATH="/models/${MODEL_FILE}"

              # Idempotent: a model file already on the volume skips the download.
              if [ -f "${MODEL_PATH}" ]; then
                echo "Model already exists at ${MODEL_PATH}, skipping download"
                exit 0
              fi

              echo "Installing Hugging Face Hub downloader"
              pip install --no-cache-dir huggingface_hub

              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
              python - <<'PY'
              import os
              from huggingface_hub import hf_hub_download

              repo_id = os.environ["MODEL_REPO"]
              filename = os.environ["MODEL_FILE"]

              # Optional: unset means an anonymous download.
              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

              # The deprecated local_dir_use_symlinks keyword is intentionally
              # not passed: the library version is unpinned, and releases that
              # still accept it ignore it when local_dir is set.
              path = hf_hub_download(
                  repo_id=repo_id,
                  filename=filename,
                  local_dir="/models",
                  token=token,
              )
              print(f"Downloaded to: {path}")
              PY

              ls -lah /models
          volumeMounts:
            - name: models
              mountPath: /models

      containers:
        - name: llama
          image: ghcr.io/ggml-org/llama.cpp:server
          # NOTE(review): no --threads is passed, so llama.cpp chooses its own
          # thread count; confirm it does not exceed the CPU limit below.
          args:
            - "--model"
            - "/models/Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8080"
            - "--metrics"
            - "--ctx-size"
            - "32768"
            - "--parallel"
            - "1"
            - "--cache-type-k"
            - "q8_0"
            - "--cache-type-v"
            - "q8_0"
          ports:
            - name: http
              containerPort: 8080

          # Keep the pod out of the Service endpoints until the model has
          # finished loading: /health answers 503 while loading and 200 once
          # the server can accept requests. A readiness failure only removes
          # the pod from load balancing; it never restarts the container.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3

          volumeMounts:
            - name: models
              mountPath: /models

          # Memory request equals the limit; CPU may burst from 8 to 12 cores.
          resources:
            requests:
              cpu: "8"
              memory: "24Gi"
            limits:
              cpu: "12"
              memory: "24Gi"

      volumes:
        # Shared by the init container (writer) and the server (reader).
        - name: models
          persistentVolumeClaim:
            claimName: llama-cpu-models-pvc
---
# Cluster-internal Service in front of the llama-server-cpu pods.
apiVersion: v1
kind: Service
metadata:
  namespace: llama
  name: llama-server-cpu
spec:
  type: ClusterIP
  # Selects pods carrying the label set in the Deployment's pod template.
  selector:
    app: llama-server-cpu
  ports:
    - port: 8080
      name: http
      # Resolved by name against the container port called "http".
      targetPort: http
---
# Prometheus Operator PodMonitor: scrapes /metrics on the "http" port of
# every pod labelled app=llama-server-cpu in the llama namespace.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  namespace: llama
  name: llama-server-cpu
  labels:
    app: llama-server-cpu
spec:
  selector:
    matchLabels:
      app: llama-server-cpu
  # Restrict pod discovery to the llama namespace.
  namespaceSelector:
    matchNames: [llama]
  podMetricsEndpoints:
    - path: /metrics
      # Named container port, not a port number.
      port: http
      interval: 15s