# llama.cpp server (Vulkan image) serving a GGUF model from a PVC on a node
# labelled gpu=amd. An init container fetches the model files from the
# Hugging Face Hub into the PVC when they are missing; files already present
# on the volume are reused.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-server-gpu
  namespace: llama
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-server-gpu
  template:
    metadata:
      labels:
        app: llama-server-gpu
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      nodeSelector:
        gpu: amd

      initContainers:
        - name: download-model
          image: python:3.11-slim
          env:
            - name: HF_HOME
              value: /models/.hf
            - name: MODEL_REPO
              value: "byteshape/Qwen3.6-35B-A3B-GGUF"
            - name: MODEL_FILE
              value: "Qwen3.6-35B-A3B-IQ3_S-3.00bpw.gguf"
            # Multimodal projector that the server loads via --mmproj.
            # NOTE(review): assumed to be published in MODEL_REPO under this
            # name - confirm against the repository's file list.
            - name: MMPROJ_FILE
              value: "mmproj-bf16.gguf"
            # optional, only if you need gated/private models
            # - name: HUGGING_FACE_HUB_TOKEN
            #   valueFrom:
            #     secretKeyRef:
            #       name: hf-token
            #       key: token
          command:
            - /bin/sh
            - -c
            - |
              set -eux

              # Collect the files that are not yet present on the volume so
              # that a restart with a populated PVC skips pip and the network.
              missing=""
              for f in "${MODEL_FILE}" "${MMPROJ_FILE}"; do
                if [ -f "/models/${f}" ]; then
                  echo "${f} already exists in /models, skipping download"
                else
                  missing="${missing} ${f}"
                fi
              done

              if [ -z "${missing}" ]; then
                exit 0
              fi

              echo "Installing Hugging Face Hub downloader"
              pip install --no-cache-dir huggingface_hub

              echo "Downloading from ${MODEL_REPO}:${missing}"
              FILES_TO_DOWNLOAD="${missing}" python - <<'PY'
              import os
              from huggingface_hub import hf_hub_download

              repo_id = os.environ["MODEL_REPO"]
              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

              # local_dir_use_symlinks is intentionally not passed: it is
              # deprecated in huggingface_hub, and the unpinned install above
              # may pull a release that no longer accepts it.
              for filename in os.environ["FILES_TO_DOWNLOAD"].split():
                  path = hf_hub_download(
                      repo_id=repo_id,
                      filename=filename,
                      local_dir="/models",
                      token=token,
                  )
                  print(f"Downloaded to: {path}")
              PY

              ls -lah /models
          volumeMounts:
            - name: models
              mountPath: /models

      containers:
        - name: llama
          image: ghcr.io/ggml-org/llama.cpp:server-vulkan
          args:
            # Keep these two paths in sync with MODEL_FILE / MMPROJ_FILE of
            # the download-model init container.
            - "--model"
            - "/models/Qwen3.6-35B-A3B-IQ3_S-3.00bpw.gguf"
            - "--mmproj"
            - "/models/mmproj-bf16.gguf"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8080"
            - "--n-gpu-layers"
            - "999"
            - "--metrics"

            # performance tuning
            - "--ctx-size"
            - "24576"
            - "--parallel"
            - "2"

            # KV cache quantization
            - "--cache-type-k"
            - "q8_0"
            - "--cache-type-v"
            - "q8_0"
          ports:
            - name: http
              containerPort: 8080

          # Gate traffic on the server's health endpoint so the Service does
          # not route requests to the pod before the model has been loaded.
          # The startup probe tolerates up to 10 minutes (60 x 10s) of loading.
          startupProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            failureThreshold: 60
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5

          # NOTE(review): privileged mode is presumably here for access to
          # /dev/dri; consider a device plugin or narrower permissions.
          securityContext:
            privileged: true

          volumeMounts:
            - name: models
              mountPath: /models
            - name: dri
              mountPath: /dev/dri

          # NOTE(review): 4Gi is small next to a 35B-parameter model; verify
          # that host-side memory use stays below the limit under load.
          resources:
            requests:
              cpu: "2"
              memory: "4Gi"
            limits:
              cpu: "2"
              memory: "4Gi"

      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-gpu-models-pvc
        # Host GPU render/display device nodes.
        - name: dri
          hostPath:
            path: /dev/dri
            type: Directory
---
# Cluster-internal endpoint for the pods labelled app=llama-server-gpu.
apiVersion: v1
kind: Service
metadata:
  namespace: llama
  name: llama-server-gpu
spec:
  type: ClusterIP
  ports:
    # Service port 8080 forwards to the pod port named "http".
    - port: 8080
      targetPort: http
      name: http
  selector:
    app: llama-server-gpu
---
# PodMonitor: scrapes /metrics on the "http" port of pods labelled
# app=llama-server-gpu in the llama namespace every 15 seconds.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  namespace: llama
  name: llama-server-gpu
  labels:
    app: llama-server-gpu
spec:
  podMetricsEndpoints:
    - path: /metrics
      port: http
      interval: 15s
  selector:
    matchLabels:
      app: llama-server-gpu
  namespaceSelector:
    matchNames:
      - llama