diff --git a/manifests/llama/cpu.yaml b/manifests/llama/cpu.yaml deleted file mode 100644 index aa04ff6..0000000 --- a/manifests/llama/cpu.yaml +++ /dev/null @@ -1,147 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: llama-server-cpu - namespace: llama -spec: - replicas: 1 - strategy: - type: Recreate - selector: - matchLabels: - app: llama-server-cpu - template: - metadata: - labels: - app: llama-server-cpu - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" - spec: - nodeSelector: - gpu: amd - - initContainers: - - name: download-model - image: python:3.11-slim - env: - - name: HF_HOME - value: /models/.hf - - name: MODEL_REPO - value: "byteshape/Qwen3-Coder-30B-A3B-Instruct-GGUF" - - name: MODEL_FILE - value: "Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf" - command: - - /bin/sh - - -c - - | - set -eux - - MODEL_PATH="/models/${MODEL_FILE}" - - if [ -f "${MODEL_PATH}" ]; then - echo "Model already exists at ${MODEL_PATH}, skipping download" - exit 0 - fi - - echo "Installing Hugging Face Hub downloader" - pip install --no-cache-dir huggingface_hub - - echo "Downloading ${MODEL_REPO}/${MODEL_FILE}" - python - <<'PY' - import os - from huggingface_hub import hf_hub_download - - repo_id = os.environ["MODEL_REPO"] - filename = os.environ["MODEL_FILE"] - - token = os.environ.get("HUGGING_FACE_HUB_TOKEN") - - path = hf_hub_download( - repo_id=repo_id, - filename=filename, - local_dir="/models", - local_dir_use_symlinks=False, - token=token, - ) - print(f"Downloaded to: {path}") - PY - - ls -lah /models - volumeMounts: - - name: models - mountPath: /models - - containers: - - name: llama - image: ghcr.io/ggml-org/llama.cpp:server - args: - - "--model" - - "/models/Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf" - - "--host" - - "0.0.0.0" - - "--port" - - "8080" - - "--metrics" - - "--ctx-size" - - "32768" - - "--parallel" - - "1" - - "--cache-type-k" - - "q8_0" - - "--cache-type-v" - - "q8_0" - ports: - - name: http - containerPort: 8080 - - volumeMounts: - - name: models - mountPath: /models - - resources: - requests: - cpu: "8" - memory: "24Gi" - limits: - cpu: "12" - memory: "24Gi" - - volumes: - - name: models - persistentVolumeClaim: - claimName: llama-cpu-models-pvc ---- -apiVersion: v1 -kind: Service -metadata: - name: llama-server-cpu - namespace: llama -spec: - selector: - app: llama-server-cpu - ports: - - name: http - port: 8080 - targetPort: http - type: ClusterIP ---- -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: llama-server-cpu - namespace: llama - labels: - app: llama-server-cpu -spec: - namespaceSelector: - matchNames: - - llama - selector: - matchLabels: - app: llama-server-cpu - podMetricsEndpoints: - - port: http - path: /metrics - interval: 15s