Remove llama-server-cpu from ArgoCD manifests

2026-05-07 21:39:38 +03:00
parent 231e90a965
commit 03b5b6f07c
1 changed files with 0 additions and 147 deletions
--- a/manifests/llama/cpu.yaml
+++ b/manifests/llama/cpu.yaml
@@ -1,147 +0,0 @@
 # Deployment running a single llama.cpp server replica in the "llama" namespace.
 # An init container fetches the GGUF model onto the "models" volume before the
 # server container starts.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: llama-server-cpu
  namespace: llama
 spec:
  replicas: 1
  # Recreate: the old pod is terminated before the new one is created.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: llama-server-cpu
  template:
    metadata:
      labels:
        app: llama-server-cpu
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      # NOTE(review): this CPU-named workload is pinned to nodes labelled
      # gpu=amd — confirm that is the intended placement.
      nodeSelector:
        gpu: amd
      initContainers:
        # Downloads MODEL_FILE from MODEL_REPO into /models unless it is
        # already present there.
        - name: download-model
          image: python:3.11-slim
          env:
            - name: HF_HOME
              value: /models/.hf
            - name: MODEL_REPO
              value: "byteshape/Qwen3-Coder-30B-A3B-Instruct-GGUF"
            - name: MODEL_FILE
              value: "Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
          command:
            - /bin/sh
            - -c
            - |
              set -eux
              MODEL_PATH="/models/${MODEL_FILE}"
              if [ -f "${MODEL_PATH}" ]; then
                echo "Model already exists at ${MODEL_PATH}, skipping download"
                exit 0
              fi
              echo "Installing Hugging Face Hub downloader"
              pip install --no-cache-dir huggingface_hub
              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
              python - <<'PY'
              import os
              from huggingface_hub import hf_hub_download
              repo_id = os.environ["MODEL_REPO"]
              filename = os.environ["MODEL_FILE"]
              # Optional: the token is None when the variable is not set.
              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
              # local_dir_use_symlinks is deliberately not passed: it is
              # deprecated in current huggingface_hub releases, and the
              # package is installed unpinned above.
              path = hf_hub_download(
                  repo_id=repo_id,
                  filename=filename,
                  local_dir="/models",
                  token=token,
              )
              print(f"Downloaded to: {path}")
              PY
              ls -lah /models
          volumeMounts:
            - name: models
              mountPath: /models
      containers:
        - name: llama
          # NOTE(review): ":server" is a mutable tag — consider pinning a
          # specific build or digest for reproducible rollouts.
          image: ghcr.io/ggml-org/llama.cpp:server
          args:
            # Must match the MODEL_FILE downloaded by the init container.
            - "--model"
            - "/models/Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8080"
            - "--metrics"
            - "--ctx-size"
            - "32768"
            - "--parallel"
            - "1"
            - "--cache-type-k"
            - "q8_0"
            - "--cache-type-v"
            - "q8_0"
          ports:
            - name: http
              containerPort: 8080
          # Keep the pod out of the Service endpoints until the server reports
          # healthy, so requests are not routed to it while the model is still
          # loading. Readiness only: a slow load never triggers a restart.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          volumeMounts:
            - name: models
              mountPath: /models
          resources:
            requests:
              cpu: "8"
              memory: "24Gi"
            limits:
              cpu: "12"
              memory: "24Gi"
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-cpu-models-pvc
 ---
 # In-cluster (ClusterIP) Service for the pods labelled app=llama-server-cpu.
 apiVersion: v1
 kind: Service
 metadata:
  namespace: llama
  name: llama-server-cpu
 spec:
  type: ClusterIP
  ports:
    # Service port 8080 forwards to the container port named "http".
    - port: 8080
      targetPort: http
      name: http
  selector:
    app: llama-server-cpu
 ---
 # PodMonitor selecting pods labelled app=llama-server-cpu in the "llama"
 # namespace and scraping /metrics on their "http" port every 15s.
 # NOTE(review): the Deployment's pod template also carries prometheus.io/*
 # annotations; if annotation-based discovery is enabled as well, the same
 # endpoint may be scraped twice — confirm.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  namespace: llama
  name: llama-server-cpu
  labels:
    app: llama-server-cpu
 spec:
  selector:
    matchLabels:
      app: llama-server-cpu
  namespaceSelector:
    matchNames: [llama]
  podMetricsEndpoints:
    - interval: 15s
      path: /metrics
      port: http