Initial commit: k3s GitOps manifests with ArgoCD App-of-Apps

2026-05-05 13:18:51 +03:00
commit 5d9a80b976
65 changed files with 3445 additions and 0 deletions
--- a/llama/cpu.yaml
+++ b/llama/cpu.yaml
@@ -0,0 +1,147 @@
+# CPU llama.cpp server: an init container caches the GGUF model on a PVC; llama-server then serves it on 8080.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-server-cpu
+  namespace: llama
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate  # stop the old pod before starting its replacement
+  selector:
+    matchLabels:
+      app: llama-server-cpu
+  template:
+    metadata:
+      labels:
+        app: llama-server-cpu
+      annotations:
+        prometheus.io/scrape: "true"  # NOTE(review): this file also defines a PodMonitor; confirm both are wanted
+        prometheus.io/port: "8080"
+        prometheus.io/path: "/metrics"
+    spec:
+      nodeSelector:
+        gpu: amd  # NOTE(review): CPU deployment pinned to gpu=amd nodes; confirm this is intended
+      initContainers:  # one-time model fetch onto the PVC; skipped when the file already exists
+        - name: download-model
+          image: python:3.11-slim
+          env:
+            - name: HF_HOME
+              value: /models/.hf
+            - name: MODEL_REPO
+              value: "byteshape/Qwen3-Coder-30B-A3B-Instruct-GGUF"
+            - name: MODEL_FILE
+              value: "Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
+          command:
+            - /bin/sh
+            - -c
+            - |
+              set -eux
+
+              MODEL_PATH="/models/${MODEL_FILE}"
+
+              if [ -f "${MODEL_PATH}" ]; then
+                echo "Model already exists at ${MODEL_PATH}, skipping download"
+                exit 0
+              fi
+
+              echo "Installing Hugging Face Hub downloader"
+              pip install --no-cache-dir huggingface_hub
+
+              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
+              python - <<'PY'
+              import os
+              from huggingface_hub import hf_hub_download
+
+              repo_id = os.environ["MODEL_REPO"]
+              filename = os.environ["MODEL_FILE"]
+
+              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")  # None when the variable is unset
+
+              path = hf_hub_download(
+                  repo_id=repo_id,
+                  filename=filename,
+                  local_dir="/models",
+                  token=token,
+              )
+              print(f"Downloaded to: {path}")
+              PY
+
+              ls -lah /models
+          volumeMounts:
+            - name: models
+              mountPath: /models
+      containers:  # long-running inference server; reads the model from the same volume
+        - name: llama
+          image: ghcr.io/ggml-org/llama.cpp:server
+          args:
+            - "--model"
+            - "/models/Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"  # must match MODEL_FILE above
+            - "--host"
+            - "0.0.0.0"
+            - "--port"
+            - "8080"
+            - "--metrics"
+            - "--ctx-size"
+            - "32768"
+            - "--parallel"
+            - "1"
+            - "--cache-type-k"
+            - "q8_0"
+            - "--cache-type-v"
+            - "q8_0"
+          ports:
+            - name: http
+              containerPort: 8080
+          readinessProbe:  # keep the pod out of Service endpoints until llama-server's /health succeeds
+            httpGet:
+              path: /health
+              port: http
+            timeoutSeconds: 5
+          volumeMounts:
+            - name: models
+              mountPath: /models
+          resources:
+            requests:
+              cpu: "8"
+              memory: "24Gi"
+            limits:
+              cpu: "12"
+              memory: "24Gi"  # equal to the memory request
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: llama-cpu-models-pvc
+---
+apiVersion: v1
+kind: Service  # in-cluster endpoint for the llama-server-cpu pods
+metadata:
+  namespace: llama
+  name: llama-server-cpu
+spec:
+  type: ClusterIP
+  ports:
+    - port: 8080
+      name: http
+      targetPort: http  # resolved by name against the pod's "http" container port
+  selector:
+    app: llama-server-cpu
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor  # scrape definition for pods labelled app=llama-server-cpu
+metadata:
+  namespace: llama
+  name: llama-server-cpu
+  labels:
+    app: llama-server-cpu
+spec:
+  podMetricsEndpoints:
+    - path: /metrics
+      port: http  # container port name, not a number
+      interval: 15s
+  selector:
+    matchLabels:
+      app: llama-server-cpu
+  namespaceSelector:
+    matchNames:
+      - llama