Initial commit: k3s GitOps manifests with ArgoCD App-of-Apps

2026-05-05 13:18:51 +03:00
commit 5d9a80b976
65 changed files with 3445 additions and 0 deletions
--- a/llama/cpu.yaml
+++ b/llama/cpu.yaml
@@ -0,0 +1,147 @@
+# llama.cpp server for the Qwen3-Coder 30B GGUF model, using the plain
+# `server` image (no GPU offload flags). An init container downloads the
+# model into the PVC on first start; the main container serves it on 8080.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-server-cpu
+  namespace: llama
+spec:
+  replicas: 1
+  # Stop the old pod before starting its replacement so two model servers
+  # never compete for the same memory and ReadWriteOnce volume.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: llama-server-cpu
+  template:
+    metadata:
+      labels:
+        app: llama-server-cpu
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8080"
+        prometheus.io/path: "/metrics"
+    spec:
+      # NOTE(review): the CPU variant is also pinned to the `gpu: amd` node -
+      # presumably the only node with enough RAM; confirm this is intended.
+      nodeSelector:
+        gpu: amd
+
+      initContainers:
+        - name: download-model
+          image: python:3.11-slim
+          env:
+            - name: HF_HOME
+              value: /models/.hf
+            - name: MODEL_REPO
+              value: "byteshape/Qwen3-Coder-30B-A3B-Instruct-GGUF"
+            # Must match the file name in the llama container's --model path.
+            - name: MODEL_FILE
+              value: "Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
+          command:
+            - /bin/sh
+            - -c
+            - |
+              set -eux
+
+              MODEL_PATH="/models/${MODEL_FILE}"
+
+              # The PVC outlives the pod, so only the first start downloads.
+              if [ -f "${MODEL_PATH}" ]; then
+                echo "Model already exists at ${MODEL_PATH}, skipping download"
+                exit 0
+              fi
+
+              echo "Installing Hugging Face Hub downloader"
+              pip install --no-cache-dir huggingface_hub
+
+              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
+              python - <<'PY'
+              import os
+              from huggingface_hub import hf_hub_download
+
+              repo_id = os.environ["MODEL_REPO"]
+              filename = os.environ["MODEL_FILE"]
+
+              # Unset unless a token is injected for gated/private repos.
+              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
+
+              # local_dir already yields a real file under /models; the
+              # deprecated local_dir_use_symlinks flag is no longer passed
+              # because the library version is not pinned above.
+              path = hf_hub_download(
+                  repo_id=repo_id,
+                  filename=filename,
+                  local_dir="/models",
+                  token=token,
+              )
+              print(f"Downloaded to: {path}")
+              PY
+
+              ls -lah /models
+          volumeMounts:
+            - name: models
+              mountPath: /models
+
+      containers:
+        - name: llama
+          image: ghcr.io/ggml-org/llama.cpp:server
+          args:
+            - "--model"
+            - "/models/Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
+            - "--host"
+            - "0.0.0.0"
+            - "--port"
+            - "8080"
+            - "--metrics"
+            - "--ctx-size"
+            - "32768"
+            - "--parallel"
+            - "1"
+            - "--cache-type-k"
+            - "q8_0"
+            - "--cache-type-v"
+            - "q8_0"
+          ports:
+            - name: http
+              containerPort: 8080
+
+          # Keep the pod out of the Service endpoints until the model is
+          # loaded: llama.cpp's /health returns 503 while loading, 200 after.
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 3
+
+          volumeMounts:
+            - name: models
+              mountPath: /models
+
+          resources:
+            requests:
+              cpu: "8"
+              memory: "24Gi"
+            limits:
+              cpu: "12"
+              memory: "24Gi"
+
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: llama-cpu-models-pvc
+---
+# In-cluster address for the llama-server-cpu pods.
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: llama
+  name: llama-server-cpu
+spec:
+  type: ClusterIP
+  selector:
+    app: llama-server-cpu
+  ports:
+    # Service port 8080 forwards to the container port named "http".
+    - port: 8080
+      targetPort: http
+      name: http
+---
+# Scrapes /metrics from llama-server-cpu pods every 15 seconds.
+# NOTE(review): the pods also carry prometheus.io/* annotations, and this
+# object has no `release`/`monitoring` label like the other PodMonitors in
+# this directory - confirm which mechanism the Prometheus instance selects.
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  namespace: llama
+  name: llama-server-cpu
+  labels:
+    app: llama-server-cpu
+spec:
+  podMetricsEndpoints:
+    - path: /metrics
+      port: http
+      interval: 15s
+  selector:
+    matchLabels:
+      app: llama-server-cpu
+  namespaceSelector:
+    matchNames:
+      - llama
--- a/llama/gpu-exporter.yaml
+++ b/llama/gpu-exporter.yaml
@@ -0,0 +1,62 @@
+# Runs one radeon_exporter pod on every node labelled `gpu: amd`; the
+# container declares port 9200 (named "metrics") for scraping.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  namespace: llama
+  name: radeon-exporter
+  labels:
+    app: radeon-exporter
+spec:
+  selector:
+    matchLabels:
+      app: radeon-exporter
+  template:
+    metadata:
+      labels:
+        app: radeon-exporter
+    spec:
+      nodeSelector:
+        gpu: amd
+      volumes:
+        # Host directories exposed read-only to the exporter container.
+        - name: sys
+          hostPath:
+            type: Directory
+            path: /sys
+        - name: dri
+          hostPath:
+            type: Directory
+            path: /dev/dri
+      containers:
+        - name: radeon-exporter
+          # NOTE(review): a floating `latest` tag combined with IfNotPresent
+          # means a node keeps whichever image it pulled first - consider
+          # pinning a version or digest.
+          image: kmulvey/radeon_exporter:latest
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            privileged: true
+          ports:
+            - containerPort: 9200
+              name: metrics
+          volumeMounts:
+            - mountPath: /sys
+              name: sys
+              readOnly: true
+            - mountPath: /dev/dri
+              name: dri
+              readOnly: true
+---
+# Scrapes /metrics on the "metrics" port of radeon-exporter pods every 15s.
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  namespace: llama
+  name: radeon-exporter
+  labels:
+    monitoring: primary
+spec:
+  podMetricsEndpoints:
+    - path: /metrics
+      port: metrics
+      interval: 15s
+  selector:
+    matchLabels:
+      app: radeon-exporter
+  namespaceSelector:
+    matchNames:
+      - llama
--- a/llama/litellm-db.yaml
+++ b/llama/litellm-db.yaml
@@ -0,0 +1,116 @@
+# Database name, user and password shared by the litellm-postgres Deployment
+# and the litellm gateway (both read these keys via secretKeyRef).
+# NOTE(review): the password is stored in plain text in git. Rotate it and
+# source it from an encrypted or external secret mechanism instead.
+apiVersion: v1
+kind: Secret
+metadata:
+  name: litellm-postgres
+  namespace: llama
+type: Opaque
+stringData:
+  POSTGRES_DB: litellm
+  POSTGRES_USER: litellm
+  POSTGRES_PASSWORD: 7792e47efbc7348155f54a15ed34dc1d06716b2b1848711d0ee90e3461883c0d
+---
+# 10Gi volume claimed by the litellm-postgres Deployment.
+# No storageClassName is set, so the cluster's default class applies.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: llama
+  name: litellm-postgres
+  labels:
+    app.kubernetes.io/name: litellm-postgres
+spec:
+  resources:
+    requests:
+      storage: 10Gi
+  accessModes:
+    - ReadWriteOnce
+---
+# Single-instance PostgreSQL 16 database for the LiteLLM gateway.
+# Credentials come from the litellm-postgres Secret; data lives on the
+# litellm-postgres PVC.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: litellm-postgres
+  namespace: llama
+  labels:
+    app.kubernetes.io/name: litellm-postgres
+    app.kubernetes.io/component: database
+spec:
+  replicas: 1
+  # The data volume is ReadWriteOnce and a data directory can only be used by
+  # one postmaster, so the old pod must be gone before the new one starts.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: litellm-postgres
+      app.kubernetes.io/component: database
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: litellm-postgres
+        app.kubernetes.io/component: database
+    spec:
+      containers:
+        - name: postgres
+          image: postgres:16
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: postgres
+              containerPort: 5432
+          env:
+            - name: POSTGRES_DB
+              valueFrom:
+                secretKeyRef:
+                  name: litellm-postgres
+                  key: POSTGRES_DB
+            - name: POSTGRES_USER
+              valueFrom:
+                secretKeyRef:
+                  name: litellm-postgres
+                  key: POSTGRES_USER
+            - name: POSTGRES_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: litellm-postgres
+                  key: POSTGRES_PASSWORD
+            # Keep the cluster in a subdirectory of the mount so initdb
+            # always starts from an empty directory it owns.
+            - name: PGDATA
+              value: /var/lib/postgresql/data/pgdata
+          volumeMounts:
+            # postgres:16 declares /var/lib/postgresql/data as an image
+            # volume. Mounting the PVC one level higher leaves the real data
+            # on an ephemeral volume that is lost with the container, so the
+            # PVC is mounted exactly on the data path.
+            - name: data
+              mountPath: /var/lib/postgresql/data
+          readinessProbe:
+            exec:
+              command:
+                - sh
+                - -c
+                - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB"
+            initialDelaySeconds: 5
+            periodSeconds: 10
+          livenessProbe:
+            exec:
+              command:
+                - sh
+                - -c
+                - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB"
+            initialDelaySeconds: 20
+            periodSeconds: 20
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: 500m
+              memory: 256Mi
+      volumes:
+        - name: data
+          persistentVolumeClaim:
+            claimName: litellm-postgres
+---
+# In-cluster address of the LiteLLM database on port 5432.
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: llama
+  name: litellm-postgres
+  labels:
+    app.kubernetes.io/component: database
+    app.kubernetes.io/name: litellm-postgres
+spec:
+  type: ClusterIP
+  selector:
+    app.kubernetes.io/component: database
+    app.kubernetes.io/name: litellm-postgres
+  ports:
+    # Forwards to the container port named "postgres".
+    - port: 5432
+      targetPort: postgres
+      name: postgres
--- a/llama/litellm.yaml
+++ b/llama/litellm.yaml
@@ -0,0 +1,202 @@
+# Master key injected into the litellm container as LITELLM_MASTER_KEY.
+# NOTE(review): the key is stored in plain text in git and the gateway is
+# reachable through an Ingress. Rotate it and source it from an encrypted or
+# external secret mechanism instead.
+# NOTE(review): LiteLLM's docs describe master keys as starting with "sk-";
+# confirm this value is accepted by the deployed version.
+apiVersion: v1
+kind: Secret
+metadata:
+  name: litellm-secret
+  namespace: llama
+  labels:
+    app.kubernetes.io/name: litellm
+    app.kubernetes.io/component: gateway
+type: Opaque
+stringData:
+  LITELLM_MASTER_KEY: "6991c7c0f02b4bcf"
+---
+# LiteLLM proxy configuration, mounted into the litellm pod as
+# /app/config.yaml. Maps three model aliases onto the in-cluster llama.cpp
+# Services:
+#   fast  -> llama-server-gpu
+#   smart -> llama-server-cpu
+#   rp    -> llama-server-gpu-rp
+# It also enables the prometheus callback and stores models and prompts in
+# the database.
+# NOTE(review): store_prompts_in_spend_logs persists prompt text - confirm
+# that is acceptable for the data sent through this gateway.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: litellm-config
+  namespace: llama
+  labels:
+    app.kubernetes.io/name: litellm
+    app.kubernetes.io/component: gateway
+data:
+  config.yaml: |
+    model_list:
+      - model_name: fast
+        litellm_params:
+          model: openai/fast
+          api_base: "http://llama-server-gpu.llama.svc.cluster.local:8080/v1"
+          api_key: none
+
+      - model_name: smart
+        litellm_params:
+          model: openai/smart
+          api_base: "http://llama-server-cpu.llama.svc.cluster.local:8080/v1"
+          api_key: none
+
+      - model_name: rp
+        litellm_params:
+          model: openai/rp
+          api_base: "http://llama-server-gpu-rp.llama.svc.cluster.local:8080/v1"
+          api_key: none
+    litellm_settings:
+      callbacks:
+        - prometheus
+    general_settings:
+      store_model_in_db: true
+      store_prompts_in_spend_logs: true
+---
+# LiteLLM gateway: one replica listening on port 4000, configured from the
+# litellm-config ConfigMap and connected to the litellm-postgres database.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: llama
+  name: litellm
+  labels:
+    app.kubernetes.io/name: litellm
+    app.kubernetes.io/component: gateway
+    app.kubernetes.io/part-of: llama-stack
+    monitoring: prometheus
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: litellm
+      app.kubernetes.io/component: gateway
+  template:
+    metadata:
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "4000"
+        prometheus.io/path: "/metrics"
+      labels:
+        app.kubernetes.io/name: litellm
+        app.kubernetes.io/component: gateway
+        app.kubernetes.io/part-of: llama-stack
+        monitoring: prometheus
+    spec:
+      volumes:
+        - name: litellm-config
+          configMap:
+            name: litellm-config
+      containers:
+        - name: litellm
+          image: ghcr.io/berriai/litellm:v1.82.6.rc.3
+          imagePullPolicy: IfNotPresent
+          args:
+            - "--config"
+            - "/app/config.yaml"
+          ports:
+            - containerPort: 4000
+              name: http
+              protocol: TCP
+          resources:
+            limits:
+              cpu: "1000m"
+              memory: "2Gi"
+            requests:
+              cpu: "500m"
+              memory: "1Gi"
+          volumeMounts:
+            # Mounted as a single file via subPath.
+            - name: litellm-config
+              subPath: config.yaml
+              mountPath: /app/config.yaml
+          env:
+            - name: LITELLM_MASTER_KEY
+              valueFrom:
+                secretKeyRef:
+                  key: LITELLM_MASTER_KEY
+                  name: litellm-secret
+            # The three POSTGRES_* variables must stay above DATABASE_URL:
+            # $(VAR) expansion only sees variables defined earlier in the list.
+            - name: POSTGRES_USER
+              valueFrom:
+                secretKeyRef:
+                  key: POSTGRES_USER
+                  name: litellm-postgres
+            - name: POSTGRES_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  key: POSTGRES_PASSWORD
+                  name: litellm-postgres
+            - name: POSTGRES_DB
+              valueFrom:
+                secretKeyRef:
+                  key: POSTGRES_DB
+                  name: litellm-postgres
+            - name: DATABASE_URL
+              value: "postgresql://$(POSTGRES_USER):$(POSTGRES_PASSWORD)@litellm-postgres.llama.svc.cluster.local:5432/$(POSTGRES_DB)"
+---
+# In-cluster address of the LiteLLM gateway; also the Ingress backend.
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: llama
+  name: litellm
+  labels:
+    app.kubernetes.io/name: litellm
+    app.kubernetes.io/component: gateway
+    app.kubernetes.io/part-of: llama-stack
+    monitoring: prometheus
+spec:
+  type: ClusterIP
+  selector:
+    app.kubernetes.io/component: gateway
+    app.kubernetes.io/name: litellm
+  ports:
+    # Service port 4000 forwards to the container port named "http".
+    - port: 4000
+      protocol: TCP
+      targetPort: http
+      name: http
+---
+# Publishes the LiteLLM gateway at https://litellm.mrt0rtikize.ru through
+# Traefik's websecure entrypoint, with a certificate requested from the
+# letsencrypt-production ClusterIssuer.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: litellm
+  namespace: llama
+  labels:
+    app.kubernetes.io/name: litellm
+    app.kubernetes.io/component: gateway
+    app.kubernetes.io/part-of: llama-stack
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-production
+    traefik.ingress.kubernetes.io/router.entrypoints: websecure
+    traefik.ingress.kubernetes.io/router.tls: "true"
+spec:
+  ingressClassName: traefik
+  tls:
+    - hosts:
+        - litellm.mrt0rtikize.ru
+      # Dedicated secret for this host. The previous name, web-echo-tls,
+      # belonged to a different app and risked two Ingresses fighting over
+      # one certificate.
+      secretName: litellm-tls
+  rules:
+    - host: litellm.mrt0rtikize.ru
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: litellm
+                port:
+                  number: 4000
+---
+# Scrapes /metrics on the "http" port of the litellm pods every 30 seconds.
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  namespace: llama
+  name: litellm
+  labels:
+    release: kube-prometheus-stack
+    app.kubernetes.io/name: litellm
+    app.kubernetes.io/component: gateway
+    app.kubernetes.io/part-of: llama-stack
+spec:
+  podMetricsEndpoints:
+    - path: /metrics
+      port: http
+      interval: 30s
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: gateway
+      app.kubernetes.io/name: litellm
+  namespaceSelector:
+    matchNames:
+      - llama
--- a/llama/main.yaml
+++ b/llama/main.yaml
@@ -0,0 +1,166 @@
+# llama.cpp Vulkan server for the Devstral-Small 24B GGUF model.
+# An init container downloads the model into the shared GPU models PVC on
+# first start; the main container offloads all layers to the GPU exposed
+# through /dev/dri and serves HTTP on port 8080.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-server-gpu
+  namespace: llama
+spec:
+  replicas: 1
+  # Same policy as llama-server-cpu: replace rather than roll, so the old
+  # and the new pod never hold the GPU and the model volume at the same time.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: llama-server-gpu
+  template:
+    metadata:
+      labels:
+        app: llama-server-gpu
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8080"
+        prometheus.io/path: "/metrics"
+    spec:
+      nodeSelector:
+        gpu: amd
+
+      initContainers:
+        - name: download-model
+          image: python:3.11-slim
+          env:
+            - name: HF_HOME
+              value: /models/.hf
+            - name: MODEL_REPO
+              value: "byteshape/Devstral-Small-2-24B-Instruct-2512-GGUF"
+            # Must match the file name in the llama container's --model path.
+            - name: MODEL_FILE
+              value: "Devstral-Small-2-24B-Instruct-2512-IQ4_XS-4.04bpw.gguf"
+            # optional, only if you need gated/private models
+            # - name: HUGGING_FACE_HUB_TOKEN
+            #   valueFrom:
+            #     secretKeyRef:
+            #       name: hf-token
+            #       key: token
+          command:
+            - /bin/sh
+            - -c
+            - |
+              set -eux
+
+              MODEL_PATH="/models/${MODEL_FILE}"
+
+              # The PVC outlives the pod, so only the first start downloads.
+              if [ -f "${MODEL_PATH}" ]; then
+                echo "Model already exists at ${MODEL_PATH}, skipping download"
+                exit 0
+              fi
+
+              echo "Installing Hugging Face Hub downloader"
+              pip install --no-cache-dir huggingface_hub
+
+              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
+              python - <<'PY'
+              import os
+              from huggingface_hub import hf_hub_download
+
+              repo_id = os.environ["MODEL_REPO"]
+              filename = os.environ["MODEL_FILE"]
+
+              # Unset unless the optional token env var above is enabled.
+              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
+
+              # local_dir already yields a real file under /models; the
+              # deprecated local_dir_use_symlinks flag is no longer passed
+              # because the library version is not pinned above.
+              path = hf_hub_download(
+                  repo_id=repo_id,
+                  filename=filename,
+                  local_dir="/models",
+                  token=token,
+              )
+              print(f"Downloaded to: {path}")
+              PY
+
+              ls -lah /models
+          volumeMounts:
+            - name: models
+              mountPath: /models
+
+      containers:
+        - name: llama
+          image: ghcr.io/ggml-org/llama.cpp:server-vulkan
+          args:
+            - "--model"
+            - "/models/Devstral-Small-2-24B-Instruct-2512-IQ4_XS-4.04bpw.gguf"
+            - "--host"
+            - "0.0.0.0"
+            - "--port"
+            - "8080"
+            - "--n-gpu-layers"
+            - "999"
+            - "--metrics"
+
+            # performance tuning
+            - "--ctx-size"
+            - "32768"
+            - "--parallel"
+            - "4"
+
+            # KV cache quantization
+            - "--cache-type-k"
+            - "q8_0"
+            - "--cache-type-v"
+            - "q8_0"
+          ports:
+            - name: http
+              containerPort: 8080
+
+          # Keep the pod out of the Service endpoints until the model is
+          # loaded: llama.cpp's /health returns 503 while loading, 200 after.
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 3
+
+          # Privileged so the container can open the host's /dev/dri nodes.
+          securityContext:
+            privileged: true
+
+          volumeMounts:
+            - name: models
+              mountPath: /models
+            - name: dri
+              mountPath: /dev/dri
+
+          resources:
+            requests:
+              cpu: "2"
+              memory: "4Gi"
+            limits:
+              cpu: "2"
+              memory: "4Gi"
+
+      volumes:
+        # Also mounted by llama-server-gpu-rp.
+        - name: models
+          persistentVolumeClaim:
+            claimName: llama-gpu-models-pvc
+        - name: dri
+          hostPath:
+            path: /dev/dri
+            type: Directory
+---
+# In-cluster address for the llama-server-gpu pods.
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: llama
+  name: llama-server-gpu
+spec:
+  type: ClusterIP
+  selector:
+    app: llama-server-gpu
+  ports:
+    # Service port 8080 forwards to the container port named "http".
+    - port: 8080
+      targetPort: http
+      name: http
+---
+# Scrapes /metrics from llama-server-gpu pods every 15 seconds.
+# NOTE(review): the pods also carry prometheus.io/* annotations, and this
+# object has no `release`/`monitoring` label like the other PodMonitors in
+# this directory - confirm which mechanism the Prometheus instance selects.
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  namespace: llama
+  name: llama-server-gpu
+  labels:
+    app: llama-server-gpu
+spec:
+  podMetricsEndpoints:
+    - path: /metrics
+      port: http
+      interval: 15s
+  selector:
+    matchLabels:
+      app: llama-server-gpu
+  namespaceSelector:
+    matchNames:
+      - llama
--- a/llama/namespace.yaml
+++ b/llama/namespace.yaml
@@ -0,0 +1,42 @@
+# Namespace used by all manifests in the llama/ directory.
+kind: Namespace
+apiVersion: v1
+metadata: {name: llama}
+---
+# apiVersion: storage.k8s.io/v1
+# kind: StorageClass
+# metadata:
+#   name: longhorn-llama
+# provisioner: driver.longhorn.io
+# parameters:
+#   numberOfReplicas: "2"
+#   staleReplicaTimeout: "30"
+# allowVolumeExpansion: true
+# reclaimPolicy: Retain
+# volumeBindingMode: Immediate
+# ---
+# 50Gi model store mounted by both llama-server-gpu and llama-server-gpu-rp.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: llama
+  name: llama-gpu-models-pvc
+spec:
+  # storageClassName: longhorn-llama
+  resources:
+    requests:
+      storage: 50Gi
+  accessModes:
+    - ReadWriteOnce
+---
+# 100Gi model store mounted by llama-server-cpu.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: llama
+  name: llama-cpu-models-pvc
+spec:
+  # storageClassName: longhorn-llama
+  resources:
+    requests:
+      storage: 100Gi
+  accessModes:
+    - ReadWriteOnce
--- a/llama/rp.yaml
+++ b/llama/rp.yaml
@@ -0,0 +1,166 @@
+# llama.cpp Vulkan server for the Omega-Darker 24B GGUF model ("rp" alias).
+# An init container downloads the model into the shared GPU models PVC on
+# first start; the main container offloads all layers to the GPU exposed
+# through /dev/dri and serves HTTP on port 8080.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-server-gpu-rp
+  namespace: llama
+spec:
+  replicas: 1
+  # Same policy as llama-server-cpu: replace rather than roll, so the old
+  # and the new pod never hold the GPU and the model volume at the same time.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: llama-server-gpu-rp
+  template:
+    metadata:
+      labels:
+        app: llama-server-gpu-rp
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8080"
+        prometheus.io/path: "/metrics"
+    spec:
+      nodeSelector:
+        gpu: amd
+
+      initContainers:
+        - name: download-model
+          image: python:3.11-slim
+          env:
+            - name: HF_HOME
+              value: /models/.hf
+            - name: MODEL_REPO
+              value: "mradermacher/Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B-GGUF"
+            # Must match the file name in the llama container's --model path.
+            - name: MODEL_FILE
+              value: "Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B.Q4_K_S.gguf"
+            # optional, only if you need gated/private models
+            # - name: HUGGING_FACE_HUB_TOKEN
+            #   valueFrom:
+            #     secretKeyRef:
+            #       name: hf-token
+            #       key: token
+          command:
+            - /bin/sh
+            - -c
+            - |
+              set -eux
+
+              MODEL_PATH="/models/${MODEL_FILE}"
+
+              # The PVC outlives the pod, so only the first start downloads.
+              if [ -f "${MODEL_PATH}" ]; then
+                echo "Model already exists at ${MODEL_PATH}, skipping download"
+                exit 0
+              fi
+
+              echo "Installing Hugging Face Hub downloader"
+              pip install --no-cache-dir huggingface_hub
+
+              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
+              python - <<'PY'
+              import os
+              from huggingface_hub import hf_hub_download
+
+              repo_id = os.environ["MODEL_REPO"]
+              filename = os.environ["MODEL_FILE"]
+
+              # Unset unless the optional token env var above is enabled.
+              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
+
+              # local_dir already yields a real file under /models; the
+              # deprecated local_dir_use_symlinks flag is no longer passed
+              # because the library version is not pinned above.
+              path = hf_hub_download(
+                  repo_id=repo_id,
+                  filename=filename,
+                  local_dir="/models",
+                  token=token,
+              )
+              print(f"Downloaded to: {path}")
+              PY
+
+              ls -lah /models
+          volumeMounts:
+            - name: models
+              mountPath: /models
+
+      containers:
+        - name: llama
+          image: ghcr.io/ggml-org/llama.cpp:server-vulkan
+          args:
+            - "--model"
+            - "/models/Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B.Q4_K_S.gguf"
+            - "--host"
+            - "0.0.0.0"
+            - "--port"
+            - "8080"
+            - "--n-gpu-layers"
+            - "999"
+            - "--metrics"
+
+            # performance tuning
+            - "--ctx-size"
+            - "32768"
+            - "--parallel"
+            - "1"
+
+            # KV cache quantization
+            - "--cache-type-k"
+            - "q8_0"
+            - "--cache-type-v"
+            - "q8_0"
+          ports:
+            - name: http
+              containerPort: 8080
+
+          # Keep the pod out of the Service endpoints until the model is
+          # loaded: llama.cpp's /health returns 503 while loading, 200 after.
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 3
+
+          # Privileged so the container can open the host's /dev/dri nodes.
+          securityContext:
+            privileged: true
+
+          volumeMounts:
+            - name: models
+              mountPath: /models
+            - name: dri
+              mountPath: /dev/dri
+
+          resources:
+            requests:
+              cpu: "2"
+              memory: "4Gi"
+            limits:
+              cpu: "2"
+              memory: "4Gi"
+
+      volumes:
+        # Also mounted by llama-server-gpu.
+        - name: models
+          persistentVolumeClaim:
+            claimName: llama-gpu-models-pvc
+        - name: dri
+          hostPath:
+            path: /dev/dri
+            type: Directory
+---
+# In-cluster address for the llama-server-gpu-rp pods.
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: llama
+  name: llama-server-gpu-rp
+spec:
+  type: ClusterIP
+  selector:
+    app: llama-server-gpu-rp
+  ports:
+    # Service port 8080 forwards to the container port named "http".
+    - port: 8080
+      targetPort: http
+      name: http
+---
+# Scrapes /metrics from llama-server-gpu-rp pods every 15 seconds.
+# NOTE(review): the pods also carry prometheus.io/* annotations, and this
+# object has no `release`/`monitoring` label like the other PodMonitors in
+# this directory - confirm which mechanism the Prometheus instance selects.
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  namespace: llama
+  name: llama-server-gpu-rp
+  labels:
+    app: llama-server-gpu-rp
+spec:
+  podMetricsEndpoints:
+    - path: /metrics
+      port: http
+      interval: 15s
+  selector:
+    matchLabels:
+      app: llama-server-gpu-rp
+  namespaceSelector:
+    matchNames:
+      - llama