Initial commit: k3s GitOps manifests with ArgoCD App-of-Apps

This commit is contained in:
2026-05-05 13:18:51 +03:00
commit 5d9a80b976
65 changed files with 3445 additions and 0 deletions

147
manifests/llama/cpu.yaml Normal file
View File

@@ -0,0 +1,147 @@
# llama.cpp server (CPU image) serving the Qwen3-Coder 30B GGUF model on :8080.
# An init container downloads the model file onto the models PVC if it is not
# already present; the server container then loads it from /models.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-server-cpu
  namespace: llama
spec:
  replicas: 1
  # Recreate: the old pod is stopped before the replacement is started.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: llama-server-cpu
  template:
    metadata:
      labels:
        app: llama-server-cpu
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      # NOTE(review): the CPU server is pinned to the gpu=amd node as well —
      # presumably the same host as the GPU servers; confirm this is intended.
      nodeSelector:
        gpu: amd
      initContainers:
        - name: download-model
          image: python:3.11-slim
          env:
            - name: HF_HOME
              value: /models/.hf
            - name: MODEL_REPO
              value: "byteshape/Qwen3-Coder-30B-A3B-Instruct-GGUF"
            - name: MODEL_FILE
              value: "Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
          # The deprecated `local_dir_use_symlinks` argument is no longer
          # passed to hf_hub_download: huggingface_hub is installed unpinned,
          # and newer releases ignore or reject that keyword.
          command:
            - /bin/sh
            - -c
            - |
              set -eux
              MODEL_PATH="/models/${MODEL_FILE}"
              if [ -f "${MODEL_PATH}" ]; then
                echo "Model already exists at ${MODEL_PATH}, skipping download"
                exit 0
              fi
              echo "Installing Hugging Face Hub downloader"
              pip install --no-cache-dir huggingface_hub
              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
              python - <<'PY'
              import os
              from huggingface_hub import hf_hub_download
              repo_id = os.environ["MODEL_REPO"]
              filename = os.environ["MODEL_FILE"]
              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
              path = hf_hub_download(
                  repo_id=repo_id,
                  filename=filename,
                  local_dir="/models",
                  token=token,
              )
              print(f"Downloaded to: {path}")
              PY
              ls -lah /models
          volumeMounts:
            - name: models
              mountPath: /models
      containers:
        - name: llama
          image: ghcr.io/ggml-org/llama.cpp:server
          args:
            - "--model"
            - "/models/Qwen3-Coder-30B-A3B-Instruct-IQ4_XS-4.20bpw.gguf"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8080"
            - "--metrics"
            - "--ctx-size"
            - "32768"
            - "--parallel"
            - "1"
            - "--cache-type-k"
            - "q8_0"
            - "--cache-type-v"
            - "q8_0"
          ports:
            - name: http
              containerPort: 8080
          # Keep the pod out of the Service endpoints until the server answers
          # /health (llama.cpp returns 503 there while the model is loading).
          # Readiness only gates traffic; it never restarts the container.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
          volumeMounts:
            - name: models
              mountPath: /models
          resources:
            requests:
              cpu: "8"
              memory: "24Gi"
            limits:
              cpu: "12"
              memory: "24Gi"
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-cpu-models-pvc
---
# In-cluster (ClusterIP) Service exposing the llama-server-cpu pods on 8080.
apiVersion: v1
kind: Service
metadata:
  namespace: llama
  name: llama-server-cpu
spec:
  type: ClusterIP
  selector:
    app: llama-server-cpu
  ports:
    - name: http
      # targetPort refers to the container port named "http".
      targetPort: http
      port: 8080
---
# PodMonitor: scrape /metrics on the "http" port of pods labelled
# app=llama-server-cpu in the llama namespace, every 15 seconds.
# NOTE(review): whether Prometheus picks this object up depends on its
# podMonitorSelector; other PodMonitors in this repo use different labels.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  namespace: llama
  name: llama-server-cpu
  labels:
    app: llama-server-cpu
spec:
  selector:
    matchLabels:
      app: llama-server-cpu
  namespaceSelector:
    matchNames:
      - llama
  podMetricsEndpoints:
    - path: /metrics
      port: http
      interval: 15s

View File

@@ -0,0 +1,62 @@
# Runs one radeon_exporter pod on every node labelled gpu=amd.
# The container is privileged and sees the host's /sys and /dev/dri read-only;
# it publishes metrics on container port 9200 ("metrics").
# NOTE(review): with the `latest` tag and IfNotPresent, a node keeps whichever
# image it pulled first — consider pinning a version.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  namespace: llama
  name: radeon-exporter
  labels:
    app: radeon-exporter
spec:
  selector:
    matchLabels:
      app: radeon-exporter
  template:
    metadata:
      labels:
        app: radeon-exporter
    spec:
      nodeSelector:
        gpu: amd
      volumes:
        - name: sys
          hostPath:
            type: Directory
            path: /sys
        - name: dri
          hostPath:
            type: Directory
            path: /dev/dri
      containers:
        - name: radeon-exporter
          image: kmulvey/radeon_exporter:latest
          imagePullPolicy: IfNotPresent
          securityContext:
            privileged: true
          ports:
            - containerPort: 9200
              name: metrics
          volumeMounts:
            - mountPath: /sys
              name: sys
              readOnly: true
            - mountPath: /dev/dri
              name: dri
              readOnly: true
---
# PodMonitor: scrape /metrics on the "metrics" port of the radeon-exporter
# pods in the llama namespace, every 15 seconds.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  namespace: llama
  name: radeon-exporter
  labels:
    monitoring: primary
spec:
  selector:
    matchLabels:
      app: radeon-exporter
  namespaceSelector:
    matchNames:
      - llama
  podMetricsEndpoints:
    - path: /metrics
      port: metrics
      interval: 15s

View File

@@ -0,0 +1,116 @@
# Database name, user and password for the LiteLLM PostgreSQL instance.
# Read through secretKeyRef by the litellm-postgres and litellm Deployments.
# NOTE(review): the password is committed here in plaintext; consider a
# sealed/external secret mechanism and rotating this value.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  namespace: llama
  name: litellm-postgres
stringData:
  POSTGRES_USER: litellm
  POSTGRES_DB: litellm
  # Quoted so the all-hex value can never be re-typed by tooling.
  POSTGRES_PASSWORD: "7792e47efbc7348155f54a15ed34dc1d06716b2b1848711d0ee90e3461883c0d"
---
# 10Gi ReadWriteOnce claim used as the data volume of litellm-postgres.
# No storageClassName is set, so the cluster's default class applies.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: llama
  name: litellm-postgres
  labels:
    app.kubernetes.io/name: litellm-postgres
spec:
  resources:
    requests:
      storage: 10Gi
  accessModes:
    - ReadWriteOnce
---
# Single-replica PostgreSQL 16 instance backing LiteLLM.
# Credentials and database name come from the litellm-postgres Secret; data is
# stored on the litellm-postgres PersistentVolumeClaim.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: litellm-postgres
  namespace: llama
  labels:
    app.kubernetes.io/name: litellm-postgres
    app.kubernetes.io/component: database
spec:
  replicas: 1
  # Recreate: the data volume is ReadWriteOnce and a PostgreSQL data directory
  # must only ever be opened by one server, so the old pod has to be gone
  # before the new one starts (RollingUpdate would overlap them).
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: litellm-postgres
      app.kubernetes.io/component: database
  template:
    metadata:
      labels:
        app.kubernetes.io/name: litellm-postgres
        app.kubernetes.io/component: database
    spec:
      containers:
        - name: postgres
          image: postgres:16
          imagePullPolicy: IfNotPresent
          ports:
            - name: postgres
              containerPort: 5432
          env:
            - name: POSTGRES_DB
              valueFrom:
                secretKeyRef:
                  name: litellm-postgres
                  key: POSTGRES_DB
            - name: POSTGRES_USER
              valueFrom:
                secretKeyRef:
                  name: litellm-postgres
                  key: POSTGRES_USER
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: litellm-postgres
                  key: POSTGRES_PASSWORD
            # Keep the cluster in a subdirectory of the mount so initdb always
            # starts from an empty directory, whatever the volume root holds.
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          volumeMounts:
            # postgres:16 stores its data under /var/lib/postgresql/data, which
            # the image also declares as a VOLUME. Mounting the PVC one level
            # up (/var/lib/postgresql) left the real data directory on an
            # ephemeral image volume, i.e. outside the PVC.
            # NOTE(review): data written under the previous mount path is not
            # migrated by this change.
            - name: data
              mountPath: /var/lib/postgresql/data
          readinessProbe:
            exec:
              command:
                - sh
                - -c
                - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB"
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            exec:
              command:
                - sh
                - -c
                - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB"
            initialDelaySeconds: 20
            periodSeconds: 20
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 256Mi
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: litellm-postgres
---
# In-cluster (ClusterIP) Service for the LiteLLM PostgreSQL pod on port 5432.
apiVersion: v1
kind: Service
metadata:
  namespace: llama
  name: litellm-postgres
  labels:
    app.kubernetes.io/component: database
    app.kubernetes.io/name: litellm-postgres
spec:
  type: ClusterIP
  selector:
    app.kubernetes.io/component: database
    app.kubernetes.io/name: litellm-postgres
  ports:
    - name: postgres
      # targetPort refers to the container port named "postgres".
      targetPort: postgres
      port: 5432

View File

@@ -0,0 +1,202 @@
# Holds LITELLM_MASTER_KEY, injected into the litellm container as an env var.
# NOTE(review): the key is committed in plaintext and is short; consider a
# sealed/external secret and rotation. LiteLLM's docs also describe master
# keys as starting with "sk-" — confirm this value is accepted as intended.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  namespace: llama
  name: litellm-secret
  labels:
    app.kubernetes.io/component: gateway
    app.kubernetes.io/name: litellm
stringData:
  LITELLM_MASTER_KEY: "6991c7c0f02b4bcf"
---
# LiteLLM proxy configuration, mounted by the litellm Deployment as
# /app/config.yaml. It maps three model aliases to in-cluster llama.cpp
# Services through their /v1 endpoints:
#   fast  -> llama-server-gpu
#   smart -> llama-server-cpu
#   rp    -> llama-server-gpu-rp
# It also enables the prometheus callback and DB-backed model/prompt storage.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: llama
  name: litellm-config
  labels:
    app.kubernetes.io/component: gateway
    app.kubernetes.io/name: litellm
data:
  config.yaml: |
    model_list:
      - model_name: fast
        litellm_params:
          model: openai/fast
          api_base: "http://llama-server-gpu.llama.svc.cluster.local:8080/v1"
          api_key: none
      - model_name: smart
        litellm_params:
          model: openai/smart
          api_base: "http://llama-server-cpu.llama.svc.cluster.local:8080/v1"
          api_key: none
      - model_name: rp
        litellm_params:
          model: openai/rp
          api_base: "http://llama-server-gpu-rp.llama.svc.cluster.local:8080/v1"
          api_key: none
    litellm_settings:
      callbacks:
        - prometheus
    general_settings:
      store_model_in_db: true
      store_prompts_in_spend_logs: true
---
# LiteLLM gateway: one replica listening on container port 4000, started with
# the config file from the litellm-config ConfigMap and a PostgreSQL
# connection string assembled from the litellm-postgres Secret.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: llama
  name: litellm
  labels:
    app.kubernetes.io/name: litellm
    app.kubernetes.io/component: gateway
    app.kubernetes.io/part-of: llama-stack
    monitoring: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: litellm
      app.kubernetes.io/component: gateway
  template:
    metadata:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "4000"
        prometheus.io/path: "/metrics"
      labels:
        app.kubernetes.io/name: litellm
        app.kubernetes.io/component: gateway
        app.kubernetes.io/part-of: llama-stack
        monitoring: prometheus
    spec:
      volumes:
        - name: litellm-config
          configMap:
            name: litellm-config
      containers:
        - name: litellm
          image: ghcr.io/berriai/litellm:v1.82.6.rc.3
          imagePullPolicy: IfNotPresent
          args:
            - "--config"
            - "/app/config.yaml"
          ports:
            - name: http
              protocol: TCP
              containerPort: 4000
          env:
            - name: LITELLM_MASTER_KEY
              valueFrom:
                secretKeyRef:
                  key: LITELLM_MASTER_KEY
                  name: litellm-secret
            # The three POSTGRES_* variables must be declared before
            # DATABASE_URL: $(VAR) references only expand variables that
            # appear earlier in this list.
            - name: POSTGRES_USER
              valueFrom:
                secretKeyRef:
                  key: POSTGRES_USER
                  name: litellm-postgres
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  key: POSTGRES_PASSWORD
                  name: litellm-postgres
            - name: POSTGRES_DB
              valueFrom:
                secretKeyRef:
                  key: POSTGRES_DB
                  name: litellm-postgres
            - name: DATABASE_URL
              value: "postgresql://$(POSTGRES_USER):$(POSTGRES_PASSWORD)@litellm-postgres.llama.svc.cluster.local:5432/$(POSTGRES_DB)"
          volumeMounts:
            # subPath mounts just the one file instead of the whole ConfigMap.
            - name: litellm-config
              subPath: config.yaml
              mountPath: /app/config.yaml
          resources:
            limits:
              cpu: "1000m"
              memory: "2Gi"
            requests:
              cpu: "500m"
              memory: "1Gi"
---
# In-cluster (ClusterIP) Service for the LiteLLM gateway on TCP port 4000.
apiVersion: v1
kind: Service
metadata:
  namespace: llama
  name: litellm
  labels:
    app.kubernetes.io/name: litellm
    app.kubernetes.io/component: gateway
    app.kubernetes.io/part-of: llama-stack
    monitoring: prometheus
spec:
  type: ClusterIP
  selector:
    app.kubernetes.io/name: litellm
    app.kubernetes.io/component: gateway
  ports:
    - name: http
      protocol: TCP
      # targetPort refers to the container port named "http".
      targetPort: http
      port: 4000
---
# HTTPS entry point for the LiteLLM gateway through Traefik's "websecure"
# entrypoint. The cert-manager annotation requests the certificate for the
# host below from the letsencrypt-production ClusterIssuer.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: litellm
  namespace: llama
  labels:
    app.kubernetes.io/name: litellm
    app.kubernetes.io/component: gateway
    app.kubernetes.io/part-of: llama-stack
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-production
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
spec:
  ingressClassName: traefik
  tls:
    - hosts:
        - litellm.mrt0rtikize.ru
      # Secret named after this host's application. The previous name,
      # "web-echo-tls", belonged to a different app and would collide with
      # any other Ingress in this namespace reusing that name.
      secretName: litellm-tls
  rules:
    - host: litellm.mrt0rtikize.ru
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: litellm
                port:
                  number: 4000
---
# PodMonitor: scrape /metrics on the "http" port of the LiteLLM gateway pods
# in the llama namespace, every 30 seconds.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  namespace: llama
  name: litellm
  labels:
    release: kube-prometheus-stack
    app.kubernetes.io/name: litellm
    app.kubernetes.io/component: gateway
    app.kubernetes.io/part-of: llama-stack
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: litellm
      app.kubernetes.io/component: gateway
  namespaceSelector:
    matchNames:
      - llama
  podMetricsEndpoints:
    - path: /metrics
      port: http
      interval: 30s

166
manifests/llama/main.yaml Normal file
View File

@@ -0,0 +1,166 @@
# llama.cpp server (Vulkan image) serving the Devstral-Small 24B GGUF model on
# :8080 with all layers offloaded (--n-gpu-layers 999). An init container
# downloads the model onto the shared GPU models PVC if it is missing.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-server-gpu
  namespace: llama
spec:
  replicas: 1
  # Recreate, consistent with the llama-server-cpu Deployment: the models PVC
  # is requested ReadWriteOnce, and a rolling update would briefly run two
  # servers that each try to offload the whole model to the GPU.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: llama-server-gpu
  template:
    metadata:
      labels:
        app: llama-server-gpu
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      nodeSelector:
        gpu: amd
      initContainers:
        - name: download-model
          image: python:3.11-slim
          env:
            - name: HF_HOME
              value: /models/.hf
            - name: MODEL_REPO
              value: "byteshape/Devstral-Small-2-24B-Instruct-2512-GGUF"
            - name: MODEL_FILE
              value: "Devstral-Small-2-24B-Instruct-2512-IQ4_XS-4.04bpw.gguf"
            # optional, only if you need gated/private models
            # - name: HUGGING_FACE_HUB_TOKEN
            #   valueFrom:
            #     secretKeyRef:
            #       name: hf-token
            #       key: token
          # The deprecated `local_dir_use_symlinks` argument is no longer
          # passed to hf_hub_download: huggingface_hub is installed unpinned,
          # and newer releases ignore or reject that keyword.
          command:
            - /bin/sh
            - -c
            - |
              set -eux
              MODEL_PATH="/models/${MODEL_FILE}"
              if [ -f "${MODEL_PATH}" ]; then
                echo "Model already exists at ${MODEL_PATH}, skipping download"
                exit 0
              fi
              echo "Installing Hugging Face Hub downloader"
              pip install --no-cache-dir huggingface_hub
              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
              python - <<'PY'
              import os
              from huggingface_hub import hf_hub_download
              repo_id = os.environ["MODEL_REPO"]
              filename = os.environ["MODEL_FILE"]
              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
              path = hf_hub_download(
                  repo_id=repo_id,
                  filename=filename,
                  local_dir="/models",
                  token=token,
              )
              print(f"Downloaded to: {path}")
              PY
              ls -lah /models
          volumeMounts:
            - name: models
              mountPath: /models
      containers:
        - name: llama
          image: ghcr.io/ggml-org/llama.cpp:server-vulkan
          args:
            - "--model"
            - "/models/Devstral-Small-2-24B-Instruct-2512-IQ4_XS-4.04bpw.gguf"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8080"
            - "--n-gpu-layers"
            - "999"
            - "--metrics"
            # performance tuning
            - "--ctx-size"
            - "32768"
            - "--parallel"
            - "4"
            # KV cache quantization
            - "--cache-type-k"
            - "q8_0"
            - "--cache-type-v"
            - "q8_0"
          ports:
            - name: http
              containerPort: 8080
          # Keep the pod out of the Service endpoints until the server answers
          # /health (llama.cpp returns 503 there while the model is loading).
          # Readiness only gates traffic; it never restarts the container.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
          # NOTE(review): privileged plus the host /dev/dri mount is presumably
          # how the Vulkan build reaches the GPU; privileged is very broad —
          # confirm whether a narrower setup is possible.
          securityContext:
            privileged: true
          volumeMounts:
            - name: models
              mountPath: /models
            - name: dri
              mountPath: /dev/dri
          resources:
            requests:
              cpu: "2"
              memory: "4Gi"
            limits:
              cpu: "2"
              memory: "4Gi"
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-gpu-models-pvc
        - name: dri
          hostPath:
            path: /dev/dri
            type: Directory
---
# In-cluster (ClusterIP) Service exposing the llama-server-gpu pods on 8080.
apiVersion: v1
kind: Service
metadata:
  namespace: llama
  name: llama-server-gpu
spec:
  type: ClusterIP
  selector:
    app: llama-server-gpu
  ports:
    - name: http
      # targetPort refers to the container port named "http".
      targetPort: http
      port: 8080
---
# PodMonitor: scrape /metrics on the "http" port of pods labelled
# app=llama-server-gpu in the llama namespace, every 15 seconds.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  namespace: llama
  name: llama-server-gpu
  labels:
    app: llama-server-gpu
spec:
  selector:
    matchLabels:
      app: llama-server-gpu
  namespaceSelector:
    matchNames:
      - llama
  podMetricsEndpoints:
    - path: /metrics
      port: http
      interval: 15s

View File

@@ -0,0 +1,42 @@
# Namespace that holds every resource of the llama stack in this directory.
kind: Namespace
apiVersion: v1
metadata:
  name: llama
---
# apiVersion: storage.k8s.io/v1
# kind: StorageClass
# metadata:
# name: longhorn-llama
# provisioner: driver.longhorn.io
# parameters:
# numberOfReplicas: "2"
# staleReplicaTimeout: "30"
# allowVolumeExpansion: true
# reclaimPolicy: Retain
# volumeBindingMode: Immediate
# ---
# 50Gi ReadWriteOnce claim for model files; mounted at /models by both the
# llama-server-gpu and llama-server-gpu-rp Deployments.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: llama
  name: llama-gpu-models-pvc
spec:
  # storageClassName: longhorn-llama
  resources:
    requests:
      storage: 50Gi
  accessModes:
    - ReadWriteOnce
---
# 100Gi ReadWriteOnce claim for model files; mounted at /models by the
# llama-server-cpu Deployment.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: llama
  name: llama-cpu-models-pvc
spec:
  # storageClassName: longhorn-llama
  resources:
    requests:
      storage: 100Gi
  accessModes:
    - ReadWriteOnce

166
manifests/llama/rp.yaml Normal file
View File

@@ -0,0 +1,166 @@
# llama.cpp server (Vulkan image) serving the Omega-Darker 24B GGUF model on
# :8080 with all layers offloaded (--n-gpu-layers 999). An init container
# downloads the model onto the shared GPU models PVC if it is missing.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-server-gpu-rp
  namespace: llama
spec:
  replicas: 1
  # Recreate, consistent with the llama-server-cpu Deployment: the models PVC
  # is requested ReadWriteOnce, and a rolling update would briefly run two
  # servers that each try to offload the whole model to the GPU.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: llama-server-gpu-rp
  template:
    metadata:
      labels:
        app: llama-server-gpu-rp
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      nodeSelector:
        gpu: amd
      initContainers:
        - name: download-model
          image: python:3.11-slim
          env:
            - name: HF_HOME
              value: /models/.hf
            - name: MODEL_REPO
              value: "mradermacher/Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B-GGUF"
            - name: MODEL_FILE
              value: "Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B.Q4_K_S.gguf"
            # optional, only if you need gated/private models
            # - name: HUGGING_FACE_HUB_TOKEN
            #   valueFrom:
            #     secretKeyRef:
            #       name: hf-token
            #       key: token
          # The deprecated `local_dir_use_symlinks` argument is no longer
          # passed to hf_hub_download: huggingface_hub is installed unpinned,
          # and newer releases ignore or reject that keyword.
          command:
            - /bin/sh
            - -c
            - |
              set -eux
              MODEL_PATH="/models/${MODEL_FILE}"
              if [ -f "${MODEL_PATH}" ]; then
                echo "Model already exists at ${MODEL_PATH}, skipping download"
                exit 0
              fi
              echo "Installing Hugging Face Hub downloader"
              pip install --no-cache-dir huggingface_hub
              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
              python - <<'PY'
              import os
              from huggingface_hub import hf_hub_download
              repo_id = os.environ["MODEL_REPO"]
              filename = os.environ["MODEL_FILE"]
              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
              path = hf_hub_download(
                  repo_id=repo_id,
                  filename=filename,
                  local_dir="/models",
                  token=token,
              )
              print(f"Downloaded to: {path}")
              PY
              ls -lah /models
          volumeMounts:
            - name: models
              mountPath: /models
      containers:
        - name: llama
          image: ghcr.io/ggml-org/llama.cpp:server-vulkan
          args:
            - "--model"
            - "/models/Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B.Q4_K_S.gguf"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8080"
            - "--n-gpu-layers"
            - "999"
            - "--metrics"
            # performance tuning
            - "--ctx-size"
            - "32768"
            - "--parallel"
            - "1"
            # KV cache quantization
            - "--cache-type-k"
            - "q8_0"
            - "--cache-type-v"
            - "q8_0"
          ports:
            - name: http
              containerPort: 8080
          # Keep the pod out of the Service endpoints until the server answers
          # /health (llama.cpp returns 503 there while the model is loading).
          # Readiness only gates traffic; it never restarts the container.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
          # NOTE(review): privileged plus the host /dev/dri mount is presumably
          # how the Vulkan build reaches the GPU; privileged is very broad —
          # confirm whether a narrower setup is possible.
          securityContext:
            privileged: true
          volumeMounts:
            - name: models
              mountPath: /models
            - name: dri
              mountPath: /dev/dri
          resources:
            requests:
              cpu: "2"
              memory: "4Gi"
            limits:
              cpu: "2"
              memory: "4Gi"
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-gpu-models-pvc
        - name: dri
          hostPath:
            path: /dev/dri
            type: Directory
---
# In-cluster (ClusterIP) Service exposing the llama-server-gpu-rp pods on 8080.
apiVersion: v1
kind: Service
metadata:
  namespace: llama
  name: llama-server-gpu-rp
spec:
  type: ClusterIP
  selector:
    app: llama-server-gpu-rp
  ports:
    - name: http
      # targetPort refers to the container port named "http".
      targetPort: http
      port: 8080
---
# PodMonitor: scrape /metrics on the "http" port of pods labelled
# app=llama-server-gpu-rp in the llama namespace, every 15 seconds.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  namespace: llama
  name: llama-server-gpu-rp
  labels:
    app: llama-server-gpu-rp
spec:
  selector:
    matchLabels:
      app: llama-server-gpu-rp
  namespaceSelector:
    matchNames:
      - llama
  podMetricsEndpoints:
    - path: /metrics
      port: http
      interval: 15s