Initial commit: k3s GitOps manifests with ArgoCD App-of-Apps
This commit is contained in:
166
llama/rp.yaml
Normal file
166
llama/rp.yaml
Normal file
@@ -0,0 +1,166 @@
|
||||
# Deployment running a single llama.cpp server (Vulkan image) on a node
# labelled `gpu: amd`. An init container fetches the GGUF model from the
# Hugging Face Hub into the shared `models` volume before the server starts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-server-gpu-rp
  namespace: llama
spec:
  replicas: 1
  # Recreate instead of the default RollingUpdate: a rolling update would start
  # a second pod that loads the full model onto the GPU while the old pod is
  # still running. NOTE(review): assumes one GPU / one writer for the models
  # PVC -- confirm against the node and PVC definitions.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: llama-server-gpu-rp
  template:
    metadata:
      labels:
        app: llama-server-gpu-rp
      # NOTE(review): a PodMonitor in this file also targets these pods; if an
      # annotation-based scrape job exists as well, metrics may be scraped
      # twice -- confirm which mechanism is in use.
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      nodeSelector:
        gpu: amd

      initContainers:
        # Downloads MODEL_FILE from MODEL_REPO into /models unless the file is
        # already present on the volume.
        - name: download-model
          image: python:3.11-slim
          env:
            - name: HF_HOME
              value: /models/.hf
            - name: MODEL_REPO
              value: "mradermacher/Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B-GGUF"
            - name: MODEL_FILE
              value: "Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B.Q4_K_S.gguf"
            # optional, only if you need gated/private models
            # - name: HUGGING_FACE_HUB_TOKEN
            #   valueFrom:
            #     secretKeyRef:
            #       name: hf-token
            #       key: token
          # The deprecated `local_dir_use_symlinks` argument is intentionally
          # not passed to hf_hub_download: huggingface_hub is installed
          # unpinned, and recent releases ignore the argument (files are always
          # materialised in local_dir), so omitting it avoids breakage if a
          # newer release drops the parameter.
          command:
            - /bin/sh
            - -c
            - |
              set -eux

              MODEL_PATH="/models/${MODEL_FILE}"

              if [ -f "${MODEL_PATH}" ]; then
                echo "Model already exists at ${MODEL_PATH}, skipping download"
                exit 0
              fi

              echo "Installing Hugging Face Hub downloader"
              pip install --no-cache-dir huggingface_hub

              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
              python - <<'PY'
              import os
              from huggingface_hub import hf_hub_download

              repo_id = os.environ["MODEL_REPO"]
              filename = os.environ["MODEL_FILE"]

              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

              path = hf_hub_download(
                  repo_id=repo_id,
                  filename=filename,
                  local_dir="/models",
                  token=token,
              )
              print(f"Downloaded to: {path}")
              PY

              ls -lah /models
          volumeMounts:
            - name: models
              mountPath: /models

      containers:
        - name: llama
          image: ghcr.io/ggml-org/llama.cpp:server-vulkan
          args:
            # Must match the MODEL_FILE downloaded by the init container.
            - "--model"
            - "/models/Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B.Q4_K_S.gguf"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8080"
            - "--n-gpu-layers"
            - "999"
            - "--metrics"

            # performance tuning
            - "--ctx-size"
            - "32768"
            - "--parallel"
            - "1"

            # KV cache quantization
            - "--cache-type-k"
            - "q8_0"
            - "--cache-type-v"
            - "q8_0"
          ports:
            - name: http
              containerPort: 8080

          # Keep the pod out of the Service endpoints until the server reports
          # healthy, so requests are not routed while the model is loading.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10

          # NOTE(review): privileged is presumably used to reach the host GPU
          # nodes under /dev/dri; consider a device plugin or narrower
          # permissions -- confirm what the node setup requires.
          securityContext:
            privileged: true

          volumeMounts:
            - name: models
              mountPath: /models
            - name: dri
              mountPath: /dev/dri

          # NOTE(review): 4Gi of host memory for a 24B Q4_K_S model relies on
          # the weights living on the GPU -- confirm the pod is not OOM-killed
          # during model load.
          resources:
            requests:
              cpu: "2"
              memory: "4Gi"
            limits:
              cpu: "2"
              memory: "4Gi"

      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-gpu-models-pvc
        # Host DRM device directory, mounted into the llama container.
        - name: dri
          hostPath:
            path: /dev/dri
            type: Directory
|
||||
---
|
||||
# Cluster-internal Service in front of the llama-server-gpu-rp pods.
apiVersion: v1
kind: Service
metadata:
  namespace: llama
  name: llama-server-gpu-rp
spec:
  type: ClusterIP
  # Selects pods by the `app` label set on the Deployment's pod template.
  selector:
    app: llama-server-gpu-rp
  ports:
    - name: http
      port: 8080
      # Resolved by name against the container port called "http".
      targetPort: http
|
||||
---
|
||||
# PodMonitor that scrapes /metrics on the "http" port of the
# llama-server-gpu-rp pods in the llama namespace every 15 seconds.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  namespace: llama
  name: llama-server-gpu-rp
  labels:
    app: llama-server-gpu-rp
spec:
  # Pods are matched by label, restricted to the namespaces listed below.
  selector:
    matchLabels:
      app: llama-server-gpu-rp
  namespaceSelector:
    matchNames:
      - llama
  podMetricsEndpoints:
    # `port` refers to the named container port, not a number.
    - path: /metrics
      port: http
      interval: 15s
|
||||
Reference in New Issue
Block a user