# File: k3s-manifests/llama/rp.yaml (YAML; listed by the repository viewer as 167 lines, 4.1 KiB)

---
# Deployment: llama.cpp HTTP server (Vulkan image) on nodes labelled gpu=amd.
# An init container downloads the GGUF model into the shared "models" PVC when
# it is not already present; the server container then loads it from /models.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-server-gpu-rp
  namespace: llama
spec:
  replicas: 1
  # Replace instead of surge: the pod does not request a GPU resource, so the
  # scheduler may place a surge pod next to the old one and both would try to
  # load the model onto the same device. Together with the readiness probe
  # below, a RollingUpdate could then wait indefinitely on the new pod.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: llama-server-gpu-rp
  template:
    metadata:
      labels:
        app: llama-server-gpu-rp
      annotations:
        # Annotation values must be strings, hence the quotes.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      nodeSelector:
        gpu: amd
      initContainers:
        # Fetches MODEL_FILE from MODEL_REPO into /models unless it already
        # exists there, so restarts do not re-download the model.
        - name: download-model
          image: python:3.11-slim
          env:
            - name: HF_HOME
              value: /models/.hf
            - name: MODEL_REPO
              value: "mradermacher/Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B-GGUF"
            - name: MODEL_FILE
              value: "Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B.Q4_K_S.gguf"
            # Optional: only needed for gated/private models.
            # - name: HUGGING_FACE_HUB_TOKEN
            #   valueFrom:
            #     secretKeyRef:
            #       name: hf-token
            #       key: token
          # huggingface_hub is installed unpinned, so the script only passes
          # arguments that current releases accept: the deprecated (and
          # ignored) local_dir_use_symlinks argument is no longer passed.
          command:
            - /bin/sh
            - -c
            - |
              set -eux
              MODEL_PATH="/models/${MODEL_FILE}"
              if [ -f "${MODEL_PATH}" ]; then
                echo "Model already exists at ${MODEL_PATH}, skipping download"
                exit 0
              fi
              echo "Installing Hugging Face Hub downloader"
              pip install --no-cache-dir huggingface_hub
              echo "Downloading ${MODEL_REPO}/${MODEL_FILE}"
              python - <<'PY'
              import os
              from huggingface_hub import hf_hub_download
              repo_id = os.environ["MODEL_REPO"]
              filename = os.environ["MODEL_FILE"]
              token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
              path = hf_hub_download(
                  repo_id=repo_id,
                  filename=filename,
                  local_dir="/models",
                  token=token,
              )
              print(f"Downloaded to: {path}")
              PY
              ls -lah /models
          volumeMounts:
            - name: models
              mountPath: /models
      containers:
        - name: llama
          image: ghcr.io/ggml-org/llama.cpp:server-vulkan
          args:
            # Must name the same file as MODEL_FILE in the init container.
            - "--model"
            - "/models/Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B.Q4_K_S.gguf"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8080"
            # Offload as many layers as possible to the GPU.
            - "--n-gpu-layers"
            - "999"
            # Exposes /metrics, which the annotations above point at.
            - "--metrics"
            # performance tuning
            - "--ctx-size"
            - "32768"
            - "--parallel"
            - "1"
            # KV cache quantization
            - "--cache-type-k"
            - "q8_0"
            - "--cache-type-v"
            - "q8_0"
          ports:
            - name: http
              containerPort: 8080
          # Keep the pod out of the Service endpoints until the server answers
          # /health with success, i.e. until the model has finished loading.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            failureThreshold: 3
          # NOTE(review): presumably privileged so the container can open the
          # host's /dev/dri device nodes; confirm whether a narrower setup
          # (e.g. a GPU device plugin) is possible.
          securityContext:
            privileged: true
          volumeMounts:
            - name: models
              mountPath: /models
            - name: dri
              mountPath: /dev/dri
          resources:
            requests:
              cpu: "2"
              memory: "4Gi"
            limits:
              cpu: "2"
              memory: "4Gi"
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-gpu-models-pvc
        # Host DRM device directory, mounted into the server container.
        - name: dri
          hostPath:
            path: /dev/dri
            type: Directory
---
# ClusterIP Service exposing port 8080 of the pods labelled
# app=llama-server-gpu-rp inside the llama namespace.
apiVersion: v1
kind: Service
metadata:
  name: llama-server-gpu-rp
  namespace: llama
spec:
  type: ClusterIP
  selector:
    app: llama-server-gpu-rp
  ports:
    - name: http
      port: 8080
      # Named target: resolves to the container port called "http".
      targetPort: http
---
# PodMonitor: scrape /metrics on the "http" port of pods labelled
# app=llama-server-gpu-rp in the llama namespace, every 15 seconds.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: llama-server-gpu-rp
  namespace: llama
  labels:
    app: llama-server-gpu-rp
spec:
  selector:
    matchLabels:
      app: llama-server-gpu-rp
  namespaceSelector:
    matchNames:
      - llama
  podMetricsEndpoints:
    - port: http
      path: /metrics
      interval: 15s