
Custom LLM Deployment: Self-Hosting, Scaling, Production
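Self-hosting starts with a container image. The Dockerfile below builds on NVIDIA's CUDA runtime base and assumes the weights live in a local `model/` directory, with `server.py` (the FastAPI app shown further down) as the entrypoint: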

```dockerfile
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
# The CUDA runtime base image ships no Python, so install it first.
RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip \
    && rm -rf /var/lib/apt/lists/*
RUN pip3 install torch transformers accelerate
COPY model/ /app/model/
COPY server.py /app/
EXPOSE 8000
CMD ["python3", "/app/server.py"]
```
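Build and run it locally with `docker build -t llm-service .` and `docker run --gpus all -p 8000:8000 llm-service`; GPU passthrough requires the NVIDIA Container Toolkit on the host.

For production, a Kubernetes Deployment runs several replicas of that image, reserving a GPU and memory headroom for each pod: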
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: llm
  template:
    metadata:
      labels:
        app: llm
    spec:
      containers:
        - name: llm
          image: llm-service:latest
          resources:
            limits:
              nvidia.com/gpu: 1
              memory: "16Gi"
            requests:
              memory: "8Gi"
          ports:
            - containerPort: 8000
```
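Apply it with `kubectl apply -f deployment.yaml`; scheduling pods onto GPUs also requires the NVIDIA device plugin to be installed in the cluster. A Deployment alone gives the pods no stable address, so you would normally pair it with a Service. A minimal sketch (the name and port choices here are assumptions, not part of the manifest above):

```yaml
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: llm            # matches the Deployment's pod labels
  ports:
    - port: 80          # cluster-facing port
      targetPort: 8000  # the container port exposed above
```

Inside each pod, `server.py` can spread concurrent requests across several in-process model replicas. A lock-based pool keeps each replica serving one request at a time: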
```python
from fastapi import FastAPI
import asyncio

app = FastAPI()

class ModelPool:
    """A fixed set of model replicas, each guarded by an asyncio.Lock."""

    def __init__(self, num_replicas=4):
        # load_model() is assumed to be defined elsewhere in server.py.
        self.models = [load_model() for _ in range(num_replicas)]
        self.locks = [asyncio.Lock() for _ in range(num_replicas)]

    async def get_available_model(self):
        while True:
            for i, lock in enumerate(self.locks):
                if lock.locked():
                    continue
                await lock.acquire()
                return self.models[i], lock
            # Every replica is busy: yield to the event loop, then retry.
            await asyncio.sleep(0.01)

pool = ModelPool(num_replicas=4)

@app.post("/generate")
async def generate(prompt: str):
    model, lock = await pool.get_available_model()
    try:
        # Run the blocking generate() call in a worker thread so the
        # event loop keeps accepting other requests.
        result = await asyncio.to_thread(model.generate, prompt)
        return {"result": result}
    finally:
        lock.release()
```
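Serve it with `uvicorn server:app --host 0.0.0.0 --port 8000`, or keep the `python3 /app/server.py` entrypoint and call `uvicorn.run(app, ...)` inside the script. Because `prompt` is declared as a bare `str`, FastAPI reads it from the query string, so a client call looks like this (a hypothetical example):

```python
import requests

resp = requests.post("http://localhost:8000/generate", params={"prompt": "Hello"})
print(resp.json()["result"])
```

Recomputing identical prompts is wasted GPU time, so the next layer is a Redis-backed cache keyed on the full request: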
```python
import hashlib
import json

class SmartCache:
    """Redis-backed response cache keyed on (prompt, model, params)."""

    def __init__(self, redis_client):
        self.redis = redis_client

    def cache_key(self, prompt, model, params):
        # Serialize params with sorted keys so equivalent dicts
        # always produce the same hash.
        key_str = f"{prompt}:{model}:{json.dumps(params, sort_keys=True)}"
        return hashlib.sha256(key_str.encode()).hexdigest()

    def get(self, prompt, model, params):
        key = self.cache_key(prompt, model, params)
        return self.redis.get(key)

    def set(self, prompt, model, params, result, ttl=3600):
        # setex writes the value and its TTL (seconds) in one call.
        key = self.cache_key(prompt, model, params)
        self.redis.setex(key, ttl, result)
```
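Wiring the cache in front of the pool might look like this (a sketch; the Redis connection details and the fall-through to the `generate` endpoint above are assumptions):

```python
import redis

cache = SmartCache(redis.Redis(host="localhost", port=6379))

async def cached_generate(prompt, model_name="llm", params=None):
    params = params or {}
    hit = cache.get(prompt, model_name, params)
    if hit is not None:
        return hit.decode()            # redis-py returns bytes
    result = await generate(prompt)    # miss: run the model pool path
    cache.set(prompt, model_name, params, result["result"])
    return result["result"]
```

The last piece is observability. Prometheus counters, histograms, and gauges capture request volume, latency, and concurrency: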
```python
from prometheus_client import Counter, Histogram, Gauge

request_count = Counter('llm_requests_total', 'Total requests')
request_duration = Histogram('llm_request_duration_seconds', 'Request duration')
active_requests = Gauge('llm_active_requests', 'Active requests')
model_memory = Gauge('llm_model_memory_bytes', 'Model memory usage')

# Histogram.time() observes the call's duration; recent prometheus_client
# releases also accept coroutine functions here.
@request_duration.time()
async def generate_with_monitoring(prompt):
    request_count.inc()
    active_requests.inc()
    try:
        return await generate(prompt)
    finally:
        active_requests.dec()
```
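To let Prometheus scrape these, expose them over HTTP; one option with FastAPI is mounting `prometheus_client`'s ASGI app on the existing server (`/metrics` is the conventional path):

```python
from prometheus_client import make_asgi_app

# Serve the default metrics registry at /metrics on the same FastAPI app.
app.mount("/metrics", make_asgi_app())
```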
