# Custom LLM Deployment: Self-Hosting, Scaling, Production
## Infrastructure Setup
### Docker Container
```dockerfile
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04

# The CUDA runtime image ships without Python, so install it before the pip packages
RUN apt-get update && apt-get install -y python3 python3-pip && rm -rf /var/lib/apt/lists/*
RUN pip3 install torch transformers accelerate

COPY model/ /app/model/
COPY server.py /app/

EXPOSE 8000
CMD ["python3", "/app/server.py"]
```
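As a quick local smoke test, the image can be built and run directly; the tag matches the `llm-service:latest` image referenced in the Kubernetes manifest below, and `--gpus all` assumes the NVIDIA Container Toolkit is installed on the host:

```bash
docker build -t llm-service .
docker run --rm --gpus all -p 8000:8000 llm-service
```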
### Kubernetes Deployment

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: llm
  template:
    metadata:
      labels:
        app: llm
    spec:
      containers:
        - name: llm
          image: llm-service:latest
          resources:
            limits:
              nvidia.com/gpu: 1
              memory: "16Gi"
            requests:
              memory: "8Gi"
          ports:
            - containerPort: 8000
```
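To give the replicas a single stable, load-balanced address inside the cluster, a Deployment like this is normally paired with a Service. A minimal sketch, reusing the labels and container port from the manifest above (the Service name and port 80 are arbitrary choices, not part of the original setup):

```yaml
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: llm
  ports:
    - port: 80
      targetPort: 8000
```

Kubernetes then spreads incoming connections across the matching pods, independently of the in-process pooling shown in the next section.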
## Load Balancing

```python
from fastapi import FastAPI
import asyncio

app = FastAPI()

class ModelPool:
    def __init__(self, num_replicas=4):
        # load_model() is assumed to be defined elsewhere (see the sketch below)
        self.models = [load_model() for _ in range(num_replicas)]
        self.locks = [asyncio.Lock() for _ in range(num_replicas)]

    async def get_available_model(self):
        # Poll for a free replica; the caller is responsible for releasing the lock
        while True:
            for i, lock in enumerate(self.locks):
                if lock.locked():
                    continue
                await lock.acquire()
                return self.models[i], lock
            await asyncio.sleep(0.01)

pool = ModelPool(num_replicas=4)

@app.post("/generate")
async def generate(prompt: str):
    model, lock = await pool.get_available_model()
    try:
        # Run the blocking generate call off the event loop
        result = await asyncio.to_thread(model.generate, prompt)
        return {"result": result}
    finally:
        lock.release()
```
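The pool assumes a `load_model()` helper that returns an object with a blocking `generate(prompt)` method; it is not defined in the snippet. A minimal sketch backed by a Hugging Face `pipeline` (the model name and `max_new_tokens` value are placeholders, not part of the original setup):

```python
from transformers import pipeline

def load_model(model_name: str = "gpt2"):  # placeholder model; swap in your own
    """Return a wrapper exposing a blocking generate(prompt) -> str method."""
    pipe = pipeline("text-generation", model=model_name, device_map="auto")

    class _Generator:
        def generate(self, prompt: str) -> str:
            # text-generation pipelines return a list of {"generated_text": ...} dicts
            return pipe(prompt, max_new_tokens=256)[0]["generated_text"]

    return _Generator()
```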
## Caching Strategy

```python
import hashlib

class SmartCache:
    def __init__(self, redis_client):
        self.redis = redis_client

    def cache_key(self, prompt, model, params):
        key_str = f"{prompt}:{model}:{params}"
        return hashlib.sha256(key_str.encode()).hexdigest()

    def get(self, prompt, model, params):
        key = self.cache_key(prompt, model, params)
        return self.redis.get(key)

    def set(self, prompt, model, params, result, ttl=3600):
        key = self.cache_key(prompt, model, params)
        self.redis.setex(key, ttl, result)
```
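Wired into the endpoint from the load-balancing section, the cache can short-circuit repeated prompts. A sketch that replaces the earlier `/generate` handler rather than adding a second route; the local Redis connection, the `temperature` parameter, and the `"llm-v1"` model label are assumptions for illustration:

```python
import json
import redis

cache = SmartCache(redis.Redis(host="localhost", port=6379))

@app.post("/generate")
async def generate(prompt: str, temperature: float = 0.7):
    # Serialize params with sort_keys so equivalent requests hash to the same key
    params = json.dumps({"temperature": temperature}, sort_keys=True)

    cached = cache.get(prompt, "llm-v1", params)
    if cached is not None:
        return {"result": cached.decode(), "cached": True}

    model, lock = await pool.get_available_model()
    try:
        result = await asyncio.to_thread(model.generate, prompt)
    finally:
        lock.release()

    cache.set(prompt, "llm-v1", params, result)
    return {"result": result, "cached": False}
```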
## Monitoring

```python
from prometheus_client import Counter, Histogram, Gauge

request_count = Counter('llm_requests_total', 'Total requests')
request_duration = Histogram('llm_request_duration_seconds', 'Request duration')
active_requests = Gauge('llm_active_requests', 'Active requests')
model_memory = Gauge('llm_model_memory_bytes', 'Model memory usage')

# Histogram.time() can decorate the handler to record its duration;
# decorating an async function requires a recent prometheus_client release
@request_duration.time()
async def generate_with_monitoring(prompt):
    request_count.inc()
    active_requests.inc()
    try:
        result = await generate(prompt)
        return result
    finally:
        active_requests.dec()
```
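Prometheus still needs an endpoint to scrape. With FastAPI, one option is mounting the ASGI app that `prometheus_client` ships with; the `/metrics` path and the `model_memory` update below are illustrative choices, not part of the original snippet:

```python
import torch
from prometheus_client import make_asgi_app

# Expose every registered metric at /metrics for Prometheus to scrape
app.mount("/metrics", make_asgi_app())

# model_memory is declared above but never updated; set it once the models are loaded
if torch.cuda.is_available():
    model_memory.set(torch.cuda.memory_allocated())
```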