# 围绕 GPU 共享与多租户隔离方案，实现云原生多模型负载均衡与应急容灾的推理冷备架构设计

## 一、多模型推理的负载均衡与容灾困境

### 1.1 多模型部署的挑战

云原生 AI 平台通常需要同时部署数十个不同规格的模型（7B、13B、70B 等），每个模型的 GPU 需求、延迟要求、吞吐量特征各不相同。多模型负载均衡与容灾的核心矛盾在于：

    传统模式：每个模型独立部署

    模型A (7B)  ─── GPU-0 ─── 10 QPS ─── LB
    模型B (13B) ─── GPU-1 ─── 5 QPS  ─── LB
    模型C (70B) ─── GPU-2 ─── 1 QPS  ─── LB
    模型D (7B)  ─── GPU-3 ─── 8 QPS  ─── LB
            ↓
    GPU 利用率：35%, 45%, 30%, 40% ← 严重不均衡
    容灾能力：单点故障，模型C 挂了直接不可用

| 挑战 | 描述 | 影响 |
| --- | --- | --- |
| 显存碎片化 | 各模型独占 GPU，空闲时无法复用 | 利用率 <50% |
| 负载不均衡 | 大模型请求少但占 GPU，小模型请求多但缺 GPU | 吞吐瓶颈 |
| 容灾缺失 | 模型实例故障后需人工干预 | MTTR 30min |
| 冷备切换慢 | 冷备实例从镜像拉取到模型加载需数分钟 | SLA 违约 |

### 1.2 理想架构设计

目标架构：GPU 共享池 + 多模型负载均衡 + 冷热备容灾

                ┌──────────────────┐
                │   Global Load    │
                │     Balancer     │
                └────────┬─────────┘
                         │
         ┌───────────────┼───────────────┐
         │               │               │
    ┌────▼────┐     ┌────▼────┐     ┌────▼────┐
    │  AZ-1   │     │  AZ-2   │     │  AZ-3   │
    │ ┌─────┐ │     │ ┌─────┐ │     │ ┌─────┐ │
    │ │GPU池│ │     │ │GPU池│ │     │ │GPU池│ │
    │ │共享 │ │     │ │共享 │ │     │ │共享 │ │
    │ └─────┘ │     │ └─────┘ │     │ └─────┘ │
    │ ┌─────┐ │     │ ┌─────┐ │     │ ┌─────┐ │
    │ │热备池│ │     │ │热备池│ │     │ │热备池│ │
    │ │冷备池│ │     │ │冷备池│ │     │ │冷备池│ │
    │ └─────┘ │     │ └─────┘ │     │ └─────┘ │
    └─────────┘     └─────────┘     └─────────┘

## 二、GPU 共享池架构

### 2.1 基于 Volcano 的 GPU 共享池

    apiVersion: scheduling.volcano.sh/v1beta1
    kind: Queue
    metadata:
      name: inference-queue
    spec:
      weight: 5
      capability:
        nvidia.com/gpu: 16
        cpu: 160
        memory: 2Ti
      reclaimable: true
      overcommitRatio:
        nvidia.com/gpu: 1.5
    ---
    apiVersion: scheduling.volcano.sh/v1beta1
    kind: PodGroup
    metadata:
      name: model-group-a
    spec:
      minMember: 2
      queue: inference-queue
      priorityClassName: high-priority
    ---
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: model-router
      namespace: inference-system
    spec:
      replicas: 2
      selector:
        matchLabels:
          app: model-router
      template:
        metadata:
          labels:
            app: model-router
        spec:
          schedulerName: volcano
          containers:
            - name: router
              image: model-router:v1.0.0
              args:
                - --gpu-pool-size=16
                - --overcommit-ratio=1.5
                - --models=llama-7b,mistral-7b,gpt-4-8b
              ports:
                - containerPort: 8080
              env:
                - name: GPU_MEMORY_STRATEGY
                  value: shared
                - name: GPU_POOL_NODES
                  value: gpu-node-0,gpu-node-1,gpu-node-2,gpu-node-3

### 2.2 共享池资源调度器

    // gpu_pool_scheduler.go
    package gpu_pool

    import
    (
    	"sync"
    	"time"
    )

    type GPUPool struct {
    	mu              sync.RWMutex
    	nodes           map[string]*GPUNode
    	totalMemory     int64
    	allocatedMemory int64
    	overcommitRatio float64
    }

    type GPUNode struct {
    	Name            string
    	GPUs            []*GPUDevice
    	TotalMemory     int64
    	AllocatedMemory int64
    }

    type GPUDevice struct {
    	Index          int
    	TotalMemory    int64
    	UsedMemory     int64
    	ReservedMemory int64
    	ActiveModels   []string
    	LastUsed       time.Time
    }

    func (p *GPUPool) ScheduleModel(modelName string, memoryRequired int64) (*GPUDevice, error) {
    	p.mu.Lock()
    	defer p.mu.Unlock()

    	// 检查全局容量
    	availableMem := int64(float64(p.totalMemory)*p.overcommitRatio) - p.allocatedMemory
    	if memoryRequired > availableMem {
    		return nil, ErrInsufficientMemory
    	}

    	// 选择最优 GPU（显存最充裕 + 已有同模型缓存的优先）
    	bestGPU := p.selectOptimalGPU(memoryRequired, modelName)
    	if bestGPU == nil {
    		return nil, ErrNoSuitableGPU
    	}

    	// 分配显存
    	bestGPU.ReservedMemory += memoryRequired
    	bestGPU.ActiveModels = append(bestGPU.ActiveModels, modelName)
    	p.allocatedMemory += memoryRequired

    	return bestGPU, nil
    }

    func (p *GPUPool) selectOptimalGPU(memoryRequired int64, modelName string) *GPUDevice {
    	var best *GPUDevice
    	bestScore := -1.0

    	for _, node := range p.nodes {
    		for _, gpu := range node.GPUs {
    			available := gpu.TotalMemory - gpu.UsedMemory - gpu.ReservedMemory
    			if available < memoryRequired {
    				continue
    			}

    			// 评分：已有模型缓存 +50 分，空闲率 +50 分
    			score := 0.0
    			for _, m := range gpu.ActiveModels {
    				if m == modelName {
    					score += 50 // 模型已缓存，优先
    				}
    			}
    			score += float64(available) / float64(gpu.TotalMemory) * 50

    			if score > bestScore {
    				bestScore = score
    				best = gpu
    			}
    		}
    	}

    	return best
    }

## 三、多模型负载均衡

### 3.1 模型感知的负载均衡器

    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: model-lb-config
      namespace: inference-system
    data:
      nginx.conf: |
        upstream model_backend {
            # 模型A: 7B 参数，权重 10
            server model-a-instance-1.inference-system:8080 weight=10;
            server model-a-instance-2.inference-system:8080 weight=10;

            # 模型B: 13B 参数，权重 5
            server model-b-instance-1.inference-system:8080 weight=5;

            # 模型C: 70B 参数，权重 1
            server model-c-instance-1.inference-system:8080 weight=1;

            # 热备实例（低权重，仅在主实例故障时接管）
            server model-a-standby.inference-system:8080 weight=1
                backup;
            server model-b-standby.inference-system:8080 weight=1 backup;

            keepalive 32;
        }

        # 请求路由：根据模型名称分发
        server {
            listen 8080;

            location ~ ^/v1/models/(?<model_name>[^/]+)/predict {
                # 基于模型名称的哈希路由，保证同模型请求到同实例
                hash $model_name;
                proxy_pass http://model_backend;
            }

            location /healthz {
                # 主动健康检查
                health_check uri=/healthz interval=5s fails=3 passes=2;
                return 200;
            }
        }
    ---
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: model-load-balancer
      namespace: inference-system
    spec:
      replicas: 2
      selector:
        matchLabels:
          app: model-lb
      template:
        metadata:
          labels:
            app: model-lb
        spec:
          containers:
            - name: nginx
              image: nginx:1.25-alpine
              volumeMounts:
                - name: config
                  mountPath: /etc/nginx/conf.d
              ports:
                - containerPort: 8080
                  name: http
                - containerPort: 8081
                  name: health
              resources:
                requests:
                  cpu: 1000m
                  memory: 512Mi
                limits:
                  cpu: 2000m
                  memory: 1Gi
          volumes:
            - name: config
              configMap:
                name: model-lb-config

### 3.2 基于 Envoy 的流量调度

    apiVersion: networking.istio.io/v1beta1
    kind: VirtualService
    metadata:
      name: model-routing
      namespace: inference-system
    spec:
      hosts:
        - inference.example.com
      gateways:
        - inference-gateway
      http:
        # 模型A 路由（7B）
        - match:
            - headers:
                model-name:
                  exact: llama-2-7b
          route:
            - destination:
                host: model-llama-7b
                port:
                  number: 8080
              weight: 90
            - destination:
                host: model-llama-7b-standby
                port:
                  number: 8080
              weight: 10
          mirror:
            host: model-llama-7b-shadow
            port:
              number: 8080
          mirrorPercent: 10
          retries:
            attempts: 3
            perTryTimeout: 2s
            retryOn: gateway-error,connect-failure,refused-stream
          timeout: 30s

        # 模型B 路由（13B）
        - match:
            - headers:
                model-name:
                  exact: mistral-7b
          route:
            - destination:
                host: model-mistral-7b
                port:
                  number: 8080
              weight: 100
          fault:
            abort:
              percent: 0
              httpStatus: 503
          retries:
            attempts: 2
            perTryTimeout: 5s
    ---
    apiVersion: networking.istio.io/v1beta1
    kind: DestinationRule
    metadata:
      name: model-circuit-breaker
      namespace: inference-system
    spec:
      host: "*.inference-system.svc.cluster.local"
      trafficPolicy:
        connectionPool:
          tcp:
            maxConnections: 100
          http:
            http1MaxPendingRequests: 50
            http2MaxRequests: 200
            maxRequestsPerConnection: 50
        outlierDetection:
          consecutive5xxErrors: 5
          interval: 30s
          baseEjectionTime: 60s
          maxEjectionPercent: 50
        loadBalancer:
          consistentHash:
            httpHeaderName: x-request-id

## 四、应急容灾的冷备架构

### 4.1 冷备/热备/温备分级

| 级别 | 恢复时间 | 资源消耗 | 模型状态 | 适用场景 |
| --- | --- | --- | --- | --- |
| 热备 | 1s | 100% GPU | 已加载到显存 | 核心模型，SLA 10ms |
| 温备 | 5-30s | 50% GPU + 主机内存 | 权重在共享内存 | 重要模型，SLA 100ms |
| 冷备 | 60-300s | 0% GPU + 持久化存储 | 仅保存 checkpoint | 非关键模型，SLA 1s |

    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: disaster-recovery-config
      namespace: inference-system
    data:
      dr-policy.yaml: |
        models:
          - name: llama-2-7b
            priority: critical
            hotStandby: 2  # 2 个热备实例
            warmStandby: 0
            coldStandby: 1
            rto: 10s
            rpo: 0

          - name: mistral-7b
            priority: high
            hotStandby: 1
            warmStandby: 1
            coldStandby: 1
            rto: 30s
            rpo: 1m

          - name: gpt-4-8b
            priority: normal
            hotStandby: 0
            warmStandby: 1
            coldStandby: 2
            rto: 5m
            rpo: 5m

        az_failover:
          strategy: active-passive  # 主备模式
          activeZones: [az-1, az-2]
          standbyZone: az-3
          healthCheckInterval: 10s
          failoverThreshold: 3  # 连续 3 次健康检查失败触发切换

### 4.2 冷备实例自动管理系统

    # cold_standby_manager.py
    import kopf
    import kubernetes
    import asyncio
    import json


    class ColdStandbyManager:
        """冷备实例管理器"""

        def __init__(self):
            self.api = kubernetes.client.AppsV1Api()
            self.core_api = kubernetes.client.CoreV1Api()
            self.standby_pool = {}

        async def ensure_standby(self, model_name: str, count: int):
            """确保冷备实例数量"""
            current_count = len(self.standby_pool.get(model_name, []))

            if current_count < count:
                # 需要创建冷备实例
                for i in range(count - current_count):
                    await self.create_standby_instance(model_name)
            elif current_count > count:
                # 需要缩减冷备实例
                for _ in range(current_count - count):
                    await self.delete_standby_instance(model_name)

        async def create_standby_instance(self, model_name: str):
            """创建冷备实例（仅分配资源，不加载模型）"""
            deploy_name = f"{model_name}-standby-{len(self.standby_pool.get(model_name, []))}"

            # 创建 Deployment（使用共享内存缓存模型权重）
            deployment = {
                "apiVersion": "apps/v1",
                "kind": "Deployment",
                "metadata": {
                    "name": deploy_name,
                    "labels": {
                        "app": model_name,
                        "standby-type": "cold"
                    }
                },
                "spec": {
                    "replicas": 1,
                    "selector": {"matchLabels": {"app": deploy_name}},
                    "template": {
                        "metadata": {"labels": {"app": deploy_name}},
                        "spec": {
                            "containers": [{
                                "name": "standby",
                                "image": "standby-agent:v1.0.0",
                                "env": [
                                    {"name": "MODEL_NAME", "value": model_name},
                                    {"name": "STANDBY_MODE", "value": "cold"},
                                    {"name": "WARMUP_ENABLED", "value": "false"}
                                ],
                                "resources": {
                                    "requests": {
                                        "memory": "8Gi",
                                        "cpu": "500m"
                                    },
                                    "limits": {
                                        "memory": "16Gi",
                                        "cpu": "1000m"
                                    }
                                }
                            }]
                        }
                    }
                }
            }

            # 记录到池中
            if model_name not in self.standby_pool:
                self.standby_pool[model_name] = []
            self.standby_pool[model_name].append(deploy_name)

            return deploy_name

        async def promote_to_hot(self, model_name: str, standby_name: str):
            """将冷备实例提升为热备"""
            # 1. 标记实例正在升级
            self.patch_deployment(standby_name, {
                "metadata": {"labels": {"standby-type": "promoting"}}
            })

            # 2. 分配 GPU
            gpu_allocation = await self.allocate_gpu(model_name)

            # 3. 加载模型到显存
            await self.load_model_to_gpu(standby_name, model_name, gpu_allocation)

            # 4. 更新实例类型
            self.patch_deployment(standby_name, {
                "metadata": {"labels": {"standby-type": "hot"}},
                "spec": {
                    "template": {
                        "spec": {
                            "containers": [{
                                "name": "standby",
                                "resources": {
                                    "requests": {"nvidia.com/gpu": 1},
                                    "limits": {"nvidia.com/gpu": 1}
                                }
                            }]
                        }
                    }
                }
            })

            # 5. 注册到负载均衡器
            await self.register_to_lb(model_name, standby_name)

### 4.3 自动故障切换

    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: failover-controller
      namespace: kube-system
    data:
      controller.py: |
        import asyncio
        import aiohttp
        import kubernetes

        class FailoverController:
            """故障切换控制器"""

            def __init__(self):
                self.api = kubernetes.client.CoreV1Api()
                self.health_check_interval = 10

            async def check_instance_health(self, pod_name, namespace):
                """检查实例健康状态"""
                try:
                    pod = self.api.read_namespaced_pod(pod_name, namespace)

                    # 检查 Pod 状态
                    if pod.status.phase !=
                            "Running":
                        return False

                    # 检查 readiness probe
                    for condition in pod.status.conditions:
                        if condition.type == "Ready":
                            return condition.status == "True"

                    return False
                except Exception:
                    return False

            async def monitor_and_failover(self):
                """监控并执行故障切换"""
                while True:
                    # 获取所有推理实例
                    pods = self.api.list_pods_for_all_namespaces(
                        label_selector="app in (inference-engine)"
                    )

                    for pod in pods.items:
                        if not await self.check_instance_health(
                            pod.metadata.name,
                            pod.metadata.namespace
                        ):
                            print(f"Instance unhealthy: {pod.metadata.name}")
                            await self.execute_failover(pod)

                    await asyncio.sleep(self.health_check_interval)

            async def execute_failover(self, failed_pod):
                """执行故障切换"""
                model_name = failed_pod.metadata.labels.get("model")
                standby_type = failed_pod.metadata.labels.get("standby-type", "hot")

                # 1. 标记故障实例
                self.api.patch_namespaced_pod(
                    failed_pod.metadata.name,
                    failed_pod.metadata.namespace,
                    {"metadata": {"labels": {"status": "failed"}}}
                )

                # 2. 从负载均衡器移除
                await self.remove_from_lb(failed_pod)

                # 3. 寻找可用备实例
                standby = await self.find_standby(model_name)

                if standby:
                    # 4. 提升备实例
                    await self.promote_standby(standby, model_name)
                else:
                    # 5. 如果没有备实例，创建新的冷备并紧急启动
                    print(f"No standby for {model_name}, creating emergency instance")
                    await self.create_emergency_instance(model_name)

## 五、容灾恢复验证

### 5.1 故障注入与恢复测试

    #!/bin/bash
    # 容灾恢复测试脚本

    echo "推理容灾恢复测试"

    # 1. 确认当前实例状态
    echo "1. Current instance status:"
    kubectl get pods -n inference-system -l app=inference-engine -o wide

    # 2. 注入故障：删除实例
    echo "2. Injecting failure: deleting llama-7b-0..."
    kubectl delete pod llama-7b-0 -n inference-system --grace-period=0

    # 3. 监控故障切换
    echo "3. Monitoring failover..."
    for i in {1..30}; do
      echo "--- T${i}s ---"
      kubectl get pods -n inference-system -l app=inference-engine -o wide

      # 检查新实例是否已创建
      new_instance=$(kubectl get pods -n inference-system \
        -l app=inference-engine,status=active \
        -o json | jq -r '.items[].metadata.name' | grep llama)

      if [ -n "$new_instance" ]; then
        echo "!!! New instance created: $new_instance !!!"
        break
      fi
      sleep 2
    done

    # 4. 验证服务可用性
    echo "4. Verifying service availability..."
    kubectl run test-request \
      --image=curlimages/curl \
      --restart=Never \
      --rm -it -- \
      curl -s http://model-router.inference-system:8080/v1/models/llama-2-7b/predict \
      -H "Content-Type: application/json" \
      -d '{"prompt": "Hello", "max_tokens": 10}'

    echo "Recovery test completed"

### 5.2 故障恢复 SLA 基准

| 故障类型 | 检测时间 | 切换时间 | 总 RTO | 数据丢失 |
| --- | --- | --- | --- | --- |
| Pod 进程崩溃 | 1s | 5s | 6s | 进行中请求 |
| 节点宕机 | 10s | 30s | 40s | 进行中请求 |
| GPU 故障 | 5s | 30s | 35s | 无（显存数据） |
| AZ 中断 | 30s | 60s | 90s | 无（跨 AZ 同步） |
| 模型损坏 | 1s | 60s | 61s | 需重新加载 |

## 六、总结

围绕 GPU 共享与多租户隔离方案构建推理冷备架构的核心要点：

- 共享池化：Volcano GPU 共享调度，突破“一模型一 GPU”的物理隔离
- 多级负载均衡：Nginx（简单路由）+ Envoy（高级流量管理）两层负载均衡
- 冷热温三级容灾：按模型优先级动态分配热备/温备/冷备实例
- 自动故障切换：健康检查 + 备实例自动提升 + 负载均衡器自动摘除/注册
- 架构可观测：Prometheus 指标 + 故障注入测试，持续验证 RTO

GPU 共享与多租户隔离不是互斥的——通过精细的调度策略和资源隔离机制，可以在保障租户隔离的同时实现 GPU 资源的高效利用。当故障发生时，冷备架构确保关键推理服务在秒级恢复，非关键服务在分钟级恢复，真正实现有状态的云原生推理。