+
+ );
+}
+
+const capabilities = [
{
- title: 'Getting Started',
- to: '/docs/getting-started',
- description: 'Introduction to llm-d, quickstart guide, feature matrix, and release artifacts.',
+ icon: Network,
+ title: 'LLM-Aware Load Balancing',
+ tagline:
+ 'Route every request to the replica that will serve it fastest.',
+ body:
+ "llm-d's endpoint picker scores each replica in real time across four signals: prefix cache locality, KV-cache utilization, queue depth, and predicted latency. Each request is dispatched to the replica with the lowest expected tail latency — delivering order-of-magnitude p99 improvements over round-robin routing, with no additional hardware.",
+ ctaLabel: 'Explore LLM-aware routing',
+ to: '/docs/guides/intelligent-inference-scheduling',
},
{
- title: 'Architecture',
- to: '/docs/architecture',
- description: 'Core components — Proxy, InferencePool, EPP, Model Servers — and advanced features.',
+ icon: Split,
+ title: 'Prefill / Decode Disaggregation',
+ tagline:
+ 'Scale prompt processing and token generation independently.',
+ body:
+ 'Prefill and decode have fundamentally different resource profiles. llm-d splits them across dedicated worker pools and transfers KV-cache between phases over RDMA via NIXL. The result is faster TTFT, more predictable TPOT, and better GPU utilization across the cluster.',
+ ctaLabel: 'See how disaggregation works',
+ to: '/docs/guides/pd-disaggregation',
},
{
- title: 'Guides',
- to: '/docs/guides',
- description: 'Step-by-step adoption procedures: scheduling, disaggregation, expert parallelism, caching.',
+ icon: Layers,
+ title: 'Wide Expert Parallelism',
+ tagline:
+ "Serve frontier MoE models that don't fit on a single node.",
+ body:
+ 'llm-d combines data parallelism and expert parallelism across nodes to deploy large mixture-of-experts models like DeepSeek-R1. This pattern maximizes KV-cache space, enables long-context online serving, and supports high-throughput generation for batch and RL workloads.',
+ ctaLabel: 'Deploy wide-EP models',
+ to: '/docs/guides/wide-expert-parallelism',
},
{
- title: 'Resources',
- to: '/docs/resources/gateway',
- description: 'Gateway setup, API configuration, monitoring, multi-model deployment, and RDMA.',
+ icon: Database,
+ title: 'Tiered KV Prefix Caching',
+ tagline: 'Cache at memory speed. Spill at storage cost.',
+ body:
+ 'llm-d extends KV-cache beyond accelerator HBM through a configurable storage hierarchy: HBM, CPU memory, local SSD, and shared remote storage (in progress). Hot prefixes stay close to the accelerator; cold prefixes spill to cheaper tiers automatically. You serve longer contexts and higher concurrency without adding GPUs.',
+ ctaLabel: 'Configure tiered caching',
+ to: '/docs/guides/kv-cache-management',
},
{
- title: 'API Reference',
- to: '/docs/api-reference',
- description: 'API specifications and reference documentation.',
+ icon: TrendingUp,
+ title: 'Workload Autoscaling',
+ tagline: 'Scale for the load you have, on the hardware you have.',
+ body:
+ 'Two complementary patterns, both built on Kubernetes primitives. HPA scales replicas using live inference signals — queue depth and request counts from the endpoint picker. The Workload Variant Autoscaler routes across model variants on heterogeneous hardware to meet SLOs at the lowest cost.',
+ ctaLabel: 'Set up autoscaling',
+ to: '/docs/guides/workload-autoscaling',
},
];
-function FeaturesSection() {
+function CapabilitiesSection(): React.JSX.Element {
return (
-
-
);
From efa92c3ddc736fb4c57a030e1b4b2a9b10518e86 Mon Sep 17 00:00:00 2001
From: IBRAHIM IBRAHIM <66755652+Ibrahim2595@users.noreply.github.com>
Date: Fri, 8 May 2026 16:07:43 -0400
Subject: [PATCH 5/5] feat: updating the hero message, description, and
capabilities on the landing page
Signed-off-by: IBRAHIM IBRAHIM <66755652+Ibrahim2595@users.noreply.github.com>
---
preview/src/pages/index.tsx | 45 ++++++++++++++------------
preview/static/img/docs/llm-d-arch.svg | 2 +-
2 files changed, 26 insertions(+), 21 deletions(-)
diff --git a/preview/src/pages/index.tsx b/preview/src/pages/index.tsx
index fb4019bb..ca083f0b 100644
--- a/preview/src/pages/index.tsx
+++ b/preview/src/pages/index.tsx
@@ -9,26 +9,31 @@ function HeroSection(): React.JSX.Element {
return (
-
-
- Kubernetes-native distributed inference serving for LLMs
-
-
-
- Get Started
-
-
- Quickstart
-
+
+
+
+ Production-grade distributed LLM inference.
+
+
+ llm-d is a distributed inference stack that orchestrates vLLM and
+ SGLang across your cluster with LLM-aware routing, disaggregated
+ serving, and tiered KV caching — using Kubernetes primitives you
+ already run.
+
+
+
+ Get started
+
+
+ See the architecture
+
+
+
+
diff --git a/preview/static/img/docs/llm-d-arch.svg b/preview/static/img/docs/llm-d-arch.svg
index 6cbb4441..6ec00ecc 100644
--- a/preview/static/img/docs/llm-d-arch.svg
+++ b/preview/static/img/docs/llm-d-arch.svg
@@ -1 +1 @@
-
+
\ No newline at end of file