From eccdb89f854b0a70a74960126211d1c3f4a6b030 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 8 Jun 2026 00:57:12 -0700
Subject: [PATCH 01/10] feat(inference): add DeepSeek V4 Pro model architecture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a MODEL_ARCHITECTURES entry for DeepSeek V4 Pro (1.6T/49B MoE, 61
layers, 1M context) following the existing per-model pattern. Attention
is modeled as a Hybrid stack of two interleaved compressed variants —
Heavily Compressed Attention (31 layers) and Compressed Sparse Attention
(30 layers) — each carrying a 128-token sliding-window branch and a
learnable attention sink.

Surface sliding-window attention on both alternating blocks: the diagram's
window note now derives from a per-spec slidingWindow field instead of the
block index, so hybrid models show window=128 on every attention variant
(gpt-oss behavior preserved). The specs-bar attention cell now keys off
attentionType instead of hasAlternatingLayers: AlternatingSinkGQA models
(gpt-oss) still read "Sink/Full GQA", while DeepSeek V4 Pro reads "Hybrid".

Sourced from deepseek-ai/DeepSeek-V4-Pro (config.json, inference/model.py,
DeepSeek_V4.pdf).
---
 .../app/cypress/e2e/model-architecture.cy.ts  | 52 ++++++++++++++
 .../inference/ui/ModelArchitectureDiagram.tsx | 13 ++--
 .../app/src/lib/model-architectures.test.ts   | 69 ++++++++++++++++++
 packages/app/src/lib/model-architectures.ts   | 70 +++++++++++++++++++
 4 files changed, 199 insertions(+), 5 deletions(-)

diff --git a/packages/app/cypress/e2e/model-architecture.cy.ts b/packages/app/cypress/e2e/model-architecture.cy.ts
index dd8d3722..29564a9e 100644
--- a/packages/app/cypress/e2e/model-architecture.cy.ts
+++ b/packages/app/cypress/e2e/model-architecture.cy.ts
@@ -313,4 +313,56 @@ describe('Model Architecture Diagram', () => {
       cy.contains('Released by OpenAI').should('be.visible');
     });
   });
+
+  describe('Hybrid Attention Blocks (MoE model - DeepSeek V4 Pro)', () => {
+    before(() => {
+      // Clear any stale Radix scroll lock from prior Select interactions
+      cy.document().then((doc) => {
+        delete doc.body.dataset.scrollLocked;
+        doc.body.style.removeProperty('pointer-events');
+      });
+      cy.get('[role="combobox"]').filter(':visible').first().click();
+      cy.get('[role="option"]').contains('DeepSeek V4 Pro').click();
+
+      cy.get('[data-testid="model-architecture-toggle"]').should('be.visible');
+      cy.get('body').then(($body) => {
+        if ($body.find('[data-testid="model-architecture-svg"]:visible').length === 0) {
+          cy.get('[data-testid="model-architecture-toggle"]').click();
+        }
+      });
+      cy.get('[data-testid="model-architecture-svg"]').should('be.visible');
+    });
+
+    it('shows MoE and Hybrid badges for DeepSeek V4 Pro', () => {
+      cy.get('[data-testid="model-architecture-toggle"]').should('contain.text', 'MoE');
+      cy.get('[data-testid="model-architecture-toggle"]').should('contain.text', 'Hybrid');
+      cy.get('[data-testid="model-architecture-toggle"]').should('contain.text', '1.6T');
+    });
+
+    it('shows two separate hybrid (CSA/HCA) blocks with an alternating indicator', () => {
+      cy.get('[data-testid="expand-altBlock0"]').should('exist');
+      cy.get('[data-testid="expand-altBlock1"]').should('exist');
+      cy.get('[data-testid="expand-transformer"]').should('not.exist');
+      cy.get('[data-testid="expand-denseTransformer"]').should('not.exist');
+      cy.get('[data-testid="alternating-indicator"]').should('exist');
+    });
+
+    it('Hybrid attention is NOT expandable; expert grid is expandable within a block', () => {
+      cy.get('[data-testid="expand-altBlock0"]').click({ force: true });
+      cy.get('[data-testid="collapse-altBlock0"]').should('exist');
+      cy.get('[data-testid="expand-attention"]').should('not.exist');
+      cy.get('[data-testid="expand-altExperts0"]').should('exist');
+    });
+
+    it('expert grid can be expanded to show SwiGLU details', () => {
+      cy.get('[data-testid="expand-altExperts0"]').click({ force: true });
+      cy.get('[data-testid="model-architecture-svg"]').should('be.visible');
+    });
+
+    it('shows DeepSeek V4 Pro features (incl. sliding window) and developer info', () => {
+      cy.contains('Hybrid CSA + HCA Attention').should('be.visible');
+      cy.contains('Sliding Window Attention (128 tokens)').should('be.visible');
+      cy.contains('Released by DeepSeek').should('be.visible');
+    });
+  });
 });
diff --git a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
index e79f057a..b03e7851 100644
--- a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
+++ b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
@@ -1667,10 +1667,13 @@ function renderDiagram(
         ]
           .filter(Boolean)
           .join('  \u00B7  ');
-        const attnSub =
-          bi === 0 && arch.slidingWindow
-            ? `${headSub}${headSub ? '  \u00B7  ' : ''}window=${arch.slidingWindow}`
-            : headSub || undefined;
+        // Sliding-window note is per layer-type: a hybrid model (e.g. DeepSeek
+        // V4) carries the window on every attention variant, whereas gpt-oss
+        // only puts it on its sliding block. Drive it off the spec, not bi.
+        const specWindow = spec.slidingWindow;
+        const attnSub = specWindow
+          ? `${headSub}${headSub ? '  \u00B7  ' : ''}window=${specWindow}`
+          : headSub || undefined;
         drawBlock(innerX, altAttnY[bi], innerW, blockH, 'attention', spec.label, attnSub);
 
         const aBottom = altAttnY[bi] + blockH + 4;
@@ -1909,7 +1912,7 @@ function renderDiagram(
     },
     {
       label: 'Attention',
-      value: hasAlternatingLayers ? 'Sink/Full GQA' : arch.attentionType,
+      value: arch.attentionType === 'AlternatingSinkGQA' ? 'Sink/Full GQA' : arch.attentionType,
     },
     {
       label: 'Context',
diff --git a/packages/app/src/lib/model-architectures.test.ts b/packages/app/src/lib/model-architectures.test.ts
index 68b913c4..0766e950 100644
--- a/packages/app/src/lib/model-architectures.test.ts
+++ b/packages/app/src/lib/model-architectures.test.ts
@@ -20,6 +20,7 @@ describe('MODEL_ARCHITECTURES', () => {
       Model.Llama3_3_70B,
       Model.Llama3_1_70B,
       Model.DeepSeek_R1,
+      Model.DeepSeek_V4_Pro,
       Model.GptOss,
       Model.Kimi_K2_5,
       Model.MiniMax_M2_5,
@@ -158,6 +159,66 @@ describe('getModelArchitecture', () => {
     expect(arch?.vocabSize).toBe(129280);
   });
 
+  it('returns architecture for DeepSeek V4 Pro with MoE and Hybrid attention details', () => {
+    const arch = getModelArchitecture(Model.DeepSeek_V4_Pro);
+    expect(arch).toBeDefined();
+    expect(arch?.totalParams).toBe(1600);
+    expect(arch?.activeParams).toBe(49);
+    expect(arch?.architectureType).toBe('moe');
+    expect(arch?.attentionType).toBe('Hybrid');
+    expect(arch?.attentionExpandable).toBe(false);
+    expect(arch?.numLayers).toBe(61);
+    expect(arch?.hiddenSize).toBe(7168);
+    expect(arch?.numHeads).toBe(128);
+    expect(arch?.numKVHeads).toBe(1);
+    expect(arch?.headDim).toBe(512);
+    expect(arch?.ffnDim).toBe(3072);
+    expect(arch?.numExperts).toBe(385);
+    expect(arch?.activeExperts).toBe(6);
+    expect(arch?.hasSharedExpert).toBe(true);
+    // First 3 layers use hash-routed MoE (not dense FFN), so no dense block.
+    expect(arch?.denseFFNLayers).toBeUndefined();
+    expect(arch?.slidingWindow).toBe(128);
+    expect(arch?.contextWindow).toBe(1048576);
+    expect(arch?.developer).toBe('DeepSeek');
+    expect(arch?.vocabSize).toBe(129280);
+    expect(arch?.sourceUrl).toBe('https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro');
+  });
+
+  it('DeepSeek V4 Pro surfaces sliding-window attention and hybrid components in features', () => {
+    const arch = getModelArchitecture(Model.DeepSeek_V4_Pro);
+    expect(arch?.features).toBeDefined();
+    expect(arch?.features).toContain('Sliding Window Attention (128 tokens)');
+    expect(arch?.features).toContain('Hybrid CSA + HCA Attention');
+    expect(arch?.features).toContain('Attention Sink');
+    expect(arch?.features).toContain('Multi-Token Prediction');
+  });
+
+  it('DeepSeek V4 Pro has alternatingLayers with CSA and HCA specs, each carrying a sliding window', () => {
+    const arch = getModelArchitecture(Model.DeepSeek_V4_Pro);
+    expect(arch?.alternatingLayers).toBeDefined();
+    expect(arch?.alternatingLayers).toHaveLength(2);
+
+    const [hca, csa] = arch!.alternatingLayers!;
+    expect(hca.label).toBe('Heavily Compressed Attention');
+    expect(hca.count).toBe(31);
+    expect(hca.description).toContain('sliding window');
+    expect(hca.slidingWindow).toBe(128);
+
+    expect(csa.label).toBe('Compressed Sparse Attention');
+    expect(csa.count).toBe(30);
+    expect(csa.description).toContain('sliding window');
+    expect(csa.description).toContain('lightning indexer');
+    expect(csa.slidingWindow).toBe(128);
+  });
+
+  it('DeepSeek V4 Pro alternating layer counts sum to numLayers', () => {
+    const arch = getModelArchitecture(Model.DeepSeek_V4_Pro);
+    expect(arch?.alternatingLayers).toBeDefined();
+    const totalAlternating = arch!.alternatingLayers!.reduce((sum, l) => sum + l.count, 0);
+    expect(totalAlternating).toBe(arch!.numLayers);
+  });
+
   it('returns architecture for Kimi K2.5 with MoE and MLA details', () => {
     const arch = getModelArchitecture(Model.Kimi_K2_5);
     expect(arch).toBeDefined();
@@ -241,10 +302,13 @@ describe('getModelArchitecture', () => {
     expect(sliding.count).toBe(18);
     expect(sliding.description).toContain('128-token sliding window');
     expect(sliding.description).toContain('attention sink');
+    expect(sliding.slidingWindow).toBe(128);
 
     expect(full.label).toBe('Causal Grouped Query Attention');
     expect(full.count).toBe(18);
     expect(full.description).toContain('full causal masking');
+    // Full-attention block has no sliding window (per-spec, not block-index).
+    expect(full.slidingWindow).toBeUndefined();
   });
 
   it('gpt-oss alternating layer counts sum to numLayers', () => {
@@ -298,6 +362,11 @@ describe('getArchitectureSummary', () => {
     expect(getArchitectureSummary(arch!)).toBe('MoE 671B (37B active)');
   });
 
+  it('returns MoE summary for DeepSeek V4 Pro with trillion-scale params', () => {
+    const arch = getModelArchitecture(Model.DeepSeek_V4_Pro);
+    expect(getArchitectureSummary(arch!)).toBe('MoE 1.6T (49B active)');
+  });
+
   it('returns MoE summary for gpt-oss 120B', () => {
     const arch = getModelArchitecture(Model.GptOss);
     expect(getArchitectureSummary(arch!)).toBe('MoE 120B (5B active)');
diff --git a/packages/app/src/lib/model-architectures.ts b/packages/app/src/lib/model-architectures.ts
index e7db84cd..40fd1995 100644
--- a/packages/app/src/lib/model-architectures.ts
+++ b/packages/app/src/lib/model-architectures.ts
@@ -23,6 +23,12 @@ export interface AlternatingLayerSpec {
   count: number;
   /** Color key for visual distinction */
   colorKey: 'attention' | 'ffn' | 'norm' | 'router' | 'expert';
+  /**
+   * Sliding-window size (in tokens) for this layer type, when it includes a
+   * local sliding-window attention branch. Rendered as `window=N` in the
+   * diagram. Omit for layer types that use full / non-windowed attention.
+   */
+  slidingWindow?: number;
 }
 
 /**
@@ -93,6 +99,7 @@ export interface ModelArchitecture {
  * - https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
  * - https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
  * - https://github.com/deepseek-ai/DeepSeek-V3
+ * - https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro (config.json, inference/model.py, DeepSeek_V4.pdf)
  * - https://huggingface.co/moonshotai/Kimi-K2.5/blob/main/config.json
  * - https://huggingface.co/openai/gpt-oss-120b/blob/main/config.json
  * - https://huggingface.co/MiniMaxAI/MiniMax-M2/blob/main/config.json
@@ -124,6 +131,68 @@ export const MODEL_ARCHITECTURES: Partial<Record<Model, ModelArchitecture>> = {
     developer: 'DeepSeek',
     sourceUrl: 'https://huggingface.co/deepseek-ai/DeepSeek-R1-0528',
   },
+  [Model.DeepSeek_V4_Pro]: {
+    model: Model.DeepSeek_V4_Pro,
+    totalParams: 1600, // 1.6T
+    activeParams: 49,
+    architectureType: 'moe',
+    attentionType: 'Hybrid',
+    // Hybrid CSA/HCA is a bespoke compressed-attention stack, not the standard
+    // Q/K/V GQA layout — render it as static blocks, not the GQA drill-down.
+    attentionExpandable: false,
+    numLayers: 61,
+    hiddenSize: 7168,
+    numHeads: 128,
+    // Shared single-latent KV (MLA-lineage MQA): num_key_value_heads = 1.
+    numKVHeads: 1,
+    headDim: 512,
+    vocabSize: 129280,
+    ffnDim: 3072, // moe_intermediate_size
+    numExperts: 385, // 384 routed + 1 shared
+    activeExperts: 6,
+    hasSharedExpert: true,
+    // Attention layers interleave two compressed variants; every layer also
+    // carries a 128-token sliding-window branch plus a learnable attention sink.
+    // Counts: 31 HCA + 30 CSA = 61 (the extra MTP block is sliding-window only).
+    alternatingLayers: [
+      {
+        label: 'Heavily Compressed Attention',
+        description:
+          'HCA: the KV of every 128 tokens is consolidated into a single entry and attended densely, alongside a 128-token sliding window of uncompressed KV and a learnable attention sink.',
+        count: 31,
+        colorKey: 'attention',
+        slidingWindow: 128,
+      },
+      {
+        label: 'Compressed Sparse Attention',
+        description:
+          'CSA: the KV of every 4 tokens is compressed to one entry, then a lightning indexer selects the top-1024 compressed blocks for sparse attention, alongside a 128-token sliding window and a learnable attention sink.',
+        count: 30,
+        colorKey: 'attention',
+        slidingWindow: 128,
+      },
+    ],
+    slidingWindow: 128,
+    contextWindow: 1048576, // 1M
+    features: [
+      'Hybrid CSA + HCA Attention',
+      'Sliding Window Attention (128 tokens)',
+      'Attention Sink',
+      'MLA-style Shared-KV MQA',
+      'Lightning Indexer (sparse top-k)',
+      'Manifold-Constrained Hyper-Connections (mHC)',
+      'sqrt-softplus Routing',
+      'Auxiliary-loss-free Load Balancing',
+      'Hash Routing (first 3 layers)',
+      'Multi-Token Prediction',
+      'YaRN RoPE (1M context)',
+      'FP4 Experts + FP8 Mixed Precision',
+      'Muon Optimizer',
+    ],
+    releaseDate: '2026-06-08',
+    developer: 'DeepSeek',
+    sourceUrl: 'https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro',
+  },
   [Model.Llama3_3_70B]: {
     model: Model.Llama3_3_70B,
     totalParams: 70,
@@ -182,6 +251,7 @@ export const MODEL_ARCHITECTURES: Partial<Record<Model, ModelArchitecture>> = {
         description: 'GQA with 128-token sliding window and learnable attention sink tokens',
         count: 18,
         colorKey: 'attention',
+        slidingWindow: 128,
       },
       {
         label: 'Causal Grouped Query Attention',

From 0afbf446cfbf4ff1d8b28fcf5edc60e76ad9231a Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 8 Jun 2026 01:20:15 -0700
Subject: [PATCH 02/10] feat(inference): surface SWA as an explicit block; fix
 residual + glyph centering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DeepSeek V4 hybrid attention now drills down: expanding the CSA/HCA
attention block reveals the sliding-window branch as its own block
alongside the compressed branch (lightning indexer for CSA, heavy
compression for HCA), converging at shared-KV MQA + sink + output
projection. The drill-down is gated on attentionType === 'Hybrid' (it
does not consult attentionExpandable), so gpt-oss is unchanged.

Also fixes two diagram issues affecting all models:
- Residual bypass tapped at the RMSNorm's top edge, so its horizontal
  connector ran across the norm block. Tap from the arrow gap above the
  norm instead.
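- Circle glyphs (+, ×, −) rendered off-center because
  dominant-baseline: central is unreliable (Safari falls back to the
  alphabetic baseline). Use dy=0.35em, which centers consistently across
  browsers and matches central where it already worked. The attribute is
  swapped on every centered text element in the diagram (block labels,
  sub-labels, badges, expert cells), not only on the circle glyphs.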
---
 .../app/cypress/e2e/model-architecture.cy.ts  |   8 +-
 .../inference/ui/ModelArchitectureDiagram.tsx | 107 +++++++++++++-----
 .../app/src/lib/model-architectures.test.ts   |  38 +++++++
 packages/app/src/lib/model-architectures.ts   |  52 +++++++++
 4 files changed, 175 insertions(+), 30 deletions(-)

diff --git a/packages/app/cypress/e2e/model-architecture.cy.ts b/packages/app/cypress/e2e/model-architecture.cy.ts
index 29564a9e..93e9d485 100644
--- a/packages/app/cypress/e2e/model-architecture.cy.ts
+++ b/packages/app/cypress/e2e/model-architecture.cy.ts
@@ -347,10 +347,14 @@ describe('Model Architecture Diagram', () => {
       cy.get('[data-testid="alternating-indicator"]').should('exist');
     });
 
-    it('Hybrid attention is NOT expandable; expert grid is expandable within a block', () => {
+    it('Hybrid attention is expandable and drills down to a Sliding Window block', () => {
       cy.get('[data-testid="expand-altBlock0"]').click({ force: true });
       cy.get('[data-testid="collapse-altBlock0"]').should('exist');
-      cy.get('[data-testid="expand-attention"]').should('not.exist');
+      // Hybrid attention drills down (unlike gpt-oss sink/full GQA, which does not)
+      cy.get('[data-testid="expand-altAttention0"]').should('exist');
+      cy.get('[data-testid="expand-altAttention0"]').click({ force: true });
+      cy.get('[data-testid="model-architecture-svg"]').should('be.visible');
+      // Expert grid still expandable within the block
       cy.get('[data-testid="expand-altExperts0"]').should('exist');
     });
 
diff --git a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
index b03e7851..987afee3 100644
--- a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
+++ b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
@@ -18,6 +18,7 @@ import {
   getAttentionLabel,
   getAttentionSubBlocks,
   getFFNSubBlocks,
+  getHybridAttentionSubBlocks,
   getModelArchitecture,
 } from '@/lib/model-architectures';
 
@@ -143,6 +144,21 @@ function renderDiagram(
     hasAlternatingLayers && expandedBlocks.has('altExperts1'),
   ];
 
+  // Hybrid models (DeepSeek V4) expose an expandable attention drill-down inside
+  // each alternating block, revealing the sliding-window branch as an explicit
+  // block alongside the compressed branch. gpt-oss (AlternatingSinkGQA) keeps a
+  // static attention block.
+  const altAttnExpandable = hasAlternatingLayers && arch.attentionType === 'Hybrid';
+  const altAttnExpanded = [
+    altAttnExpandable && expandedBlocks.has('altAttention0'),
+    altAttnExpandable && expandedBlocks.has('altAttention1'),
+  ];
+  const altAttnFlow: (SubBlockFlow | null)[] = altAttnExpandable
+    ? [0, 1].map((i) =>
+        alternatingSpecs[i] ? getHybridAttentionSubBlocks(arch, alternatingSpecs[i]) : null,
+      )
+    : [null, null];
+
   // Calculate flow height for either sequential or parallel layouts
   function getFlowHeight(flow: SubBlockFlow, hasLabel: boolean): number {
     if (flow.layout === 'sequential') {
@@ -201,6 +217,10 @@ function renderDiagram(
     altExpertsExpanded[0] ? getFlowHeight(ffnFlow, true) : 0,
     altExpertsExpanded[1] ? getFlowHeight(ffnFlow, true) : 0,
   ];
+  const altAttnExpandedH = [
+    altAttnExpanded[0] && altAttnFlow[0] ? getFlowHeight(altAttnFlow[0], false) : 0,
+    altAttnExpanded[1] && altAttnFlow[1] ? getFlowHeight(altAttnFlow[1], false) : 0,
+  ];
 
   // Compute vertical positions
   let y = pad.top;
@@ -273,6 +293,7 @@ function renderDiagram(
   const altBlockEnd = [0, 0];
   const altNorm1Y = [0, 0];
   const altAttnY = [0, 0];
+  const altAttnExpandedStartY = [0, 0];
   const altMerge1Y = [0, 0];
   const altNorm2Y = [0, 0];
   const altRouterY = [0, 0];
@@ -311,6 +332,12 @@ function renderDiagram(
         altAttnY[bi] = y;
         y += blockH;
 
+        // Expanded hybrid-attention sub-blocks (sliding window + compressed)
+        altAttnExpandedStartY[bi] = y;
+        if (altAttnExpanded[bi]) {
+          y += altAttnExpandedH[bi];
+        }
+
         y += 4;
         altMerge1Y[bi] = y + mergeGap / 2;
         y += mergeGap;
@@ -467,11 +494,14 @@ function renderDiagram(
   }
 
   function drawResidualBypass(branchY: number, mergeY: number) {
+    // Tap the residual from the input stream ABOVE the norm (in the arrow gap)
+    // so the horizontal connector doesn't run across the RMSNorm block.
+    const tapY = branchY - arrowH / 2;
     bgG
       .append('path')
       .attr(
         'd',
-        `M ${cx} ${branchY} L ${residLeftX} ${branchY} L ${residLeftX} ${mergeY} L ${cx - circleR} ${mergeY}`,
+        `M ${cx} ${tapY} L ${residLeftX} ${tapY} L ${residLeftX} ${mergeY} L ${cx - circleR} ${mergeY}`,
       )
       .attr('fill', 'none')
       .attr('stroke', mutedFg)
@@ -488,7 +518,7 @@ function renderDiagram(
       .attr('x', cx)
       .attr('y', mergeY)
       .attr('text-anchor', 'middle')
-      .attr('dominant-baseline', 'central')
+      .attr('dy', '0.35em')
       .attr('fill', fg)
       .attr('font-size', '14px')
       .attr('font-weight', 700)
@@ -520,7 +550,7 @@ function renderDiagram(
       .attr('x', x + w / 2)
       .attr('y', by + h / 2 - (subText ? 7 : 0))
       .attr('text-anchor', 'middle')
-      .attr('dominant-baseline', 'central')
+      .attr('dy', '0.35em')
       .attr('fill', fg)
       .attr('font-size', '13px')
       .attr('font-weight', 600)
@@ -532,7 +562,7 @@ function renderDiagram(
         .attr('x', x + w / 2)
         .attr('y', by + h / 2 + 10)
         .attr('text-anchor', 'middle')
-        .attr('dominant-baseline', 'central')
+        .attr('dy', '0.35em')
         .attr('fill', mutedFg)
         .attr('font-size', '11px')
         .attr('font-family', 'inherit')
@@ -566,7 +596,7 @@ function renderDiagram(
       .attr('x', x + w / 2 - 8)
       .attr('y', by + h / 2 - (subText ? 7 : 0))
       .attr('text-anchor', 'middle')
-      .attr('dominant-baseline', 'central')
+      .attr('dy', '0.35em')
       .attr('fill', fg)
       .attr('font-size', '13px')
       .attr('font-weight', 600)
@@ -579,7 +609,7 @@ function renderDiagram(
         .attr('x', x + w / 2 - 8)
         .attr('y', by + h / 2 + 10)
         .attr('text-anchor', 'middle')
-        .attr('dominant-baseline', 'central')
+        .attr('dy', '0.35em')
         .attr('fill', mutedFg)
         .attr('font-size', '11px')
         .attr('font-family', 'inherit')
@@ -603,7 +633,7 @@ function renderDiagram(
       .attr('x', iconX)
       .attr('y', iconY)
       .attr('text-anchor', 'middle')
-      .attr('dominant-baseline', 'central')
+      .attr('dy', '0.35em')
       .attr('fill', c.stroke)
       .attr('font-size', '14px')
       .attr('font-weight', 700)
@@ -649,7 +679,7 @@ function renderDiagram(
       .attr('x', bx + subBw / 2)
       .attr('y', by + subBlockH / 2 - (block.detail ? 5 : 0))
       .attr('text-anchor', 'middle')
-      .attr('dominant-baseline', 'central')
+      .attr('dy', '0.35em')
       .attr('fill', fg)
       .attr('font-size', fontSize.name)
       .attr('font-weight', 500)
@@ -661,7 +691,7 @@ function renderDiagram(
         .attr('x', bx + subBw / 2)
         .attr('y', by + subBlockH / 2 + 8)
         .attr('text-anchor', 'middle')
-        .attr('dominant-baseline', 'central')
+        .attr('dy', '0.35em')
         .attr('fill', mutedFg)
         .attr('font-size', fontSize.detail)
         .attr('font-family', 'inherit')
@@ -698,7 +728,7 @@ function renderDiagram(
         .attr('x', x + w / 2)
         .attr('y', sy + 8)
         .attr('text-anchor', 'middle')
-        .attr('dominant-baseline', 'central')
+        .attr('dy', '0.35em')
         .attr('fill', mutedFg)
         .attr('font-size', '10px')
         .attr('font-weight', 600)
@@ -723,7 +753,7 @@ function renderDiagram(
           .attr('x', leftCx)
           .attr('y', sy + 6)
           .attr('text-anchor', 'middle')
-          .attr('dominant-baseline', 'central')
+          .attr('dy', '0.35em')
           .attr('fill', mutedFg)
           .attr('font-size', '9px')
           .attr('font-weight', 600)
@@ -735,7 +765,7 @@ function renderDiagram(
           .attr('x', rightCx)
           .attr('y', sy + 6)
           .attr('text-anchor', 'middle')
-          .attr('dominant-baseline', 'central')
+          .attr('dy', '0.35em')
           .attr('fill', mutedFg)
           .attr('font-size', '9px')
           .attr('font-weight', 600)
@@ -891,7 +921,7 @@ function renderDiagram(
           .attr('x', mergeCx)
           .attr('y', circleCy)
           .attr('text-anchor', 'middle')
-          .attr('dominant-baseline', 'central')
+          .attr('dy', '0.35em')
           .attr('fill', fg)
           .attr('font-size', '14px')
           .attr('font-weight', 700)
@@ -963,7 +993,7 @@ function renderDiagram(
             .attr('x', lcx)
             .attr('y', sy + 6)
             .attr('text-anchor', 'middle')
-            .attr('dominant-baseline', 'central')
+            .attr('dy', '0.35em')
             .attr('fill', mutedFg)
             .attr('font-size', '9px')
             .attr('font-weight', 600)
@@ -1227,7 +1257,7 @@ function renderDiagram(
       .attr('x', x + w / 2 - 8)
       .attr('y', by + h / 2 - 8)
       .attr('text-anchor', 'middle')
-      .attr('dominant-baseline', 'central')
+      .attr('dy', '0.35em')
       .attr('fill', fg)
       .attr('font-size', '13px')
       .attr('font-weight', 600)
@@ -1239,7 +1269,7 @@ function renderDiagram(
       .attr('x', x + w / 2 - 8)
       .attr('y', by + h / 2 + 10)
       .attr('text-anchor', 'middle')
-      .attr('dominant-baseline', 'central')
+      .attr('dy', '0.35em')
       .attr('fill', mutedFg)
       .attr('font-size', '11px')
       .attr('font-family', 'inherit')
@@ -1260,7 +1290,7 @@ function renderDiagram(
       .attr('x', iconX)
       .attr('y', iconY)
       .attr('text-anchor', 'middle')
-      .attr('dominant-baseline', 'central')
+      .attr('dy', '0.35em')
       .attr('fill', mutedFg)
       .attr('font-size', '14px')
       .attr('font-weight', 700)
@@ -1354,7 +1384,7 @@ function renderDiagram(
         .attr('x', width - pad.right - denseBadgeW / 2 - 4)
         .attr('y', denseTxStart)
         .attr('text-anchor', 'middle')
-        .attr('dominant-baseline', 'central')
+        .attr('dy', '0.35em')
         .attr('fill', mutedFg)
         .attr('font-size', '11px')
         .attr('font-weight', 600)
@@ -1512,7 +1542,7 @@ function renderDiagram(
         .attr('x', ex + expertSize / 2)
         .attr('y', ey + expertSize / 2)
         .attr('text-anchor', 'middle')
-        .attr('dominant-baseline', 'central')
+        .attr('dy', '0.35em')
         .attr('fill', isActive ? fg : mutedFg)
         .attr('font-size', '9px')
         .attr('font-weight', isActive ? 600 : 400)
@@ -1527,7 +1557,7 @@ function renderDiagram(
         .attr('x', ex + expertSize / 2)
         .attr('y', ey + expertSize / 2)
         .attr('text-anchor', 'middle')
-        .attr('dominant-baseline', 'central')
+        .attr('dy', '0.35em')
         .attr('fill', mutedFg)
         .attr('font-size', '14px')
         .attr('font-weight', 700)
@@ -1549,7 +1579,7 @@ function renderDiagram(
         .attr('x', ex + expertSize / 2)
         .attr('y', ey + expertSize / 2)
         .attr('text-anchor', 'middle')
-        .attr('dominant-baseline', 'central')
+        .attr('dy', '0.35em')
         .attr('fill', mutedFg)
         .attr('font-size', '9px')
         .attr('font-weight', 600)
@@ -1572,7 +1602,7 @@ function renderDiagram(
       .attr('x', expIconX)
       .attr('y', expIconY)
       .attr('text-anchor', 'middle')
-      .attr('dominant-baseline', 'central')
+      .attr('dy', '0.35em')
       .attr('fill', ec.stroke)
       .attr('font-size', '14px')
       .attr('font-weight', 700)
@@ -1639,7 +1669,7 @@ function renderDiagram(
           .attr('x', width - pad.right - badgeW / 2 - 4)
           .attr('y', altBlockStart[bi])
           .attr('text-anchor', 'middle')
-          .attr('dominant-baseline', 'central')
+          .attr('dy', '0.35em')
           .attr('fill', mutedFg)
           .attr('font-size', '11px')
           .attr('font-weight', 600)
@@ -1660,7 +1690,8 @@ function renderDiagram(
         drawBlock(innerX, altNorm1Y[bi], innerW, smallH, 'norm', 'RMSNorm');
         drawArrow(altNorm1Y[bi] + smallH, altAttnY[bi]);
 
-        // Attention (non-expandable — AlternatingSinkGQA)
+        // Attention — expandable hybrid drill-down (DeepSeek V4 CSA/HCA) or a
+        // static block (gpt-oss AlternatingSinkGQA).
         const headSub = [
           arch.numHeads ? `${arch.numHeads} heads` : null,
           arch.numKVHeads ? `${arch.numKVHeads} KV heads` : null,
@@ -1674,9 +1705,29 @@ function renderDiagram(
         const attnSub = specWindow
           ? `${headSub}${headSub ? '  \u00B7  ' : ''}window=${specWindow}`
           : headSub || undefined;
-        drawBlock(innerX, altAttnY[bi], innerW, blockH, 'attention', spec.label, attnSub);
+        if (altAttnExpandable) {
+          drawExpandableBlock(
+            innerX,
+            altAttnY[bi],
+            innerW,
+            blockH,
+            'attention',
+            spec.label,
+            attnSub,
+            altAttnExpanded[bi],
+            `altAttention${bi}`,
+          );
+          const flow = altAttnFlow[bi];
+          if (altAttnExpanded[bi] && flow) {
+            drawFlow(flow, altAttnExpandedStartY[bi], innerX, innerW);
+          }
+        } else {
+          drawBlock(innerX, altAttnY[bi], innerW, blockH, 'attention', spec.label, attnSub);
+        }
 
-        const aBottom = altAttnY[bi] + blockH + 4;
+        const aBottom = altAttnExpanded[bi]
+          ? altAttnExpandedStartY[bi] + altAttnExpandedH[bi] + 4
+          : altAttnY[bi] + blockH + 4;
         drawArrow(aBottom, altMerge1Y[bi] - circleR);
         drawResidualBypass(altNorm1Y[bi], altMerge1Y[bi]);
         drawArrow(altMerge1Y[bi] + circleR, altNorm2Y[bi]);
@@ -1734,7 +1785,7 @@ function renderDiagram(
           .attr('x', cx)
           .attr('y', altIndicatorY)
           .attr('text-anchor', 'middle')
-          .attr('dominant-baseline', 'central')
+          .attr('dy', '0.35em')
           .attr('fill', mutedFg)
           .attr('font-size', `${labelFontSize}px`)
           .attr('font-weight', 500)
@@ -1775,7 +1826,7 @@ function renderDiagram(
       .attr('x', width - pad.right - badgeW / 2 - 4)
       .attr('y', txStart)
       .attr('text-anchor', 'middle')
-      .attr('dominant-baseline', 'central')
+      .attr('dy', '0.35em')
       .attr('fill', mutedFg)
       .attr('font-size', '11px')
       .attr('font-weight', 600)
diff --git a/packages/app/src/lib/model-architectures.test.ts b/packages/app/src/lib/model-architectures.test.ts
index 0766e950..57412d33 100644
--- a/packages/app/src/lib/model-architectures.test.ts
+++ b/packages/app/src/lib/model-architectures.test.ts
@@ -10,6 +10,7 @@ import {
   getAttentionLabel,
   getAttentionSubBlocks,
   getFFNSubBlocks,
+  getHybridAttentionSubBlocks,
   getModelArchitecture,
   MODEL_ARCHITECTURES,
 } from './model-architectures';
@@ -721,6 +722,43 @@ describe('getFFNSubBlocks', () => {
   });
 });
 
+describe('getHybridAttentionSubBlocks', () => {
+  it('exposes the sliding-window branch as an explicit block for DeepSeek V4', () => {
+    const arch = getModelArchitecture(Model.DeepSeek_V4_Pro)!;
+    const [hca, csa] = arch.alternatingLayers!;
+
+    const csaFlow = getHybridAttentionSubBlocks(arch, csa);
+    expect(csaFlow.layout).toBe('parallel');
+    if (csaFlow.layout !== 'parallel') return;
+    expect(csaFlow.leftLabel).toBe('Local');
+    expect(csaFlow.leftPath[0].name).toBe('Sliding Window');
+    expect(csaFlow.leftPath[0].detail).toContain('128');
+    // CSA compressed branch runs the lightning indexer (sparse top-k)
+    expect(csaFlow.rightPath.some((b) => b.name === 'Lightning Indexer')).toBe(true);
+    expect(csaFlow.mergeBlocks[0].name).toContain('MQA');
+    expect(csaFlow.mergeBlocks.at(-1)?.name).toBe('Output Projection');
+
+    const hcaFlow = getHybridAttentionSubBlocks(arch, hca);
+    if (hcaFlow.layout !== 'parallel') return;
+    expect(hcaFlow.leftPath[0].name).toBe('Sliding Window');
+    // HCA compressed branch is heavy compression (no sparse indexer)
+    expect(hcaFlow.rightPath.some((b) => b.name === 'Lightning Indexer')).toBe(false);
+    expect(hcaFlow.rightPath[0].name).toBe('Heavy Compression');
+  });
+
+  it('all hybrid sub-blocks have valid types', () => {
+    const arch = getModelArchitecture(Model.DeepSeek_V4_Pro)!;
+    const validTypes = ['projection', 'activation', 'operation', 'attention'];
+    for (const spec of arch.alternatingLayers!) {
+      const flow = getHybridAttentionSubBlocks(arch, spec);
+      for (const block of getAllBlocks(flow)) {
+        expect(validTypes).toContain(block.type);
+        expect(block.name.length).toBeGreaterThan(0);
+      }
+    }
+  });
+});
+
 /** Helper: get all blocks from a flow (flat list for easy assertions) */
 function getAllBlocks(flow: SubBlockFlow) {
   if (flow.layout === 'sequential') return flow.blocks;
diff --git a/packages/app/src/lib/model-architectures.ts b/packages/app/src/lib/model-architectures.ts
index 40fd1995..42cebcee 100644
--- a/packages/app/src/lib/model-architectures.ts
+++ b/packages/app/src/lib/model-architectures.ts
@@ -558,3 +558,55 @@ export function getFFNSubBlocks(
     ],
   };
 }
+
+/**
+ * Hybrid attention sub-blocks (DeepSeek V4-style CSA / HCA layers).
+ *
+ * Unlike a standard GQA layer, every hybrid attention layer fuses two KV
+ * sources for each query: a local sliding-window branch (recent uncompressed
+ * tokens) and a compressed-KV branch, combined by a shared-KV MQA with a
+ * learnable attention sink. The compressed branch depends on the layer type —
+ * CSA runs a lightning indexer (sparse top-k) over lightly compressed KV, while
+ * HCA attends densely over heavily compressed KV. Rendering this as a flow makes
+ * the sliding-window attention an explicit, visible block rather than a one-line
+ * `window=N` annotation.
+ */
+export function getHybridAttentionSubBlocks(
+  arch: ModelArchitecture,
+  spec: AlternatingLayerSpec,
+): SubBlockFlow {
+  const win = spec.slidingWindow ?? arch.slidingWindow;
+  const isSparse = /sparse/iu.test(spec.label);
+  const compressedPath: ArchSubBlock[] = isSparse
+    ? [
+        { name: 'Token Compression', detail: '1 entry / 4 tokens', type: 'operation' },
+        { name: 'Lightning Indexer', detail: 'sparse top-1024', type: 'attention' },
+      ]
+    : [{ name: 'Heavy Compression', detail: '1 entry / 128 tokens', type: 'attention' }];
+
+  return {
+    layout: 'parallel',
+    leftLabel: 'Local',
+    rightLabel: 'Compressed',
+    leftPath: [
+      {
+        name: 'Sliding Window',
+        detail: win ? `last ${win} tokens` : 'local KV',
+        type: 'attention',
+      },
+    ],
+    rightPath: compressedPath,
+    mergeBlocks: [
+      {
+        name: 'Shared-KV MQA + Sink',
+        detail: arch.numHeads ? `${arch.numHeads} heads · 1 KV` : undefined,
+        type: 'attention',
+      },
+      {
+        name: 'Output Projection',
+        detail: arch.hiddenSize ? `→ ${arch.hiddenSize.toLocaleString()}` : undefined,
+        type: 'projection',
+      },
+    ],
+  };
+}

From 6371aebd64662ac7ab8b51f6ecfecff7feb1c48c Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 8 Jun 2026 01:55:00 -0700
Subject: [PATCH 03/10] fix(inference): balance hybrid attention drill-down,
 center circle glyphs

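Addresses three rendering issues in the model architecture diagram (an
unbalanced hybrid drill-down, low-sitting circle glyphs, and a residual
bypass line that read as misaligned with the "+") via two changes: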

- Hybrid (CSA/HCA) attention now drills down into symmetric 2x2 columns:
  Local (Sliding Window + Attention Sink) beside the two-stage Compressed
  branch (compression + selector). This removes the lonely long connector
  that made the expanded box look unbalanced and promotes the attention
  sink to an explicit block; the merge block is now plain "Shared-KV MQA".

- The +, -, and x symbols inside merge/expand circles are drawn as
  geometric strokes instead of <text>. Font baseline drift (even with dy
  tuning) left the glyph sitting slightly low, which also made the residual
  bypass line read as misaligned with the "+". The strokes are centered on
  the circle's center, so the residual line is now co-linear with the arm.
---
 .../inference/ui/ModelArchitectureDiagram.tsx | 81 +++++++------------
 .../app/src/lib/model-architectures.test.ts   | 14 +++-
 packages/app/src/lib/model-architectures.ts   | 32 +++++---
 3 files changed, 66 insertions(+), 61 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
index 987afee3..0ed6ed4e 100644
--- a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
+++ b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
@@ -493,6 +493,32 @@ function renderDiagram(
       .attr('marker-end', 'url(#arch-arrow)');
   }
 
+  // Draw a +, −, or × glyph from geometric strokes, perfectly centered at
+  // (gx, gy). Rendering these as lines instead of <text> sidesteps font
+  // baseline drift, which left the symbol sitting slightly low inside the
+  // merge / expand circles regardless of dy tuning.
+  function drawCircleGlyph(gx: number, gy: number, color: string, symbol: string) {
+    const arm = 5;
+    const seg = (x1: number, y1: number, x2: number, y2: number) =>
+      g
+        .append('line')
+        .attr('x1', x1)
+        .attr('y1', y1)
+        .attr('x2', x2)
+        .attr('y2', y2)
+        .attr('stroke', color)
+        .attr('stroke-width', 2)
+        .attr('stroke-linecap', 'round')
+        .style('pointer-events', 'none');
+    if (symbol === '×') {
+      seg(gx - arm, gy - arm, gx + arm, gy + arm);
+      seg(gx - arm, gy + arm, gx + arm, gy - arm);
+    } else {
+      seg(gx - arm, gy, gx + arm, gy); // horizontal arm (+ and −)
+      if (symbol !== '−') seg(gx, gy - arm, gx, gy + arm); // vertical arm (+ only)
+    }
+  }
+
   function drawResidualBypass(branchY: number, mergeY: number) {
     // Tap the residual from the input stream ABOVE the norm (in the arrow gap)
     // so the horizontal connector doesn't run across the RMSNorm block.
@@ -514,16 +540,7 @@ function renderDiagram(
       .attr('fill', bgSubtle)
       .attr('stroke', mutedFg)
       .attr('stroke-width', 1.5);
-    g.append('text')
-      .attr('x', cx)
-      .attr('y', mergeY)
-      .attr('text-anchor', 'middle')
-      .attr('dy', '0.35em')
-      .attr('fill', fg)
-      .attr('font-size', '14px')
-      .attr('font-weight', 700)
-      .attr('font-family', 'inherit')
-      .text('+');
+    drawCircleGlyph(cx, mergeY, fg, '+');
   }
 
   function drawBlock(
@@ -629,16 +646,7 @@ function renderDiagram(
       .attr('stroke-width', 1)
       .style('pointer-events', 'none');
 
-    g.append('text')
-      .attr('x', iconX)
-      .attr('y', iconY)
-      .attr('text-anchor', 'middle')
-      .attr('dy', '0.35em')
-      .attr('fill', c.stroke)
-      .attr('font-size', '14px')
-      .attr('font-weight', 700)
-      .style('pointer-events', 'none')
-      .text(isBlockExpanded ? '\u2212' : '+');
+    drawCircleGlyph(iconX, iconY, c.stroke, isBlockExpanded ? '\u2212' : '+');
 
     g.append('rect')
       .attr('x', x)
@@ -917,16 +925,7 @@ function renderDiagram(
           .attr('fill', bgSubtle)
           .attr('stroke', mutedFg)
           .attr('stroke-width', 1.5);
-        g.append('text')
-          .attr('x', mergeCx)
-          .attr('y', circleCy)
-          .attr('text-anchor', 'middle')
-          .attr('dy', '0.35em')
-          .attr('fill', fg)
-          .attr('font-size', '14px')
-          .attr('font-weight', 700)
-          .attr('font-family', 'inherit')
-          .text(block.circleSymbol);
+        drawCircleGlyph(mergeCx, circleCy, fg, block.circleSymbol);
       } else {
         drawSingleSubBlock(block, subInnerXLocal, msy, subInnerWLocal);
       }
@@ -1286,16 +1285,7 @@ function renderDiagram(
       .attr('stroke', borderColor)
       .attr('stroke-width', 1)
       .style('pointer-events', 'none');
-    g.append('text')
-      .attr('x', iconX)
-      .attr('y', iconY)
-      .attr('text-anchor', 'middle')
-      .attr('dy', '0.35em')
-      .attr('fill', mutedFg)
-      .attr('font-size', '14px')
-      .attr('font-weight', 700)
-      .style('pointer-events', 'none')
-      .text('+');
+    drawCircleGlyph(iconX, iconY, mutedFg, '+');
 
     g.append('rect')
       .attr('x', x)
@@ -1598,16 +1588,7 @@ function renderDiagram(
       .attr('stroke', ec.stroke)
       .attr('stroke-width', 1)
       .style('pointer-events', 'none');
-    g.append('text')
-      .attr('x', expIconX)
-      .attr('y', expIconY)
-      .attr('text-anchor', 'middle')
-      .attr('dy', '0.35em')
-      .attr('fill', ec.stroke)
-      .attr('font-size', '14px')
-      .attr('font-weight', 700)
-      .style('pointer-events', 'none')
-      .text(isExpExpanded ? '\u2212' : '+');
+    drawCircleGlyph(expIconX, expIconY, ec.stroke, isExpExpanded ? '\u2212' : '+');
 
     g.append('rect')
       .attr('x', innerX)
diff --git a/packages/app/src/lib/model-architectures.test.ts b/packages/app/src/lib/model-architectures.test.ts
index 57412d33..247b735d 100644
--- a/packages/app/src/lib/model-architectures.test.ts
+++ b/packages/app/src/lib/model-architectures.test.ts
@@ -731,19 +731,31 @@ describe('getHybridAttentionSubBlocks', () => {
     expect(csaFlow.layout).toBe('parallel');
     if (csaFlow.layout !== 'parallel') return;
     expect(csaFlow.leftLabel).toBe('Local');
+    // Local branch shows the sliding window AND the always-attended sink as two
+    // explicit blocks, balancing the two-stage compressed branch (no lonely
+    // long connector line).
     expect(csaFlow.leftPath[0].name).toBe('Sliding Window');
     expect(csaFlow.leftPath[0].detail).toContain('128');
+    expect(csaFlow.leftPath[1].name).toBe('Attention Sink');
+    expect(csaFlow.leftPath).toHaveLength(2);
     // CSA compressed branch runs the lightning indexer (sparse top-k)
     expect(csaFlow.rightPath.some((b) => b.name === 'Lightning Indexer')).toBe(true);
-    expect(csaFlow.mergeBlocks[0].name).toContain('MQA');
+    // Columns are balanced so the parallel flow renders symmetrically
+    expect(csaFlow.rightPath).toHaveLength(csaFlow.leftPath.length);
+    // Sink now lives in the local branch, so the merge block is plain MQA
+    expect(csaFlow.mergeBlocks[0].name).toBe('Shared-KV MQA');
     expect(csaFlow.mergeBlocks.at(-1)?.name).toBe('Output Projection');
 
     const hcaFlow = getHybridAttentionSubBlocks(arch, hca);
     if (hcaFlow.layout !== 'parallel') return;
     expect(hcaFlow.leftPath[0].name).toBe('Sliding Window');
+    expect(hcaFlow.leftPath[1].name).toBe('Attention Sink');
     // HCA compressed branch is heavy compression (no sparse indexer)
     expect(hcaFlow.rightPath.some((b) => b.name === 'Lightning Indexer')).toBe(false);
     expect(hcaFlow.rightPath[0].name).toBe('Heavy Compression');
+    expect(hcaFlow.rightPath[1].name).toBe('Compressed Attn');
+    // Balanced here too
+    expect(hcaFlow.rightPath).toHaveLength(hcaFlow.leftPath.length);
   });
 
   it('all hybrid sub-blocks have valid types', () => {
diff --git a/packages/app/src/lib/model-architectures.ts b/packages/app/src/lib/model-architectures.ts
index 42cebcee..cb3a11ab 100644
--- a/packages/app/src/lib/model-architectures.ts
+++ b/packages/app/src/lib/model-architectures.ts
@@ -577,29 +577,41 @@ export function getHybridAttentionSubBlocks(
 ): SubBlockFlow {
   const win = spec.slidingWindow ?? arch.slidingWindow;
   const isSparse = /sparse/iu.test(spec.label);
+
+  // Local branch (both variants): a sliding window over recent tokens plus the
+  // always-attended sink tokens (StreamingLLM-style). Keeping these as two
+  // explicit blocks balances the flow against the two-stage compressed branch.
+  const localPath: ArchSubBlock[] = [
+    {
+      name: 'Sliding Window',
+      detail: win ? `last ${win} tokens` : 'local KV',
+      type: 'attention',
+    },
+    { name: 'Attention Sink', detail: 'first tokens', type: 'operation' },
+  ];
+
+  // Compressed branch: CSA lightly compresses then sparsely selects via the
+  // lightning indexer; HCA compresses heavily then attends over the latent KV.
   const compressedPath: ArchSubBlock[] = isSparse
     ? [
         { name: 'Token Compression', detail: '1 entry / 4 tokens', type: 'operation' },
         { name: 'Lightning Indexer', detail: 'sparse top-1024', type: 'attention' },
       ]
-    : [{ name: 'Heavy Compression', detail: '1 entry / 128 tokens', type: 'attention' }];
+    : [
+        { name: 'Heavy Compression', detail: '1 entry / 128 tokens', type: 'operation' },
+        { name: 'Compressed Attn', detail: 'over latent KV', type: 'attention' },
+      ];
 
   return {
     layout: 'parallel',
     leftLabel: 'Local',
     rightLabel: 'Compressed',
-    leftPath: [
-      {
-        name: 'Sliding Window',
-        detail: win ? `last ${win} tokens` : 'local KV',
-        type: 'attention',
-      },
-    ],
+    leftPath: localPath,
     rightPath: compressedPath,
     mergeBlocks: [
       {
-        name: 'Shared-KV MQA + Sink',
-        detail: arch.numHeads ? `${arch.numHeads} heads · 1 KV` : undefined,
+        name: 'Shared-KV MQA',
+        detail: arch.numHeads ? `${arch.numHeads} heads · ${arch.numKVHeads ?? 1} KV` : undefined,
         type: 'attention',
       },
       {

From cea961b8cfa0a205653847b1a88995a3d83cd622 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 8 Jun 2026 11:15:50 -0700
Subject: [PATCH 04/10] fix(inference): represent the hybrid attention sink
 accurately
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The sliding-window and compressed branches are two KV *sources* whose
selected indices are unioned into a single shared-KV MQA softmax — not two
attentions merged after the fact. The attention sink is a per-head learnable
softmax-denominator bias on that MQA (model.py attn_sink / kernel.py
sum_exp += exp(attn_sink - max)), not literal "first tokens" in the local
branch.

- Local branch is just the sliding-window source (one block); the sink moves
  back onto the merge block as "Shared-KV MQA + Sink".
- CSA compressed branch = Token Compression -> Lightning Indexer (2 stages);
  HCA = a single Heavy Compression source. This makes CSA a 1-vs-2 split
  again.
- Center each column within the shared column area in drawParallelFlow so an
  unequal split reads as an intentional branch merge instead of leaving the
  shorter column's connector dangling as a long unattached line. Also
  improves the 2-vs-1 SwiGLU expert merge.
---
 .../inference/ui/ModelArchitectureDiagram.tsx | 25 ++++++++------
 .../app/src/lib/model-architectures.test.ts   | 33 +++++++++----------
 packages/app/src/lib/model-architectures.ts   | 22 ++++++-------
 3 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
index 0ed6ed4e..37cf9ff1 100644
--- a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
+++ b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
@@ -790,6 +790,16 @@ function renderDiagram(
     const parallelStartY = sy;
     const colFontSize = { name: '10px', detail: '8px' };
 
+    const maxRows = Math.max(flow.leftPath.length, flow.rightPath.length);
+    const rowsH = (n: number) => n * subBlockH + Math.max(0, n - 1) * subArrowH;
+    const colAreaH = rowsH(maxRows);
+    // Vertically center each column within the shared column area so an unequal
+    // split (e.g. 1 local vs 2 compressed) reads as an intentional branch merge
+    // rather than leaving the shorter column's connector dangling as a long
+    // unattached line beside the taller column.
+    const leftStartY = parallelStartY + (colAreaH - rowsH(flow.leftPath.length)) / 2;
+    const rightStartY = parallelStartY + (colAreaH - rowsH(flow.rightPath.length)) / 2;
+
     g.append('line')
       .attr('x1', mergeCx)
       .attr('y1', splitTopY)
@@ -799,10 +809,7 @@ function renderDiagram(
       .attr('stroke-width', 1);
 
     g.append('path')
-      .attr(
-        'd',
-        `M ${mergeCx} ${splitMidY} L ${leftCx} ${splitMidY} L ${leftCx} ${parallelStartY - 2}`,
-      )
+      .attr('d', `M ${mergeCx} ${splitMidY} L ${leftCx} ${splitMidY} L ${leftCx} ${leftStartY - 2}`)
       .attr('fill', 'none')
       .attr('stroke', mutedFg)
       .attr('stroke-width', 1)
@@ -811,14 +818,14 @@ function renderDiagram(
     g.append('path')
       .attr(
         'd',
-        `M ${mergeCx} ${splitMidY} L ${rightCx} ${splitMidY} L ${rightCx} ${parallelStartY - 2}`,
+        `M ${mergeCx} ${splitMidY} L ${rightCx} ${splitMidY} L ${rightCx} ${rightStartY - 2}`,
       )
       .attr('fill', 'none')
       .attr('stroke', mutedFg)
       .attr('stroke-width', 1)
       .attr('marker-end', 'url(#arch-arrow-sub)');
 
-    let lsy = parallelStartY;
+    let lsy = leftStartY;
     for (let i = 0; i < flow.leftPath.length; i++) {
       drawSingleSubBlock(flow.leftPath[i], leftX, lsy, colW, colFontSize);
       lsy += subBlockH;
@@ -836,7 +843,7 @@ function renderDiagram(
     }
     const leftEndY = lsy;
 
-    let rsy = parallelStartY;
+    let rsy = rightStartY;
     for (let i = 0; i < flow.rightPath.length; i++) {
       drawSingleSubBlock(flow.rightPath[i], rightX, rsy, colW, colFontSize);
       rsy += subBlockH;
@@ -854,9 +861,7 @@ function renderDiagram(
     }
     const rightEndY = rsy;
 
-    const maxRows = Math.max(flow.leftPath.length, flow.rightPath.length);
-    const mergeStartY =
-      parallelStartY + maxRows * subBlockH + Math.max(0, maxRows - 1) * subArrowH + subArrowH + 4;
+    const mergeStartY = parallelStartY + colAreaH + subArrowH + 4;
 
     const subInnerXLocal = x + 16;
     const subInnerWLocal = w - 40;
diff --git a/packages/app/src/lib/model-architectures.test.ts b/packages/app/src/lib/model-architectures.test.ts
index 247b735d..5aaea343 100644
--- a/packages/app/src/lib/model-architectures.test.ts
+++ b/packages/app/src/lib/model-architectures.test.ts
@@ -731,31 +731,30 @@ describe('getHybridAttentionSubBlocks', () => {
     expect(csaFlow.layout).toBe('parallel');
     if (csaFlow.layout !== 'parallel') return;
     expect(csaFlow.leftLabel).toBe('Local');
-    // Local branch shows the sliding window AND the always-attended sink as two
-    // explicit blocks, balancing the two-stage compressed branch (no lonely
-    // long connector line).
+    // Local branch is the sliding-window KV source (one explicit block). The
+    // sink is NOT here — it is a learnable softmax bias on the shared MQA.
     expect(csaFlow.leftPath[0].name).toBe('Sliding Window');
     expect(csaFlow.leftPath[0].detail).toContain('128');
-    expect(csaFlow.leftPath[1].name).toBe('Attention Sink');
-    expect(csaFlow.leftPath).toHaveLength(2);
-    // CSA compressed branch runs the lightning indexer (sparse top-k)
-    expect(csaFlow.rightPath.some((b) => b.name === 'Lightning Indexer')).toBe(true);
-    // Columns are balanced so the parallel flow renders symmetrically
-    expect(csaFlow.rightPath).toHaveLength(csaFlow.leftPath.length);
-    // Sink now lives in the local branch, so the merge block is plain MQA
-    expect(csaFlow.mergeBlocks[0].name).toBe('Shared-KV MQA');
+    expect(csaFlow.leftPath).toHaveLength(1);
+    expect(csaFlow.leftPath.some((b) => b.name === 'Attention Sink')).toBe(false);
+    // CSA compressed branch: light compression then the learned lightning
+    // indexer (sparse top-k) — two stages.
+    expect(csaFlow.rightPath.map((b) => b.name)).toEqual([
+      'Token Compression',
+      'Lightning Indexer',
+    ]);
+    // The fused attention is a single shared-KV MQA that carries the sink
+    expect(csaFlow.mergeBlocks[0].name).toBe('Shared-KV MQA + Sink');
     expect(csaFlow.mergeBlocks.at(-1)?.name).toBe('Output Projection');
 
     const hcaFlow = getHybridAttentionSubBlocks(arch, hca);
     if (hcaFlow.layout !== 'parallel') return;
     expect(hcaFlow.leftPath[0].name).toBe('Sliding Window');
-    expect(hcaFlow.leftPath[1].name).toBe('Attention Sink');
-    // HCA compressed branch is heavy compression (no sparse indexer)
+    expect(hcaFlow.leftPath).toHaveLength(1);
+    // HCA compressed branch is a single heavy-compression source (no indexer)
     expect(hcaFlow.rightPath.some((b) => b.name === 'Lightning Indexer')).toBe(false);
-    expect(hcaFlow.rightPath[0].name).toBe('Heavy Compression');
-    expect(hcaFlow.rightPath[1].name).toBe('Compressed Attn');
-    // Balanced here too
-    expect(hcaFlow.rightPath).toHaveLength(hcaFlow.leftPath.length);
+    expect(hcaFlow.rightPath.map((b) => b.name)).toEqual(['Heavy Compression']);
+    expect(hcaFlow.mergeBlocks[0].name).toBe('Shared-KV MQA + Sink');
   });
 
   it('all hybrid sub-blocks have valid types', () => {
diff --git a/packages/app/src/lib/model-architectures.ts b/packages/app/src/lib/model-architectures.ts
index cb3a11ab..76bbc765 100644
--- a/packages/app/src/lib/model-architectures.ts
+++ b/packages/app/src/lib/model-architectures.ts
@@ -578,29 +578,26 @@ export function getHybridAttentionSubBlocks(
   const win = spec.slidingWindow ?? arch.slidingWindow;
   const isSparse = /sparse/iu.test(spec.label);
 
-  // Local branch (both variants): a sliding window over recent tokens plus the
-  // always-attended sink tokens (StreamingLLM-style). Keeping these as two
-  // explicit blocks balances the flow against the two-stage compressed branch.
+  // Both branches are KV *sources* whose selected indices are unioned and fed to
+  // a single shared-KV MQA softmax — they are not two attentions merged after
+  // the fact. The local branch contributes the recent sliding-window tokens; the
+  // compressed branch contributes selected long-range tokens. CSA lightly
+  // compresses (1/4) then sparsely selects via the learned lightning indexer;
+  // HCA compresses heavily (1/128) and keeps the few resulting entries.
   const localPath: ArchSubBlock[] = [
     {
       name: 'Sliding Window',
       detail: win ? `last ${win} tokens` : 'local KV',
       type: 'attention',
     },
-    { name: 'Attention Sink', detail: 'first tokens', type: 'operation' },
   ];
 
-  // Compressed branch: CSA lightly compresses then sparsely selects via the
-  // lightning indexer; HCA compresses heavily then attends over the latent KV.
   const compressedPath: ArchSubBlock[] = isSparse
     ? [
         { name: 'Token Compression', detail: '1 entry / 4 tokens', type: 'operation' },
         { name: 'Lightning Indexer', detail: 'sparse top-1024', type: 'attention' },
       ]
-    : [
-        { name: 'Heavy Compression', detail: '1 entry / 128 tokens', type: 'operation' },
-        { name: 'Compressed Attn', detail: 'over latent KV', type: 'attention' },
-      ];
+    : [{ name: 'Heavy Compression', detail: '1 entry / 128 tokens', type: 'attention' }];
 
   return {
     layout: 'parallel',
@@ -608,9 +605,12 @@ export function getHybridAttentionSubBlocks(
     rightLabel: 'Compressed',
     leftPath: localPath,
     rightPath: compressedPath,
+    // The union of both branches' indices is consumed by one MQA softmax that
+    // carries a per-head learnable attention sink (a softmax-denominator bias,
+    // not literal sink tokens) — hence the sink lives on the MQA block here.
     mergeBlocks: [
       {
-        name: 'Shared-KV MQA',
+        name: 'Shared-KV MQA + Sink',
         detail: arch.numHeads ? `${arch.numHeads} heads · ${arch.numKVHeads ?? 1} KV` : undefined,
         type: 'attention',
       },

From 5df50ee35256e7306717cd9507a90eb1de127fca Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 8 Jun 2026 11:26:43 -0700
Subject: [PATCH 05/10] feat(inference): caption hybrid attention drill-down as
 one fused softmax
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the DeepSeek V4 hybrid attention drill-down is expanded, show a short
note clarifying that the Local (sliding-window) and Compressed (CSA/HCA)
columns are two KV *sources* unioned into a single shared-KV MQA softmax —
not two separate attentions that get summed — with the attention sink being
a learnable per-head softmax-denominator bias. Prevents the parallel-column
schematic from being read as two independent attention paths.

Shown only while a hybrid attention block is expanded; covered by an e2e
assertion.
---
 packages/app/cypress/e2e/model-architecture.cy.ts   |  6 ++++++
 .../inference/ui/ModelArchitectureDiagram.tsx       | 13 +++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/packages/app/cypress/e2e/model-architecture.cy.ts b/packages/app/cypress/e2e/model-architecture.cy.ts
index 93e9d485..4d9c5b1c 100644
--- a/packages/app/cypress/e2e/model-architecture.cy.ts
+++ b/packages/app/cypress/e2e/model-architecture.cy.ts
@@ -350,10 +350,16 @@ describe('Model Architecture Diagram', () => {
     it('Hybrid attention is expandable and drills down to a Sliding Window block', () => {
       cy.get('[data-testid="expand-altBlock0"]').click({ force: true });
       cy.get('[data-testid="collapse-altBlock0"]').should('exist');
+      // The union-softmax caption only appears once the attention drill-down is open
+      cy.get('[data-testid="hybrid-attention-note"]').should('not.exist');
       // Hybrid attention drills down (unlike gpt-oss sink/full GQA, which does not)
       cy.get('[data-testid="expand-altAttention0"]').should('exist');
       cy.get('[data-testid="expand-altAttention0"]').click({ force: true });
       cy.get('[data-testid="model-architecture-svg"]').should('be.visible');
+      // Caption clarifies the two branches feed one softmax (not two attentions)
+      cy.get('[data-testid="hybrid-attention-note"]')
+        .should('be.visible')
+        .and('contain', 'single softmax');
       // Expert grid still expandable within the block
       cy.get('[data-testid="expand-altExperts0"]').should('exist');
     });
diff --git a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
index 37cf9ff1..e63a7b46 100644
--- a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
+++ b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
@@ -2132,6 +2132,19 @@ export default function ModelArchitectureDiagram({
       >
         <div ref={containerRef} className="px-4 pb-4">
           <svg ref={svgRef} className="w-full" data-testid="model-architecture-svg" />
+          {arch.attentionType === 'Hybrid' &&
+            (expandedBlocks.has('altAttention0') || expandedBlocks.has('altAttention1')) && (
+              <p
+                className="mt-2 text-[11px] leading-snug text-muted-foreground"
+                data-testid="hybrid-attention-note"
+              >
+                <span className="font-medium text-foreground">Local</span> and{' '}
+                <span className="font-medium text-foreground">Compressed</span> are two KV sources,
+                not two separate attentions: each query attends in a{' '}
+                <span className="font-medium text-foreground">single softmax</span> to the union of
+                sliding-window + selected compressed keys, with a learnable per-head attention sink.
+              </p>
+            )}
           {arch.features && arch.features.length > 0 && (
             <div className="mt-3 pt-3 border-t border-border/50">
               <div className="flex flex-wrap gap-1.5 items-center">

From 3c555773a6df7659d1b2838f830435b50d9e8857 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 8 Jun 2026 11:43:32 -0700
Subject: [PATCH 06/10] fix(inference): rename SWA feature label so it isn't
 read as a separate attention type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 128-token sliding window is the shared local base of every hybrid layer
(both HCA and CSA extend it; the final layer runs it alone) — not a third
attention type alongside CSA/HCA. Rename the features badge from
"Sliding Window Attention (128 tokens)" to "Sliding window (128 tokens)" so it
reads as a windowing mechanism rather than a standalone attention. The
drill-down's "Sliding Window" KV-source block is unchanged.
---
 packages/app/cypress/e2e/model-architecture.cy.ts | 2 +-
 packages/app/src/lib/model-architectures.test.ts  | 2 +-
 packages/app/src/lib/model-architectures.ts       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/app/cypress/e2e/model-architecture.cy.ts b/packages/app/cypress/e2e/model-architecture.cy.ts
index 4d9c5b1c..ea3f06f9 100644
--- a/packages/app/cypress/e2e/model-architecture.cy.ts
+++ b/packages/app/cypress/e2e/model-architecture.cy.ts
@@ -371,7 +371,7 @@ describe('Model Architecture Diagram', () => {
 
     it('shows DeepSeek V4 Pro features (incl. sliding window) and developer info', () => {
       cy.contains('Hybrid CSA + HCA Attention').should('be.visible');
-      cy.contains('Sliding Window Attention (128 tokens)').should('be.visible');
+      cy.contains('Sliding window (128 tokens)').should('be.visible');
       cy.contains('Released by DeepSeek').should('be.visible');
     });
   });
diff --git a/packages/app/src/lib/model-architectures.test.ts b/packages/app/src/lib/model-architectures.test.ts
index 5aaea343..93dec453 100644
--- a/packages/app/src/lib/model-architectures.test.ts
+++ b/packages/app/src/lib/model-architectures.test.ts
@@ -189,7 +189,7 @@ describe('getModelArchitecture', () => {
   it('DeepSeek V4 Pro surfaces sliding-window attention and hybrid components in features', () => {
     const arch = getModelArchitecture(Model.DeepSeek_V4_Pro);
     expect(arch?.features).toBeDefined();
-    expect(arch?.features).toContain('Sliding Window Attention (128 tokens)');
+    expect(arch?.features).toContain('Sliding window (128 tokens)');
     expect(arch?.features).toContain('Hybrid CSA + HCA Attention');
     expect(arch?.features).toContain('Attention Sink');
     expect(arch?.features).toContain('Multi-Token Prediction');
diff --git a/packages/app/src/lib/model-architectures.ts b/packages/app/src/lib/model-architectures.ts
index 76bbc765..19c952c7 100644
--- a/packages/app/src/lib/model-architectures.ts
+++ b/packages/app/src/lib/model-architectures.ts
@@ -176,7 +176,7 @@ export const MODEL_ARCHITECTURES: Partial<Record<Model, ModelArchitecture>> = {
     contextWindow: 1048576, // 1M
     features: [
       'Hybrid CSA + HCA Attention',
-      'Sliding Window Attention (128 tokens)',
+      'Sliding window (128 tokens)',
       'Attention Sink',
       'MLA-style Shared-KV MQA',
       'Lightning Indexer (sparse top-k)',

From 5bfe4decf2ea584fed8663eb73444aa986aa9733 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 8 Jun 2026 14:26:31 -0700
Subject: [PATCH 07/10] feat(inference): draw hash-routed prefix block and mHC
 hyper-connections for V4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two DeepSeek V4 architectural facts were only feature badges; surface them in
the diagram structure.

Hash-routed layers (num_hash_layers=3): the first 3 MoE layers route by token
id, not a learned gate. They now render as a separate stacked prefix block
(between embedding and the alternating blocks) with a "Hash Router" instead of
"MoE Router". Alternating HCA/CSA counts drop 31/30 → 29/29 so they describe the
learned-router layers (3 + 29 + 29 = 61); drawExpertGrid gains optional
routerLabel / routerSub params.

mHC (hc_mult=4): residuals are replaced by 4 parallel hyper-connection streams
with learned, Sinkhorn-normalized A/B/C mixing. Residual merges now render as an
"mHC ×N" mixer node instead of a plain "+" when arch.hyperConnections > 1, plus
a caption shown while a block exposing the nodes is expanded. Models without
hyper-connections keep the "+" residual.

Adds arch fields hashRoutedLayers and hyperConnections; unit + e2e coverage.
---
 .../app/cypress/e2e/model-architecture.cy.ts  |  14 +
 .../inference/ui/ModelArchitectureDiagram.tsx | 252 +++++++++++++++++-
 .../app/src/lib/model-architectures.test.ts   |  16 +-
 packages/app/src/lib/model-architectures.ts   |  28 +-
 4 files changed, 285 insertions(+), 25 deletions(-)

diff --git a/packages/app/cypress/e2e/model-architecture.cy.ts b/packages/app/cypress/e2e/model-architecture.cy.ts
index ea3f06f9..3547c500 100644
--- a/packages/app/cypress/e2e/model-architecture.cy.ts
+++ b/packages/app/cypress/e2e/model-architecture.cy.ts
@@ -347,6 +347,20 @@ describe('Model Architecture Diagram', () => {
       cy.get('[data-testid="alternating-indicator"]').should('exist');
     });
 
+    it('shows a hash-routed MoE prefix block; mHC caption appears once a block is open', () => {
+      // First 3 layers render as a separate hash-routed prefix block
+      cy.get('[data-testid="expand-hashBlock"]').should('exist');
+      // mHC caption only appears once a block exposing the mixer nodes is expanded
+      cy.get('[data-testid="mhc-note"]').should('not.exist');
+      cy.get('[data-testid="expand-hashBlock"]').click({ force: true });
+      cy.get('[data-testid="collapse-hashBlock"]').should('exist');
+      cy.get('[data-testid="model-architecture-svg"]').contains('Hash Router').should('exist');
+      cy.get('[data-testid="mhc-note"]').should('be.visible').and('contain', 'Hyper-Connections');
+      // Restore collapsed state for subsequent tests (shared state: testIsolation off)
+      cy.get('[data-testid="collapse-hashBlock"]').click({ force: true });
+      cy.get('[data-testid="mhc-note"]').should('not.exist');
+    });
+
     it('Hybrid attention is expandable and drills down to a Sliding Window block', () => {
       cy.get('[data-testid="expand-altBlock0"]').click({ force: true });
       cy.get('[data-testid="collapse-altBlock0"]').should('exist');
diff --git a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
index e63a7b46..1740fc00 100644
--- a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
+++ b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
@@ -134,6 +134,11 @@ function renderDiagram(
   const denseAttnExpanded = isAttnExpandable && expandedBlocks.has('denseAttention');
   const denseFFNExpanded = expandedBlocks.has('denseFFN');
 
+  // Hash-routed MoE prefix block (DeepSeek V4: first N layers route by token id).
+  const hasHashBlock = isMoE && (arch.hashRoutedLayers ?? 0) > 0;
+  const hashBlockExpanded = hasHashBlock && expandedBlocks.has('hashBlock');
+  const hashExpertsExpanded = hasHashBlock && expandedBlocks.has('hashExperts');
+
   // Alternating block expand states (for models with alternating attention like gpt-oss)
   const altBlockExpanded = [
     hasAlternatingLayers && expandedBlocks.has('altBlock0'),
@@ -221,6 +226,7 @@ function renderDiagram(
     altAttnExpanded[0] && altAttnFlow[0] ? getFlowHeight(altAttnFlow[0], false) : 0,
     altAttnExpanded[1] && altAttnFlow[1] ? getFlowHeight(altAttnFlow[1], false) : 0,
   ];
+  const hashExpertsExpandedH = hashExpertsExpanded ? getFlowHeight(ffnFlow, true) : 0;
 
   // Compute vertical positions
   let y = pad.top;
@@ -288,6 +294,57 @@ function renderDiagram(
     y += arrowH;
   }
 
+  // === HASH-ROUTED MoE PREFIX BLOCK (DeepSeek V4: first N layers use hash routing) ===
+  let hashTxStart = 0;
+  let hashNorm1Y = 0;
+  let hashAttnY = 0;
+  let hashMerge1Y = 0;
+  let hashNorm2Y = 0;
+  let hashExpertY = 0;
+  let hashFFNExpandedStartY = 0;
+  let hashMerge2Y = 0;
+  let hashTxEnd = 0;
+
+  if (hasHashBlock) {
+    hashTxStart = y;
+    if (hashBlockExpanded) {
+      y += 14;
+
+      hashNorm1Y = y;
+      y += smallH + arrowH;
+
+      hashAttnY = y;
+      y += blockH;
+
+      y += 4;
+      hashMerge1Y = y + mergeGap / 2;
+      y += mergeGap;
+
+      y += arrowH;
+      hashNorm2Y = y;
+      y += smallH + arrowH;
+
+      // Hash Router + Expert grid (drawExpertGrid lays out router above eY)
+      y += blockH + arrowH;
+      hashExpertY = y;
+      y += expertGridH;
+
+      hashFFNExpandedStartY = y;
+      if (hashExpertsExpanded) {
+        y += hashExpertsExpandedH;
+      }
+
+      hashMerge2Y = y + mergeGap / 2;
+      y += mergeGap;
+
+      y += 14;
+    } else {
+      y += collapsedTxH;
+    }
+    hashTxEnd = y;
+    y += arrowH;
+  }
+
   // === ALTERNATING TRANSFORMER BLOCKS (for models like gpt-oss with alternating attention) ===
   const altBlockStart = [0, 0];
   const altBlockEnd = [0, 0];
@@ -519,28 +576,59 @@ function renderDiagram(
     }
   }
 
+  // Models with hyper-connections (mHC) replace the plain residual add with a
+  // multi-stream mixer; when present, residual merges render as an "mHC ×N" pill
+  // instead of a "+" circle. N = number of parallel residual streams.
+  const hcStreams = arch.hyperConnections ?? 0;
+  const isHyperConn = hcStreams > 1;
+
   function drawResidualBypass(branchY: number, mergeY: number) {
     // Tap the residual from the input stream ABOVE the norm (in the arrow gap)
     // so the horizontal connector doesn't run across the RMSNorm block.
     const tapY = branchY - arrowH / 2;
+    // Keep the node's vertical half-height = circleR so the spine arrows (drawn
+    // by callers to mergeY ± circleR) still meet its top/bottom edges.
+    const nodeHalfW = isHyperConn ? 25 : circleR;
     bgG
       .append('path')
       .attr(
         'd',
-        `M ${cx} ${tapY} L ${residLeftX} ${tapY} L ${residLeftX} ${mergeY} L ${cx - circleR} ${mergeY}`,
+        `M ${cx} ${tapY} L ${residLeftX} ${tapY} L ${residLeftX} ${mergeY} L ${cx - nodeHalfW} ${mergeY}`,
       )
       .attr('fill', 'none')
       .attr('stroke', mutedFg)
       .attr('stroke-width', 1.5)
       .attr('opacity', 0.6);
-    g.append('circle')
-      .attr('cx', cx)
-      .attr('cy', mergeY)
-      .attr('r', circleR)
-      .attr('fill', bgSubtle)
-      .attr('stroke', mutedFg)
-      .attr('stroke-width', 1.5);
-    drawCircleGlyph(cx, mergeY, fg, '+');
+    if (isHyperConn) {
+      g.append('rect')
+        .attr('x', cx - nodeHalfW)
+        .attr('y', mergeY - circleR)
+        .attr('width', nodeHalfW * 2)
+        .attr('height', circleR * 2)
+        .attr('rx', circleR)
+        .attr('fill', bgSubtle)
+        .attr('stroke', mutedFg)
+        .attr('stroke-width', 1.5);
+      g.append('text')
+        .attr('x', cx)
+        .attr('y', mergeY)
+        .attr('text-anchor', 'middle')
+        .attr('dy', '0.35em')
+        .attr('fill', fg)
+        .attr('font-size', '8.5px')
+        .attr('font-weight', 700)
+        .attr('font-family', 'inherit')
+        .text(`mHC ×${hcStreams}`);
+    } else {
+      g.append('circle')
+        .attr('cx', cx)
+        .attr('cy', mergeY)
+        .attr('r', circleR)
+        .attr('fill', bgSubtle)
+        .attr('stroke', mutedFg)
+        .attr('stroke-width', 1.5);
+      drawCircleGlyph(cx, mergeY, fg, '+');
+    }
   }
 
   function drawBlock(
@@ -1345,7 +1433,13 @@ function renderDiagram(
   drawBlock(pad.left, embedY, bw, blockH, 'embedding', 'Token Embedding', embedSub || undefined);
   drawArrow(
     embedY + blockH,
-    hasDenseLayers ? denseTxStart : hasAlternatingLayers ? altBlockStart[0] : txStart,
+    hasDenseLayers
+      ? denseTxStart
+      : hasHashBlock
+        ? hashTxStart
+        : hasAlternatingLayers
+          ? altBlockStart[0]
+          : txStart,
   );
 
   // === DENSE TRANSFORMER BLOCK (for MoE models with initial dense layers) ===
@@ -1473,7 +1567,10 @@ function renderDiagram(
     }
 
     // Arrow from dense block to next block
-    drawArrow(denseTxEnd, hasAlternatingLayers ? altBlockStart[0] : txStart);
+    drawArrow(
+      denseTxEnd,
+      hasHashBlock ? hashTxStart : hasAlternatingLayers ? altBlockStart[0] : txStart,
+    );
   }
 
   // Compute labels
@@ -1491,11 +1588,15 @@ function renderDiagram(
     n2Y: number,
     m2Y: number,
     expertBlockId: string,
+    routerLabel = 'MoE Router',
+    routerSubOverride?: string,
   ) {
     const routedCount = arch.hasSharedExpert ? (arch.numExperts || 0) - 1 : arch.numExperts;
-    const routerSub = `Top-${arch.activeExperts} of ${routedCount} routed${arch.hasSharedExpert ? ' + 1 shared' : ''}`;
+    const routerSub =
+      routerSubOverride ??
+      `Top-${arch.activeExperts} of ${routedCount} routed${arch.hasSharedExpert ? ' + 1 shared' : ''}`;
     const rY = n2Y + smallH + arrowH;
-    drawBlock(innerX, rY, innerW, blockH, 'router', 'MoE Router', routerSub);
+    drawBlock(innerX, rY, innerW, blockH, 'router', routerLabel, routerSub);
     drawArrow(rY + blockH, rY + blockH + arrowH);
 
     const ec = getColor('expert', isDark);
@@ -1617,6 +1718,114 @@ function renderDiagram(
     drawResidualBypass(n2Y, m2Y);
   }
 
+  // === HASH-ROUTED MoE PREFIX BLOCK (DeepSeek V4: first N layers route by token id) ===
+  if (hasHashBlock) {
+    if (hashBlockExpanded) {
+      // Container
+      g.append('rect')
+        .attr('x', pad.left - 4)
+        .attr('y', hashTxStart)
+        .attr('width', bw + 8)
+        .attr('height', hashTxEnd - hashTxStart)
+        .attr('rx', 10)
+        .attr('fill', 'none')
+        .attr('stroke', borderColor)
+        .attr('stroke-width', 2)
+        .attr('stroke-dasharray', '6,3');
+
+      // Collapse badge
+      const hashBadge = `− ×${arch.hashRoutedLayers} layers`;
+      const hashBadgeW = hashBadge.length * 7 + 16;
+      g.append('rect')
+        .attr('x', width - pad.right - hashBadgeW - 4)
+        .attr('y', hashTxStart - 11)
+        .attr('width', hashBadgeW)
+        .attr('height', 22)
+        .attr('rx', 11)
+        .attr('fill', bgSubtle)
+        .attr('stroke', borderColor)
+        .attr('stroke-width', 1);
+      g.append('text')
+        .attr('x', width - pad.right - hashBadgeW / 2 - 4)
+        .attr('y', hashTxStart)
+        .attr('text-anchor', 'middle')
+        .attr('dy', '0.35em')
+        .attr('fill', mutedFg)
+        .attr('font-size', '11px')
+        .attr('font-weight', 600)
+        .attr('font-family', 'inherit')
+        .text(hashBadge);
+      g.append('rect')
+        .attr('x', width - pad.right - hashBadgeW - 4)
+        .attr('y', hashTxStart - 11)
+        .attr('width', hashBadgeW)
+        .attr('height', 22)
+        .attr('rx', 11)
+        .attr('fill', 'transparent')
+        .style('cursor', 'pointer')
+        .attr('data-testid', 'collapse-hashBlock')
+        .on('click', () => onBlockClick('hashBlock'));
+
+      // RMSNorm 1
+      drawBlock(innerX, hashNorm1Y, innerW, smallH, 'norm', 'RMSNorm');
+      drawArrow(hashNorm1Y + smallH, hashAttnY);
+
+      // Attention — static hybrid block (same attention stack as the rest)
+      const hashHeadSub = [
+        arch.numHeads ? `${arch.numHeads} heads` : null,
+        arch.numKVHeads ? `${arch.numKVHeads} KV heads` : null,
+      ]
+        .filter(Boolean)
+        .join('  ·  ');
+      drawBlock(
+        innerX,
+        hashAttnY,
+        innerW,
+        blockH,
+        'attention',
+        attnLabel,
+        hashHeadSub || undefined,
+      );
+
+      drawArrow(hashAttnY + blockH + 4, hashMerge1Y - circleR);
+      drawResidualBypass(hashNorm1Y, hashMerge1Y);
+      drawArrow(hashMerge1Y + circleR, hashNorm2Y);
+
+      // RMSNorm 2
+      drawBlock(innerX, hashNorm2Y, innerW, smallH, 'norm', 'RMSNorm');
+      drawArrow(hashNorm2Y + smallH, hashNorm2Y + smallH + arrowH);
+
+      // Hash Router + Expert grid (token-id → fixed experts, not a learned gate)
+      const hashRoutedCount = arch.hasSharedExpert ? (arch.numExperts || 0) - 1 : arch.numExperts;
+      const hashRouterSub = `token-id → ${arch.activeExperts} of ${hashRoutedCount}${arch.hasSharedExpert ? ' + 1 shared' : ''}`;
+      drawExpertGrid(
+        hashExpertY,
+        hashExpertsExpanded,
+        hashExpertsExpandedH,
+        hashFFNExpandedStartY,
+        hashNorm2Y,
+        hashMerge2Y,
+        'hashExperts',
+        'Hash Router',
+        hashRouterSub,
+      );
+    } else {
+      const hashSub = `×${arch.hashRoutedLayers} first layers · token-id → experts`;
+      drawCollapsedTransformerBlock(
+        pad.left,
+        hashTxStart,
+        bw,
+        collapsedTxH,
+        'Hash-Routed MoE',
+        hashSub,
+        'hashBlock',
+      );
+    }
+
+    // Arrow from the hash block to the first alternating block (or main transformer)
+    drawArrow(hashTxEnd, hasAlternatingLayers ? altBlockStart[0] : txStart);
+  }
+
   // === ALTERNATING TRANSFORMER BLOCKS (gpt-oss style) ===
   if (hasAlternatingLayers) {
     for (let bi = 0; bi < 2; bi++) {
@@ -2145,6 +2354,23 @@ export default function ModelArchitectureDiagram({
                 sliding-window + selected compressed keys, with a learnable per-head attention sink.
               </p>
             )}
+          {(arch.hyperConnections ?? 0) > 1 &&
+            ['altBlock0', 'altBlock1', 'hashBlock', 'transformer', 'denseTransformer'].some((id) =>
+              expandedBlocks.has(id),
+            ) && (
+              <p
+                className="mt-2 text-[11px] leading-snug text-muted-foreground"
+                data-testid="mhc-note"
+              >
+                <span className="font-medium text-foreground">
+                  Hyper-Connections (mHC ×{arch.hyperConnections})
+                </span>{' '}
+                replace each residual with {arch.hyperConnections} parallel streams combined by
+                learned, Sinkhorn-normalized weights — read ({arch.hyperConnections}→1), output, and
+                a {arch.hyperConnections}×{arch.hyperConnections} stream mix — shown as the mHC ×
+                {arch.hyperConnections} nodes.
+              </p>
+            )}
           {arch.features && arch.features.length > 0 && (
             <div className="mt-3 pt-3 border-t border-border/50">
               <div className="flex flex-wrap gap-1.5 items-center">
diff --git a/packages/app/src/lib/model-architectures.test.ts b/packages/app/src/lib/model-architectures.test.ts
index 93dec453..e55ba84e 100644
--- a/packages/app/src/lib/model-architectures.test.ts
+++ b/packages/app/src/lib/model-architectures.test.ts
@@ -177,8 +177,12 @@ describe('getModelArchitecture', () => {
     expect(arch?.numExperts).toBe(385);
     expect(arch?.activeExperts).toBe(6);
     expect(arch?.hasSharedExpert).toBe(true);
-    // First 3 layers use hash-routed MoE (not dense FFN), so no dense block.
+    // First 3 layers use hash-routed MoE (not dense FFN), so no dense block —
+    // they render as a dedicated hash-routed prefix block instead.
     expect(arch?.denseFFNLayers).toBeUndefined();
+    expect(arch?.hashRoutedLayers).toBe(3);
+    // mHC: residuals are replaced by 4 parallel hyper-connection streams.
+    expect(arch?.hyperConnections).toBe(4);
     expect(arch?.slidingWindow).toBe(128);
     expect(arch?.contextWindow).toBe(1048576);
     expect(arch?.developer).toBe('DeepSeek');
@@ -200,24 +204,26 @@ describe('getModelArchitecture', () => {
     expect(arch?.alternatingLayers).toBeDefined();
     expect(arch?.alternatingLayers).toHaveLength(2);
 
+    // Counts describe the learned-router layers (the first 3 hash-routed layers
+    // are split out into their own block): 29 HCA + 29 CSA + 3 hash = 61.
     const [hca, csa] = arch!.alternatingLayers!;
     expect(hca.label).toBe('Heavily Compressed Attention');
-    expect(hca.count).toBe(31);
+    expect(hca.count).toBe(29);
     expect(hca.description).toContain('sliding window');
     expect(hca.slidingWindow).toBe(128);
 
     expect(csa.label).toBe('Compressed Sparse Attention');
-    expect(csa.count).toBe(30);
+    expect(csa.count).toBe(29);
     expect(csa.description).toContain('sliding window');
     expect(csa.description).toContain('lightning indexer');
     expect(csa.slidingWindow).toBe(128);
   });
 
-  it('DeepSeek V4 Pro alternating layer counts sum to numLayers', () => {
+  it('DeepSeek V4 Pro alternating + hash-routed layer counts sum to numLayers', () => {
     const arch = getModelArchitecture(Model.DeepSeek_V4_Pro);
     expect(arch?.alternatingLayers).toBeDefined();
     const totalAlternating = arch!.alternatingLayers!.reduce((sum, l) => sum + l.count, 0);
-    expect(totalAlternating).toBe(arch!.numLayers);
+    expect(totalAlternating + (arch!.hashRoutedLayers ?? 0)).toBe(arch!.numLayers);
   });
 
   it('returns architecture for Kimi K2.5 with MoE and MLA details', () => {
diff --git a/packages/app/src/lib/model-architectures.ts b/packages/app/src/lib/model-architectures.ts
index 19c952c7..83e02044 100644
--- a/packages/app/src/lib/model-architectures.ts
+++ b/packages/app/src/lib/model-architectures.ts
@@ -69,6 +69,11 @@ export interface ModelArchitecture {
   denseFFNLayers?: number;
   /** Intermediate dimension of the dense FFN layers (differs from MoE expert FFN dim) */
   denseFFNDim?: number;
+  /**
+   * Number of leading MoE layers that use hash routing (token-id → fixed experts)
+   * instead of the learned gate. Rendered as a separate stacked prefix block.
+   */
+  hashRoutedLayers?: number;
   /**
    * Alternating layer type pattern (e.g., gpt-oss uses sliding_attention/full_attention).
    * Each entry describes one category of layer and how many of that type exist.
@@ -76,6 +81,11 @@ export interface ModelArchitecture {
   alternatingLayers?: AlternatingLayerSpec[];
   /** Sliding window size in tokens (for models using sliding/local attention) */
   slidingWindow?: number;
+  /**
+   * Number of parallel residual streams for hyper-connections (mHC). When > 1,
+   * residual merges render as "mHC ×N" mixer nodes instead of a plain "+" add.
+   */
+  hyperConnections?: number;
   /** Context window size (in tokens) */
   contextWindow?: number;
   /** Special architectural features */
@@ -151,28 +161,32 @@ export const MODEL_ARCHITECTURES: Partial<Record<Model, ModelArchitecture>> = {
     numExperts: 385, // 384 routed + 1 shared
     activeExperts: 6,
     hasSharedExpert: true,
-    // Attention layers interleave two compressed variants; every layer also
-    // carries a 128-token sliding-window branch plus a learnable attention sink.
-    // Counts: 31 HCA + 30 CSA = 61 (the extra MTP block is sliding-window only).
+    // First 3 layers use hash-routed MoE (shown as a separate prefix block); the
+    // remaining 58 learned-router layers interleave two compressed-attention
+    // variants. Every layer also carries a 128-token sliding-window branch plus a
+    // learnable attention sink. Counts below cover only the learned-router layers
+    // (29 HCA + 29 CSA); + 3 hash-routed = 61 (the extra MTP block is SWA-only).
+    hashRoutedLayers: 3,
     alternatingLayers: [
       {
         label: 'Heavily Compressed Attention',
         description:
-          'HCA: the KV of every 128 tokens is consolidated into a single entry and attended densely, alongside a 128-token sliding window of uncompressed KV and a learnable attention sink.',
-        count: 31,
+          'HCA (learned-router layers): the KV of every 128 tokens is consolidated into a single entry and attended densely, alongside a 128-token sliding window of uncompressed KV and a learnable attention sink.',
+        count: 29,
         colorKey: 'attention',
         slidingWindow: 128,
       },
       {
         label: 'Compressed Sparse Attention',
         description:
-          'CSA: the KV of every 4 tokens is compressed to one entry, then a lightning indexer selects the top-1024 compressed blocks for sparse attention, alongside a 128-token sliding window and a learnable attention sink.',
-        count: 30,
+          'CSA (learned-router layers): the KV of every 4 tokens is compressed to one entry, then a lightning indexer selects the top-1024 compressed blocks for sparse attention, alongside a 128-token sliding window and a learnable attention sink.',
+        count: 29,
         colorKey: 'attention',
         slidingWindow: 128,
       },
     ],
     slidingWindow: 128,
+    hyperConnections: 4, // mHC: 4 parallel residual streams (hc_mult)
     contextWindow: 1048576, // 1M
     features: [
       'Hybrid CSA + HCA Attention',

From 55adbef5be4200ad3c5b0003be7c69aee6ab0686 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 8 Jun 2026 14:40:08 -0700
Subject: [PATCH 08/10] fix(inference): connect arrow into RMSNorm and gap
 attention drill-down

Two diagram glitches in expanded transformer blocks:

- The incoming arrow stopped at the dashed container border, leaving no line
  above the first RMSNorm. Route each block's incoming arrow to its first
  RMSNorm when the block is expanded (through the border), so there is a
  continuous connector; collapsed blocks still target the block top.

- The attention drill-down rect sat flush against the attention block's bottom
  border, reading as an overlap. Add a small gap (drillGap) between an
  attention block and its expansion flow.
---
 .../inference/ui/ModelArchitectureDiagram.tsx | 32 +++++++++++++++----
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
index 1740fc00..cf93c159 100644
--- a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
+++ b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
@@ -107,6 +107,9 @@ function renderDiagram(
   const subBlockH = 34;
   const subArrowH = 12;
   const subPadY = 10;
+  // Gap between a block and its drill-down flow so the dashed expansion rect
+  // reads as separate from the block above it (not overlapping its border).
+  const drillGap = 8;
 
   // Architecture flags
   const isMoE = arch.architectureType === 'moe';
@@ -262,6 +265,7 @@ function renderDiagram(
       denseAttnY = y;
       y += blockH;
 
+      if (denseAttnExpanded) y += drillGap;
       denseAttnExpandedStartY = y;
       if (denseAttnExpanded) {
         y += denseAttnExpandedH;
@@ -390,6 +394,7 @@ function renderDiagram(
         y += blockH;
 
         // Expanded hybrid-attention sub-blocks (sliding window + compressed)
+        if (altAttnExpanded[bi]) y += drillGap;
         altAttnExpandedStartY[bi] = y;
         if (altAttnExpanded[bi]) {
           y += altAttnExpandedH[bi];
@@ -445,6 +450,7 @@ function renderDiagram(
     y += blockH;
 
     // Expanded attention sub-blocks (only for non-MLA, non-AlternatingSinkGQA)
+    if (attnExpanded) y += drillGap;
     attnExpandedStartY = y;
     if (attnExpanded) {
       y += attnExpandedH;
@@ -1431,15 +1437,27 @@ function renderDiagram(
     .filter(Boolean)
     .join('  \u00B7  ');
   drawBlock(pad.left, embedY, bw, blockH, 'embedding', 'Token Embedding', embedSub || undefined);
+
+  // A block's incoming arrow should land on its first RMSNorm when the block is
+  // expanded (so there's a continuous line into the norm, through the container
+  // border), or on the block's top when collapsed.
+  const denseEntryY = denseTxExpanded ? denseNorm1Y : denseTxStart;
+  const hashEntryY = hashBlockExpanded ? hashNorm1Y : hashTxStart;
+  const altEntryY = [
+    altBlockExpanded[0] ? altNorm1Y[0] : altBlockStart[0],
+    altBlockExpanded[1] ? altNorm1Y[1] : altBlockStart[1],
+  ];
+  const mainEntryY = txExpanded ? norm1Y : txStart;
+
   drawArrow(
     embedY + blockH,
     hasDenseLayers
-      ? denseTxStart
+      ? denseEntryY
       : hasHashBlock
-        ? hashTxStart
+        ? hashEntryY
         : hasAlternatingLayers
-          ? altBlockStart[0]
-          : txStart,
+          ? altEntryY[0]
+          : mainEntryY,
   );
 
   // === DENSE TRANSFORMER BLOCK (for MoE models with initial dense layers) ===
@@ -1569,7 +1587,7 @@ function renderDiagram(
     // Arrow from dense block to next block
     drawArrow(
       denseTxEnd,
-      hasHashBlock ? hashTxStart : hasAlternatingLayers ? altBlockStart[0] : txStart,
+      hasHashBlock ? hashEntryY : hasAlternatingLayers ? altEntryY[0] : mainEntryY,
     );
   }
 
@@ -1823,7 +1841,7 @@ function renderDiagram(
     }
 
     // Arrow from the hash block to the first alternating block (or main transformer)
-    drawArrow(hashTxEnd, hasAlternatingLayers ? altBlockStart[0] : txStart);
+    drawArrow(hashTxEnd, hasAlternatingLayers ? altEntryY[0] : mainEntryY);
   }
 
   // === ALTERNATING TRANSFORMER BLOCKS (gpt-oss style) ===
@@ -1958,7 +1976,7 @@ function renderDiagram(
       // Draw alternating indicator between the two blocks
       if (bi === 0) {
         // Arrow from block 0 end through indicator to block 1 start (drawn first, behind text)
-        drawArrow(altBlockEnd[0] + 2, altBlockStart[1]);
+        drawArrow(altBlockEnd[0] + 2, altEntryY[1]);
 
         // Opaque background rect behind the label so it doesn't overlap the arrow
         const cardBg = isDark ? '#131416' : '#eaebec';

From 9a600284d947aa75a0230491ec58b9b66416e02c Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 8 Jun 2026 15:25:30 -0700
Subject: [PATCH 09/10] fix(inference): count the shared expert in the
 specs-bar active count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The specs bar showed "6/385", but the always-on shared expert is active too,
so 7 experts run per token (6 routed + 1 shared). Show "6+1/385" for
shared-expert MoE models (e.g. R1 → "8+1/257") so the active count isn't
undersold; the title's "N active" params and the router subtitle already
account for the shared expert.
---
 .../src/components/inference/ui/ModelArchitectureDiagram.tsx  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
index cf93c159..4146a875 100644
--- a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
+++ b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
@@ -2186,7 +2186,9 @@ function renderDiagram(
       ? [
           {
             label: 'Experts',
-            value: `${arch.activeExperts}/${arch.numExperts}`,
+            // Active per token = routed top-k + the always-on shared expert, so
+            // show "6+1/385" (not "6/385"): the shared expert is active too.
+            value: `${arch.activeExperts}${arch.hasSharedExpert ? '+1' : ''}/${arch.numExperts}`,
           },
         ]
       : []),

From cb5671cfd0016f42d3d87fee5c781d11a5308e8c Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 11 Jun 2026 18:42:28 -0700
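Subject: [PATCH 10/10] fix(inference): hide hybrid-attention caption when
 parent block is collapsed

The hybrid-attention caption was gated only on altAttention0/altAttention1
being in expandedBlocks. Collapsing the parent altBlock leaves the child id
in expandedBlocks (so the drill-down is restored on re-expand), which left
the caption visible with no drawing to explain. Gate the caption on the
parent altBlock{i} being expanded as well, and add an e2e test covering
collapse and re-expand.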

---
 packages/app/cypress/e2e/model-architecture.cy.ts  | 14 ++++++++++++++
 .../inference/ui/ModelArchitectureDiagram.tsx      |  8 +++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/packages/app/cypress/e2e/model-architecture.cy.ts b/packages/app/cypress/e2e/model-architecture.cy.ts
index 6c8bef25..f6e29592 100644
--- a/packages/app/cypress/e2e/model-architecture.cy.ts
+++ b/packages/app/cypress/e2e/model-architecture.cy.ts
@@ -386,6 +386,20 @@ describe('Model Architecture Diagram', () => {
       cy.get('[data-testid="model-architecture-svg"]').should('be.visible');
     });
 
+    it('collapsing the parent block hides the union-softmax caption (no orphan caption)', () => {
+      // altBlock0 + altAttention0 are expanded from the previous tests; collapsing
+      // the parent removes the drill-down from the SVG, so the caption must go too
+      // even though altAttention0 stays in the expansion state.
+      cy.get('[data-testid="hybrid-attention-note"]').should('be.visible');
+      cy.get('[data-testid="collapse-altBlock0"]').click({ force: true });
+      cy.get('[data-testid="hybrid-attention-note"]').should('not.exist');
+      // Re-expanding the parent restores the remembered drill-down and its caption.
+      cy.get('[data-testid="expand-altBlock0"]').click({ force: true });
+      cy.get('[data-testid="hybrid-attention-note"]')
+        .should('be.visible')
+        .and('contain', 'single softmax');
+    });
+
     it('shows DeepSeek V4 Pro features (incl. sliding window) and developer info', () => {
       cy.contains('Hybrid CSA + HCA Attention').should('be.visible');
       cy.contains('Sliding window (128 tokens)').should('be.visible');
diff --git a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
index 4146a875..3fcd91d8 100644
--- a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
+++ b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx
@@ -2361,8 +2361,14 @@ export default function ModelArchitectureDiagram({
       >
         <div ref={containerRef} className="px-4 pb-4">
           <svg ref={svgRef} className="w-full" data-testid="model-architecture-svg" />
+          {/* The drill-down only renders while its parent block is expanded, so gate
+              the caption on the parent too — collapsing the parent leaves the child
+              id in expandedBlocks (state is restored on re-expand), and the caption
+              must not outlive the drawing it explains. */}
           {arch.attentionType === 'Hybrid' &&
-            (expandedBlocks.has('altAttention0') || expandedBlocks.has('altAttention1')) && (
+            [0, 1].some(
+              (i) => expandedBlocks.has(`altBlock${i}`) && expandedBlocks.has(`altAttention${i}`),
+            ) && (
               <p
                 className="mt-2 text-[11px] leading-snug text-muted-foreground"
                 data-testid="hybrid-attention-note"