diff --git a/packages/app/cypress/e2e/model-architecture.cy.ts b/packages/app/cypress/e2e/model-architecture.cy.ts index 30cadbb4..f6e29592 100644 --- a/packages/app/cypress/e2e/model-architecture.cy.ts +++ b/packages/app/cypress/e2e/model-architecture.cy.ts @@ -316,4 +316,94 @@ describe('Model Architecture Diagram', () => { cy.contains('Released by OpenAI').should('be.visible'); }); }); + + describe('Hybrid Attention Blocks (MoE model - DeepSeek V4 Pro)', () => { + before(() => { + // Clear any stale Radix scroll lock from prior Select interactions + cy.document().then((doc) => { + delete doc.body.dataset.scrollLocked; + doc.body.style.removeProperty('pointer-events'); + }); + cy.get('[role="combobox"]').filter(':visible').first().click(); + cy.get('[role="option"]').contains('DeepSeek V4 Pro').click(); + + cy.get('[data-testid="model-architecture-toggle"]').should('be.visible'); + cy.get('body').then(($body) => { + if ($body.find('[data-testid="model-architecture-svg"]:visible').length === 0) { + cy.get('[data-testid="model-architecture-toggle"]').click(); + } + }); + cy.get('[data-testid="model-architecture-svg"]').should('be.visible'); + }); + + it('shows MoE and Hybrid badges for DeepSeek V4 Pro', () => { + cy.get('[data-testid="model-architecture-toggle"]').should('contain.text', 'MoE'); + cy.get('[data-testid="model-architecture-toggle"]').should('contain.text', 'Hybrid'); + cy.get('[data-testid="model-architecture-toggle"]').should('contain.text', '1.6T'); + }); + + it('shows two separate hybrid (CSA/HCA) blocks with an alternating indicator', () => { + cy.get('[data-testid="expand-altBlock0"]').should('exist'); + cy.get('[data-testid="expand-altBlock1"]').should('exist'); + cy.get('[data-testid="expand-transformer"]').should('not.exist'); + cy.get('[data-testid="expand-denseTransformer"]').should('not.exist'); + cy.get('[data-testid="alternating-indicator"]').should('exist'); + }); + + it('shows a hash-routed MoE prefix block; mHC caption appears once a 
block is open', () => { + // First 3 layers render as a separate hash-routed prefix block + cy.get('[data-testid="expand-hashBlock"]').should('exist'); + // mHC caption only appears once a block exposing the mixer nodes is expanded + cy.get('[data-testid="mhc-note"]').should('not.exist'); + cy.get('[data-testid="expand-hashBlock"]').click({ force: true }); + cy.get('[data-testid="collapse-hashBlock"]').should('exist'); + cy.get('[data-testid="model-architecture-svg"]').contains('Hash Router').should('exist'); + cy.get('[data-testid="mhc-note"]').should('be.visible').and('contain', 'Hyper-Connections'); + // Restore collapsed state for subsequent tests (shared state: testIsolation off) + cy.get('[data-testid="collapse-hashBlock"]').click({ force: true }); + cy.get('[data-testid="mhc-note"]').should('not.exist'); + }); + + it('Hybrid attention is expandable and drills down to a Sliding Window block', () => { + cy.get('[data-testid="expand-altBlock0"]').click({ force: true }); + cy.get('[data-testid="collapse-altBlock0"]').should('exist'); + // The union-softmax caption only appears once the attention drill-down is open + cy.get('[data-testid="hybrid-attention-note"]').should('not.exist'); + // Hybrid attention drills down (unlike gpt-oss sink/full GQA, which does not) + cy.get('[data-testid="expand-altAttention0"]').should('exist'); + cy.get('[data-testid="expand-altAttention0"]').click({ force: true }); + cy.get('[data-testid="model-architecture-svg"]').should('be.visible'); + // Caption clarifies the two branches feed one softmax (not two attentions) + cy.get('[data-testid="hybrid-attention-note"]') + .should('be.visible') + .and('contain', 'single softmax'); + // Expert grid still expandable within the block + cy.get('[data-testid="expand-altExperts0"]').should('exist'); + }); + + it('expert grid can be expanded to show SwiGLU details', () => { + cy.get('[data-testid="expand-altExperts0"]').click({ force: true }); + 
cy.get('[data-testid="model-architecture-svg"]').should('be.visible'); + }); + + it('collapsing the parent block hides the union-softmax caption (no orphan caption)', () => { + // altBlock0 + altAttention0 are expanded from the previous tests; collapsing + // the parent removes the drill-down from the SVG, so the caption must go too + // even though altAttention0 stays in the expansion state. + cy.get('[data-testid="hybrid-attention-note"]').should('be.visible'); + cy.get('[data-testid="collapse-altBlock0"]').click({ force: true }); + cy.get('[data-testid="hybrid-attention-note"]').should('not.exist'); + // Re-expanding the parent restores the remembered drill-down and its caption. + cy.get('[data-testid="expand-altBlock0"]').click({ force: true }); + cy.get('[data-testid="hybrid-attention-note"]') + .should('be.visible') + .and('contain', 'single softmax'); + }); + + it('shows DeepSeek V4 Pro features (incl. sliding window) and developer info', () => { + cy.contains('Hybrid CSA + HCA Attention').should('be.visible'); + cy.contains('Sliding window (128 tokens)').should('be.visible'); + cy.contains('Released by DeepSeek').should('be.visible'); + }); + }); }); diff --git a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx index e79f057a..3fcd91d8 100644 --- a/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx +++ b/packages/app/src/components/inference/ui/ModelArchitectureDiagram.tsx @@ -18,6 +18,7 @@ import { getAttentionLabel, getAttentionSubBlocks, getFFNSubBlocks, + getHybridAttentionSubBlocks, getModelArchitecture, } from '@/lib/model-architectures'; @@ -106,6 +107,9 @@ function renderDiagram( const subBlockH = 34; const subArrowH = 12; const subPadY = 10; + // Gap between a block and its drill-down flow so the dashed expansion rect + // reads as separate from the block above it (not overlapping its border). 
+ const drillGap = 8; // Architecture flags const isMoE = arch.architectureType === 'moe'; @@ -133,6 +137,11 @@ function renderDiagram( const denseAttnExpanded = isAttnExpandable && expandedBlocks.has('denseAttention'); const denseFFNExpanded = expandedBlocks.has('denseFFN'); + // Hash-routed MoE prefix block (DeepSeek V4: first N layers route by token id). + const hasHashBlock = isMoE && (arch.hashRoutedLayers ?? 0) > 0; + const hashBlockExpanded = hasHashBlock && expandedBlocks.has('hashBlock'); + const hashExpertsExpanded = hasHashBlock && expandedBlocks.has('hashExperts'); + // Alternating block expand states (for models with alternating attention like gpt-oss) const altBlockExpanded = [ hasAlternatingLayers && expandedBlocks.has('altBlock0'), @@ -143,6 +152,21 @@ function renderDiagram( hasAlternatingLayers && expandedBlocks.has('altExperts1'), ]; + // Hybrid models (DeepSeek V4) expose an expandable attention drill-down inside + // each alternating block, revealing the sliding-window branch as an explicit + // block alongside the compressed branch. gpt-oss (AlternatingSinkGQA) keeps a + // static attention block. + const altAttnExpandable = hasAlternatingLayers && arch.attentionType === 'Hybrid'; + const altAttnExpanded = [ + altAttnExpandable && expandedBlocks.has('altAttention0'), + altAttnExpandable && expandedBlocks.has('altAttention1'), + ]; + const altAttnFlow: (SubBlockFlow | null)[] = altAttnExpandable + ? [0, 1].map((i) => + alternatingSpecs[i] ? getHybridAttentionSubBlocks(arch, alternatingSpecs[i]) : null, + ) + : [null, null]; + // Calculate flow height for either sequential or parallel layouts function getFlowHeight(flow: SubBlockFlow, hasLabel: boolean): number { if (flow.layout === 'sequential') { @@ -201,6 +225,11 @@ function renderDiagram( altExpertsExpanded[0] ? getFlowHeight(ffnFlow, true) : 0, altExpertsExpanded[1] ? getFlowHeight(ffnFlow, true) : 0, ]; + const altAttnExpandedH = [ + altAttnExpanded[0] && altAttnFlow[0] ? 
getFlowHeight(altAttnFlow[0], false) : 0, + altAttnExpanded[1] && altAttnFlow[1] ? getFlowHeight(altAttnFlow[1], false) : 0, + ]; + const hashExpertsExpandedH = hashExpertsExpanded ? getFlowHeight(ffnFlow, true) : 0; // Compute vertical positions let y = pad.top; @@ -236,6 +265,7 @@ function renderDiagram( denseAttnY = y; y += blockH; + if (denseAttnExpanded) y += drillGap; denseAttnExpandedStartY = y; if (denseAttnExpanded) { y += denseAttnExpandedH; @@ -268,11 +298,63 @@ function renderDiagram( y += arrowH; } + // === HASH-ROUTED MoE PREFIX BLOCK (DeepSeek V4: first N layers use hash routing) === + let hashTxStart = 0; + let hashNorm1Y = 0; + let hashAttnY = 0; + let hashMerge1Y = 0; + let hashNorm2Y = 0; + let hashExpertY = 0; + let hashFFNExpandedStartY = 0; + let hashMerge2Y = 0; + let hashTxEnd = 0; + + if (hasHashBlock) { + hashTxStart = y; + if (hashBlockExpanded) { + y += 14; + + hashNorm1Y = y; + y += smallH + arrowH; + + hashAttnY = y; + y += blockH; + + y += 4; + hashMerge1Y = y + mergeGap / 2; + y += mergeGap; + + y += arrowH; + hashNorm2Y = y; + y += smallH + arrowH; + + // Hash Router + Expert grid (drawExpertGrid lays out router above eY) + y += blockH + arrowH; + hashExpertY = y; + y += expertGridH; + + hashFFNExpandedStartY = y; + if (hashExpertsExpanded) { + y += hashExpertsExpandedH; + } + + hashMerge2Y = y + mergeGap / 2; + y += mergeGap; + + y += 14; + } else { + y += collapsedTxH; + } + hashTxEnd = y; + y += arrowH; + } + // === ALTERNATING TRANSFORMER BLOCKS (for models like gpt-oss with alternating attention) === const altBlockStart = [0, 0]; const altBlockEnd = [0, 0]; const altNorm1Y = [0, 0]; const altAttnY = [0, 0]; + const altAttnExpandedStartY = [0, 0]; const altMerge1Y = [0, 0]; const altNorm2Y = [0, 0]; const altRouterY = [0, 0]; @@ -311,6 +393,13 @@ function renderDiagram( altAttnY[bi] = y; y += blockH; + // Expanded hybrid-attention sub-blocks (sliding window + compressed) + if (altAttnExpanded[bi]) y += drillGap; + 
altAttnExpandedStartY[bi] = y; + if (altAttnExpanded[bi]) { + y += altAttnExpandedH[bi]; + } + y += 4; altMerge1Y[bi] = y + mergeGap / 2; y += mergeGap; @@ -361,6 +450,7 @@ function renderDiagram( y += blockH; // Expanded attention sub-blocks (only for non-MLA, non-AlternatingSinkGQA) + if (attnExpanded) y += drillGap; attnExpandedStartY = y; if (attnExpanded) { y += attnExpandedH; @@ -466,34 +556,85 @@ function renderDiagram( .attr('marker-end', 'url(#arch-arrow)'); } + // Draw a +, −, or × glyph from geometric strokes, perfectly centered at + // (gx, gy). Rendering these as lines instead of sidesteps font + // baseline drift, which left the symbol sitting slightly low inside the + // merge / expand circles regardless of dy tuning. + function drawCircleGlyph(gx: number, gy: number, color: string, symbol: string) { + const arm = 5; + const seg = (x1: number, y1: number, x2: number, y2: number) => + g + .append('line') + .attr('x1', x1) + .attr('y1', y1) + .attr('x2', x2) + .attr('y2', y2) + .attr('stroke', color) + .attr('stroke-width', 2) + .attr('stroke-linecap', 'round') + .style('pointer-events', 'none'); + if (symbol === '×') { + seg(gx - arm, gy - arm, gx + arm, gy + arm); + seg(gx - arm, gy + arm, gx + arm, gy - arm); + } else { + seg(gx - arm, gy, gx + arm, gy); // horizontal arm (+ and −) + if (symbol !== '−') seg(gx, gy - arm, gx, gy + arm); // vertical arm (+ only) + } + } + + // Models with hyper-connections (mHC) replace the plain residual add with a + // multi-stream mixer; when present, residual merges render as an "mHC ×N" pill + // instead of a "+" circle. N = number of parallel residual streams. + const hcStreams = arch.hyperConnections ?? 0; + const isHyperConn = hcStreams > 1; + function drawResidualBypass(branchY: number, mergeY: number) { + // Tap the residual from the input stream ABOVE the norm (in the arrow gap) + // so the horizontal connector doesn't run across the RMSNorm block. 
+ const tapY = branchY - arrowH / 2; + // Keep the node's vertical half-height = circleR so the spine arrows (drawn + // by callers to mergeY ± circleR) still meet its top/bottom edges. + const nodeHalfW = isHyperConn ? 25 : circleR; bgG .append('path') .attr( 'd', - `M ${cx} ${branchY} L ${residLeftX} ${branchY} L ${residLeftX} ${mergeY} L ${cx - circleR} ${mergeY}`, + `M ${cx} ${tapY} L ${residLeftX} ${tapY} L ${residLeftX} ${mergeY} L ${cx - nodeHalfW} ${mergeY}`, ) .attr('fill', 'none') .attr('stroke', mutedFg) .attr('stroke-width', 1.5) .attr('opacity', 0.6); - g.append('circle') - .attr('cx', cx) - .attr('cy', mergeY) - .attr('r', circleR) - .attr('fill', bgSubtle) - .attr('stroke', mutedFg) - .attr('stroke-width', 1.5); - g.append('text') - .attr('x', cx) - .attr('y', mergeY) - .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') - .attr('fill', fg) - .attr('font-size', '14px') - .attr('font-weight', 700) - .attr('font-family', 'inherit') - .text('+'); + if (isHyperConn) { + g.append('rect') + .attr('x', cx - nodeHalfW) + .attr('y', mergeY - circleR) + .attr('width', nodeHalfW * 2) + .attr('height', circleR * 2) + .attr('rx', circleR) + .attr('fill', bgSubtle) + .attr('stroke', mutedFg) + .attr('stroke-width', 1.5); + g.append('text') + .attr('x', cx) + .attr('y', mergeY) + .attr('text-anchor', 'middle') + .attr('dy', '0.35em') + .attr('fill', fg) + .attr('font-size', '8.5px') + .attr('font-weight', 700) + .attr('font-family', 'inherit') + .text(`mHC ×${hcStreams}`); + } else { + g.append('circle') + .attr('cx', cx) + .attr('cy', mergeY) + .attr('r', circleR) + .attr('fill', bgSubtle) + .attr('stroke', mutedFg) + .attr('stroke-width', 1.5); + drawCircleGlyph(cx, mergeY, fg, '+'); + } } function drawBlock( @@ -520,7 +661,7 @@ function renderDiagram( .attr('x', x + w / 2) .attr('y', by + h / 2 - (subText ? 
7 : 0)) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', fg) .attr('font-size', '13px') .attr('font-weight', 600) @@ -532,7 +673,7 @@ function renderDiagram( .attr('x', x + w / 2) .attr('y', by + h / 2 + 10) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '11px') .attr('font-family', 'inherit') @@ -566,7 +707,7 @@ function renderDiagram( .attr('x', x + w / 2 - 8) .attr('y', by + h / 2 - (subText ? 7 : 0)) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', fg) .attr('font-size', '13px') .attr('font-weight', 600) @@ -579,7 +720,7 @@ function renderDiagram( .attr('x', x + w / 2 - 8) .attr('y', by + h / 2 + 10) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '11px') .attr('font-family', 'inherit') @@ -599,16 +740,7 @@ function renderDiagram( .attr('stroke-width', 1) .style('pointer-events', 'none'); - g.append('text') - .attr('x', iconX) - .attr('y', iconY) - .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') - .attr('fill', c.stroke) - .attr('font-size', '14px') - .attr('font-weight', 700) - .style('pointer-events', 'none') - .text(isBlockExpanded ? '\u2212' : '+'); + drawCircleGlyph(iconX, iconY, c.stroke, isBlockExpanded ? '\u2212' : '+'); g.append('rect') .attr('x', x) @@ -649,7 +781,7 @@ function renderDiagram( .attr('x', bx + subBw / 2) .attr('y', by + subBlockH / 2 - (block.detail ? 
5 : 0)) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', fg) .attr('font-size', fontSize.name) .attr('font-weight', 500) @@ -661,7 +793,7 @@ function renderDiagram( .attr('x', bx + subBw / 2) .attr('y', by + subBlockH / 2 + 8) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', fontSize.detail) .attr('font-family', 'inherit') @@ -698,7 +830,7 @@ function renderDiagram( .attr('x', x + w / 2) .attr('y', sy + 8) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '10px') .attr('font-weight', 600) @@ -723,7 +855,7 @@ function renderDiagram( .attr('x', leftCx) .attr('y', sy + 6) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '9px') .attr('font-weight', 600) @@ -735,7 +867,7 @@ function renderDiagram( .attr('x', rightCx) .attr('y', sy + 6) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '9px') .attr('font-weight', 600) @@ -752,6 +884,16 @@ function renderDiagram( const parallelStartY = sy; const colFontSize = { name: '10px', detail: '8px' }; + const maxRows = Math.max(flow.leftPath.length, flow.rightPath.length); + const rowsH = (n: number) => n * subBlockH + Math.max(0, n - 1) * subArrowH; + const colAreaH = rowsH(maxRows); + // Vertically center each column within the shared column area so an unequal + // split (e.g. 1 local vs 2 compressed) reads as an intentional branch merge + // rather than leaving the shorter column's connector dangling as a long + // unattached line beside the taller column. 
+ const leftStartY = parallelStartY + (colAreaH - rowsH(flow.leftPath.length)) / 2; + const rightStartY = parallelStartY + (colAreaH - rowsH(flow.rightPath.length)) / 2; + g.append('line') .attr('x1', mergeCx) .attr('y1', splitTopY) @@ -761,10 +903,7 @@ function renderDiagram( .attr('stroke-width', 1); g.append('path') - .attr( - 'd', - `M ${mergeCx} ${splitMidY} L ${leftCx} ${splitMidY} L ${leftCx} ${parallelStartY - 2}`, - ) + .attr('d', `M ${mergeCx} ${splitMidY} L ${leftCx} ${splitMidY} L ${leftCx} ${leftStartY - 2}`) .attr('fill', 'none') .attr('stroke', mutedFg) .attr('stroke-width', 1) @@ -773,14 +912,14 @@ function renderDiagram( g.append('path') .attr( 'd', - `M ${mergeCx} ${splitMidY} L ${rightCx} ${splitMidY} L ${rightCx} ${parallelStartY - 2}`, + `M ${mergeCx} ${splitMidY} L ${rightCx} ${splitMidY} L ${rightCx} ${rightStartY - 2}`, ) .attr('fill', 'none') .attr('stroke', mutedFg) .attr('stroke-width', 1) .attr('marker-end', 'url(#arch-arrow-sub)'); - let lsy = parallelStartY; + let lsy = leftStartY; for (let i = 0; i < flow.leftPath.length; i++) { drawSingleSubBlock(flow.leftPath[i], leftX, lsy, colW, colFontSize); lsy += subBlockH; @@ -798,7 +937,7 @@ function renderDiagram( } const leftEndY = lsy; - let rsy = parallelStartY; + let rsy = rightStartY; for (let i = 0; i < flow.rightPath.length; i++) { drawSingleSubBlock(flow.rightPath[i], rightX, rsy, colW, colFontSize); rsy += subBlockH; @@ -816,9 +955,7 @@ function renderDiagram( } const rightEndY = rsy; - const maxRows = Math.max(flow.leftPath.length, flow.rightPath.length); - const mergeStartY = - parallelStartY + maxRows * subBlockH + Math.max(0, maxRows - 1) * subArrowH + subArrowH + 4; + const mergeStartY = parallelStartY + colAreaH + subArrowH + 4; const subInnerXLocal = x + 16; const subInnerWLocal = w - 40; @@ -887,16 +1024,7 @@ function renderDiagram( .attr('fill', bgSubtle) .attr('stroke', mutedFg) .attr('stroke-width', 1.5); - g.append('text') - .attr('x', mergeCx) - .attr('y', circleCy) - 
.attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') - .attr('fill', fg) - .attr('font-size', '14px') - .attr('font-weight', 700) - .attr('font-family', 'inherit') - .text(block.circleSymbol); + drawCircleGlyph(mergeCx, circleCy, fg, block.circleSymbol); } else { drawSingleSubBlock(block, subInnerXLocal, msy, subInnerWLocal); } @@ -963,7 +1091,7 @@ function renderDiagram( .attr('x', lcx) .attr('y', sy + 6) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '9px') .attr('font-weight', 600) @@ -1227,7 +1355,7 @@ function renderDiagram( .attr('x', x + w / 2 - 8) .attr('y', by + h / 2 - 8) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', fg) .attr('font-size', '13px') .attr('font-weight', 600) @@ -1239,7 +1367,7 @@ function renderDiagram( .attr('x', x + w / 2 - 8) .attr('y', by + h / 2 + 10) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '11px') .attr('font-family', 'inherit') @@ -1256,16 +1384,7 @@ function renderDiagram( .attr('stroke', borderColor) .attr('stroke-width', 1) .style('pointer-events', 'none'); - g.append('text') - .attr('x', iconX) - .attr('y', iconY) - .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') - .attr('fill', mutedFg) - .attr('font-size', '14px') - .attr('font-weight', 700) - .style('pointer-events', 'none') - .text('+'); + drawCircleGlyph(iconX, iconY, mutedFg, '+'); g.append('rect') .attr('x', x) @@ -1318,9 +1437,27 @@ function renderDiagram( .filter(Boolean) .join(' \u00B7 '); drawBlock(pad.left, embedY, bw, blockH, 'embedding', 'Token Embedding', embedSub || undefined); + + // A block's incoming arrow should land on its first RMSNorm when the block is + // expanded (so there's a continuous line into the norm, through the container + // border), or on the block's top when 
collapsed. + const denseEntryY = denseTxExpanded ? denseNorm1Y : denseTxStart; + const hashEntryY = hashBlockExpanded ? hashNorm1Y : hashTxStart; + const altEntryY = [ + altBlockExpanded[0] ? altNorm1Y[0] : altBlockStart[0], + altBlockExpanded[1] ? altNorm1Y[1] : altBlockStart[1], + ]; + const mainEntryY = txExpanded ? norm1Y : txStart; + drawArrow( embedY + blockH, - hasDenseLayers ? denseTxStart : hasAlternatingLayers ? altBlockStart[0] : txStart, + hasDenseLayers + ? denseEntryY + : hasHashBlock + ? hashEntryY + : hasAlternatingLayers + ? altEntryY[0] + : mainEntryY, ); // === DENSE TRANSFORMER BLOCK (for MoE models with initial dense layers) === @@ -1354,7 +1491,7 @@ function renderDiagram( .attr('x', width - pad.right - denseBadgeW / 2 - 4) .attr('y', denseTxStart) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '11px') .attr('font-weight', 600) @@ -1448,7 +1585,10 @@ function renderDiagram( } // Arrow from dense block to next block - drawArrow(denseTxEnd, hasAlternatingLayers ? altBlockStart[0] : txStart); + drawArrow( + denseTxEnd, + hasHashBlock ? hashEntryY : hasAlternatingLayers ? altEntryY[0] : mainEntryY, + ); } // Compute labels @@ -1466,11 +1606,15 @@ function renderDiagram( n2Y: number, m2Y: number, expertBlockId: string, + routerLabel = 'MoE Router', + routerSubOverride?: string, ) { const routedCount = arch.hasSharedExpert ? (arch.numExperts || 0) - 1 : arch.numExperts; - const routerSub = `Top-${arch.activeExperts} of ${routedCount} routed${arch.hasSharedExpert ? ' + 1 shared' : ''}`; + const routerSub = + routerSubOverride ?? + `Top-${arch.activeExperts} of ${routedCount} routed${arch.hasSharedExpert ? 
' + 1 shared' : ''}`; const rY = n2Y + smallH + arrowH; - drawBlock(innerX, rY, innerW, blockH, 'router', 'MoE Router', routerSub); + drawBlock(innerX, rY, innerW, blockH, 'router', routerLabel, routerSub); drawArrow(rY + blockH, rY + blockH + arrowH); const ec = getColor('expert', isDark); @@ -1512,7 +1656,7 @@ function renderDiagram( .attr('x', ex + expertSize / 2) .attr('y', ey + expertSize / 2) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', isActive ? fg : mutedFg) .attr('font-size', '9px') .attr('font-weight', isActive ? 600 : 400) @@ -1527,7 +1671,7 @@ function renderDiagram( .attr('x', ex + expertSize / 2) .attr('y', ey + expertSize / 2) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '14px') .attr('font-weight', 700) @@ -1549,7 +1693,7 @@ function renderDiagram( .attr('x', ex + expertSize / 2) .attr('y', ey + expertSize / 2) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '9px') .attr('font-weight', 600) @@ -1568,16 +1712,7 @@ function renderDiagram( .attr('stroke', ec.stroke) .attr('stroke-width', 1) .style('pointer-events', 'none'); - g.append('text') - .attr('x', expIconX) - .attr('y', expIconY) - .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') - .attr('fill', ec.stroke) - .attr('font-size', '14px') - .attr('font-weight', 700) - .style('pointer-events', 'none') - .text(isExpExpanded ? '\u2212' : '+'); + drawCircleGlyph(expIconX, expIconY, ec.stroke, isExpExpanded ? 
'\u2212' : '+'); g.append('rect') .attr('x', innerX) @@ -1601,6 +1736,114 @@ function renderDiagram( drawResidualBypass(n2Y, m2Y); } + // === HASH-ROUTED MoE PREFIX BLOCK (DeepSeek V4: first N layers route by token id) === + if (hasHashBlock) { + if (hashBlockExpanded) { + // Container + g.append('rect') + .attr('x', pad.left - 4) + .attr('y', hashTxStart) + .attr('width', bw + 8) + .attr('height', hashTxEnd - hashTxStart) + .attr('rx', 10) + .attr('fill', 'none') + .attr('stroke', borderColor) + .attr('stroke-width', 2) + .attr('stroke-dasharray', '6,3'); + + // Collapse badge + const hashBadge = `− ×${arch.hashRoutedLayers} layers`; + const hashBadgeW = hashBadge.length * 7 + 16; + g.append('rect') + .attr('x', width - pad.right - hashBadgeW - 4) + .attr('y', hashTxStart - 11) + .attr('width', hashBadgeW) + .attr('height', 22) + .attr('rx', 11) + .attr('fill', bgSubtle) + .attr('stroke', borderColor) + .attr('stroke-width', 1); + g.append('text') + .attr('x', width - pad.right - hashBadgeW / 2 - 4) + .attr('y', hashTxStart) + .attr('text-anchor', 'middle') + .attr('dy', '0.35em') + .attr('fill', mutedFg) + .attr('font-size', '11px') + .attr('font-weight', 600) + .attr('font-family', 'inherit') + .text(hashBadge); + g.append('rect') + .attr('x', width - pad.right - hashBadgeW - 4) + .attr('y', hashTxStart - 11) + .attr('width', hashBadgeW) + .attr('height', 22) + .attr('rx', 11) + .attr('fill', 'transparent') + .style('cursor', 'pointer') + .attr('data-testid', 'collapse-hashBlock') + .on('click', () => onBlockClick('hashBlock')); + + // RMSNorm 1 + drawBlock(innerX, hashNorm1Y, innerW, smallH, 'norm', 'RMSNorm'); + drawArrow(hashNorm1Y + smallH, hashAttnY); + + // Attention — static hybrid block (same attention stack as the rest) + const hashHeadSub = [ + arch.numHeads ? `${arch.numHeads} heads` : null, + arch.numKVHeads ? 
`${arch.numKVHeads} KV heads` : null, + ] + .filter(Boolean) + .join(' · '); + drawBlock( + innerX, + hashAttnY, + innerW, + blockH, + 'attention', + attnLabel, + hashHeadSub || undefined, + ); + + drawArrow(hashAttnY + blockH + 4, hashMerge1Y - circleR); + drawResidualBypass(hashNorm1Y, hashMerge1Y); + drawArrow(hashMerge1Y + circleR, hashNorm2Y); + + // RMSNorm 2 + drawBlock(innerX, hashNorm2Y, innerW, smallH, 'norm', 'RMSNorm'); + drawArrow(hashNorm2Y + smallH, hashNorm2Y + smallH + arrowH); + + // Hash Router + Expert grid (token-id → fixed experts, not a learned gate) + const hashRoutedCount = arch.hasSharedExpert ? (arch.numExperts || 0) - 1 : arch.numExperts; + const hashRouterSub = `token-id → ${arch.activeExperts} of ${hashRoutedCount}${arch.hasSharedExpert ? ' + 1 shared' : ''}`; + drawExpertGrid( + hashExpertY, + hashExpertsExpanded, + hashExpertsExpandedH, + hashFFNExpandedStartY, + hashNorm2Y, + hashMerge2Y, + 'hashExperts', + 'Hash Router', + hashRouterSub, + ); + } else { + const hashSub = `×${arch.hashRoutedLayers} first layers · token-id → experts`; + drawCollapsedTransformerBlock( + pad.left, + hashTxStart, + bw, + collapsedTxH, + 'Hash-Routed MoE', + hashSub, + 'hashBlock', + ); + } + + // Arrow from the hash block to the first alternating block (or main transformer) + drawArrow(hashTxEnd, hasAlternatingLayers ? 
altEntryY[0] : mainEntryY); + } + // === ALTERNATING TRANSFORMER BLOCKS (gpt-oss style) === if (hasAlternatingLayers) { for (let bi = 0; bi < 2; bi++) { @@ -1639,7 +1882,7 @@ function renderDiagram( .attr('x', width - pad.right - badgeW / 2 - 4) .attr('y', altBlockStart[bi]) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '11px') .attr('font-weight', 600) @@ -1660,20 +1903,44 @@ function renderDiagram( drawBlock(innerX, altNorm1Y[bi], innerW, smallH, 'norm', 'RMSNorm'); drawArrow(altNorm1Y[bi] + smallH, altAttnY[bi]); - // Attention (non-expandable — AlternatingSinkGQA) + // Attention — expandable hybrid drill-down (DeepSeek V4 CSA/HCA) or a + // static block (gpt-oss AlternatingSinkGQA). const headSub = [ arch.numHeads ? `${arch.numHeads} heads` : null, arch.numKVHeads ? `${arch.numKVHeads} KV heads` : null, ] .filter(Boolean) .join(' \u00B7 '); - const attnSub = - bi === 0 && arch.slidingWindow - ? `${headSub}${headSub ? ' \u00B7 ' : ''}window=${arch.slidingWindow}` - : headSub || undefined; - drawBlock(innerX, altAttnY[bi], innerW, blockH, 'attention', spec.label, attnSub); + // Sliding-window note is per layer-type: a hybrid model (e.g. DeepSeek + // V4) carries the window on every attention variant, whereas gpt-oss + // only puts it on its sliding block. Drive it off the spec, not bi. + const specWindow = spec.slidingWindow; + const attnSub = specWindow + ? `${headSub}${headSub ? 
' \u00B7 ' : ''}window=${specWindow}` + : headSub || undefined; + if (altAttnExpandable) { + drawExpandableBlock( + innerX, + altAttnY[bi], + innerW, + blockH, + 'attention', + spec.label, + attnSub, + altAttnExpanded[bi], + `altAttention${bi}`, + ); + const flow = altAttnFlow[bi]; + if (altAttnExpanded[bi] && flow) { + drawFlow(flow, altAttnExpandedStartY[bi], innerX, innerW); + } + } else { + drawBlock(innerX, altAttnY[bi], innerW, blockH, 'attention', spec.label, attnSub); + } - const aBottom = altAttnY[bi] + blockH + 4; + const aBottom = altAttnExpanded[bi] + ? altAttnExpandedStartY[bi] + altAttnExpandedH[bi] + 4 + : altAttnY[bi] + blockH + 4; drawArrow(aBottom, altMerge1Y[bi] - circleR); drawResidualBypass(altNorm1Y[bi], altMerge1Y[bi]); drawArrow(altMerge1Y[bi] + circleR, altNorm2Y[bi]); @@ -1709,7 +1976,7 @@ function renderDiagram( // Draw alternating indicator between the two blocks if (bi === 0) { // Arrow from block 0 end through indicator to block 1 start (drawn first, behind text) - drawArrow(altBlockEnd[0] + 2, altBlockStart[1]); + drawArrow(altBlockEnd[0] + 2, altEntryY[1]); // Opaque background rect behind the label so it doesn't overlap the arrow const cardBg = isDark ? '#131416' : '#eaebec'; @@ -1731,7 +1998,7 @@ function renderDiagram( .attr('x', cx) .attr('y', altIndicatorY) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', `${labelFontSize}px`) .attr('font-weight', 500) @@ -1772,7 +2039,7 @@ function renderDiagram( .attr('x', width - pad.right - badgeW / 2 - 4) .attr('y', txStart) .attr('text-anchor', 'middle') - .attr('dominant-baseline', 'central') + .attr('dy', '0.35em') .attr('fill', mutedFg) .attr('font-size', '11px') .attr('font-weight', 600) @@ -1909,7 +2176,7 @@ function renderDiagram( }, { label: 'Attention', - value: hasAlternatingLayers ? 'Sink/Full GQA' : arch.attentionType, + value: arch.attentionType === 'AlternatingSinkGQA' ? 
'Sink/Full GQA' : arch.attentionType, }, { label: 'Context', @@ -1919,7 +2186,9 @@ function renderDiagram( ? [ { label: 'Experts', - value: `${arch.activeExperts}/${arch.numExperts}`, + // Active per token = routed top-k + the always-on shared expert, so + // show "6+1/385" (not "6/385"): the shared expert is active too. + value: `${arch.activeExperts}${arch.hasSharedExpert ? '+1' : ''}/${arch.numExperts}`, }, ] : []), @@ -2092,6 +2361,42 @@ export default function ModelArchitectureDiagram({ >
+ {/* The drill-down only renders while its parent block is expanded, so gate + the caption on the parent too — collapsing the parent leaves the child + id in expandedBlocks (state is restored on re-expand), and the caption + must not outlive the drawing it explains. */} + {arch.attentionType === 'Hybrid' && + [0, 1].some( + (i) => expandedBlocks.has(`altBlock${i}`) && expandedBlocks.has(`altAttention${i}`), + ) && ( +

+ Local and{' '} + Compressed are two KV sources, + not two separate attentions: each query attends in a{' '} + single softmax to the union of + sliding-window + selected compressed keys, with a learnable per-head attention sink. +

+ )} + {(arch.hyperConnections ?? 0) > 1 && + ['altBlock0', 'altBlock1', 'hashBlock', 'transformer', 'denseTransformer'].some((id) => + expandedBlocks.has(id), + ) && ( +

+ + Hyper-Connections (mHC ×{arch.hyperConnections}) + {' '} + replace each residual with {arch.hyperConnections} parallel streams combined by + learned, Sinkhorn-normalized weights — read ({arch.hyperConnections}→1), output, and + a {arch.hyperConnections}×{arch.hyperConnections} stream mix — shown as the mHC × + {arch.hyperConnections} nodes. +

+ )} {arch.features && arch.features.length > 0 && (
diff --git a/packages/app/src/lib/model-architectures.test.ts b/packages/app/src/lib/model-architectures.test.ts index 68b913c4..e55ba84e 100644 --- a/packages/app/src/lib/model-architectures.test.ts +++ b/packages/app/src/lib/model-architectures.test.ts @@ -10,6 +10,7 @@ import { getAttentionLabel, getAttentionSubBlocks, getFFNSubBlocks, + getHybridAttentionSubBlocks, getModelArchitecture, MODEL_ARCHITECTURES, } from './model-architectures'; @@ -20,6 +21,7 @@ describe('MODEL_ARCHITECTURES', () => { Model.Llama3_3_70B, Model.Llama3_1_70B, Model.DeepSeek_R1, + Model.DeepSeek_V4_Pro, Model.GptOss, Model.Kimi_K2_5, Model.MiniMax_M2_5, @@ -158,6 +160,72 @@ describe('getModelArchitecture', () => { expect(arch?.vocabSize).toBe(129280); }); + it('returns architecture for DeepSeek V4 Pro with MoE and Hybrid attention details', () => { + const arch = getModelArchitecture(Model.DeepSeek_V4_Pro); + expect(arch).toBeDefined(); + expect(arch?.totalParams).toBe(1600); + expect(arch?.activeParams).toBe(49); + expect(arch?.architectureType).toBe('moe'); + expect(arch?.attentionType).toBe('Hybrid'); + expect(arch?.attentionExpandable).toBe(false); + expect(arch?.numLayers).toBe(61); + expect(arch?.hiddenSize).toBe(7168); + expect(arch?.numHeads).toBe(128); + expect(arch?.numKVHeads).toBe(1); + expect(arch?.headDim).toBe(512); + expect(arch?.ffnDim).toBe(3072); + expect(arch?.numExperts).toBe(385); + expect(arch?.activeExperts).toBe(6); + expect(arch?.hasSharedExpert).toBe(true); + // First 3 layers use hash-routed MoE (not dense FFN), so no dense block — + // they render as a dedicated hash-routed prefix block instead. + expect(arch?.denseFFNLayers).toBeUndefined(); + expect(arch?.hashRoutedLayers).toBe(3); + // mHC: residuals are replaced by 4 parallel hyper-connection streams. 
+ expect(arch?.hyperConnections).toBe(4); + expect(arch?.slidingWindow).toBe(128); + expect(arch?.contextWindow).toBe(1048576); + expect(arch?.developer).toBe('DeepSeek'); + expect(arch?.vocabSize).toBe(129280); + expect(arch?.sourceUrl).toBe('https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro'); + }); + + it('DeepSeek V4 Pro surfaces sliding-window attention and hybrid components in features', () => { + const arch = getModelArchitecture(Model.DeepSeek_V4_Pro); + expect(arch?.features).toBeDefined(); + expect(arch?.features).toContain('Sliding window (128 tokens)'); + expect(arch?.features).toContain('Hybrid CSA + HCA Attention'); + expect(arch?.features).toContain('Attention Sink'); + expect(arch?.features).toContain('Multi-Token Prediction'); + }); + + it('DeepSeek V4 Pro has alternatingLayers with CSA and HCA specs, each carrying a sliding window', () => { + const arch = getModelArchitecture(Model.DeepSeek_V4_Pro); + expect(arch?.alternatingLayers).toBeDefined(); + expect(arch?.alternatingLayers).toHaveLength(2); + + // Counts describe the learned-router layers (the first 3 hash-routed layers + // are split out into their own block): 29 HCA + 29 CSA + 3 hash = 61. 
+ const [hca, csa] = arch!.alternatingLayers!; + expect(hca.label).toBe('Heavily Compressed Attention'); + expect(hca.count).toBe(29); + expect(hca.description).toContain('sliding window'); + expect(hca.slidingWindow).toBe(128); + + expect(csa.label).toBe('Compressed Sparse Attention'); + expect(csa.count).toBe(29); + expect(csa.description).toContain('sliding window'); + expect(csa.description).toContain('lightning indexer'); + expect(csa.slidingWindow).toBe(128); + }); + + it('DeepSeek V4 Pro alternating + hash-routed layer counts sum to numLayers', () => { + const arch = getModelArchitecture(Model.DeepSeek_V4_Pro); + expect(arch?.alternatingLayers).toBeDefined(); + const totalAlternating = arch!.alternatingLayers!.reduce((sum, l) => sum + l.count, 0); + expect(totalAlternating + (arch!.hashRoutedLayers ?? 0)).toBe(arch!.numLayers); + }); + it('returns architecture for Kimi K2.5 with MoE and MLA details', () => { const arch = getModelArchitecture(Model.Kimi_K2_5); expect(arch).toBeDefined(); @@ -241,10 +309,13 @@ describe('getModelArchitecture', () => { expect(sliding.count).toBe(18); expect(sliding.description).toContain('128-token sliding window'); expect(sliding.description).toContain('attention sink'); + expect(sliding.slidingWindow).toBe(128); expect(full.label).toBe('Causal Grouped Query Attention'); expect(full.count).toBe(18); expect(full.description).toContain('full causal masking'); + // Full-attention block has no sliding window (per-spec, not block-index). 
+ expect(full.slidingWindow).toBeUndefined(); }); it('gpt-oss alternating layer counts sum to numLayers', () => { @@ -298,6 +369,11 @@ describe('getArchitectureSummary', () => { expect(getArchitectureSummary(arch!)).toBe('MoE 671B (37B active)'); }); + it('returns MoE summary for DeepSeek V4 Pro with trillion-scale params', () => { + const arch = getModelArchitecture(Model.DeepSeek_V4_Pro); + expect(getArchitectureSummary(arch!)).toBe('MoE 1.6T (49B active)'); + }); + it('returns MoE summary for gpt-oss 120B', () => { const arch = getModelArchitecture(Model.GptOss); expect(getArchitectureSummary(arch!)).toBe('MoE 120B (5B active)'); @@ -652,6 +728,54 @@ describe('getFFNSubBlocks', () => { }); }); +describe('getHybridAttentionSubBlocks', () => { + it('exposes the sliding-window branch as an explicit block for DeepSeek V4', () => { + const arch = getModelArchitecture(Model.DeepSeek_V4_Pro)!; + const [hca, csa] = arch.alternatingLayers!; + + const csaFlow = getHybridAttentionSubBlocks(arch, csa); + expect(csaFlow.layout).toBe('parallel'); + if (csaFlow.layout !== 'parallel') return; + expect(csaFlow.leftLabel).toBe('Local'); + // Local branch is the sliding-window KV source (one explicit block). The + // sink is NOT here — it is a learnable softmax bias on the shared MQA. + expect(csaFlow.leftPath[0].name).toBe('Sliding Window'); + expect(csaFlow.leftPath[0].detail).toContain('128'); + expect(csaFlow.leftPath).toHaveLength(1); + expect(csaFlow.leftPath.some((b) => b.name === 'Attention Sink')).toBe(false); + // CSA compressed branch: light compression then the learned lightning + // indexer (sparse top-k) — two stages. 
+ expect(csaFlow.rightPath.map((b) => b.name)).toEqual([ + 'Token Compression', + 'Lightning Indexer', + ]); + // The fused attention is a single shared-KV MQA that carries the sink + expect(csaFlow.mergeBlocks[0].name).toBe('Shared-KV MQA + Sink'); + expect(csaFlow.mergeBlocks.at(-1)?.name).toBe('Output Projection'); + + const hcaFlow = getHybridAttentionSubBlocks(arch, hca); + if (hcaFlow.layout !== 'parallel') return; + expect(hcaFlow.leftPath[0].name).toBe('Sliding Window'); + expect(hcaFlow.leftPath).toHaveLength(1); + // HCA compressed branch is a single heavy-compression source (no indexer) + expect(hcaFlow.rightPath.some((b) => b.name === 'Lightning Indexer')).toBe(false); + expect(hcaFlow.rightPath.map((b) => b.name)).toEqual(['Heavy Compression']); + expect(hcaFlow.mergeBlocks[0].name).toBe('Shared-KV MQA + Sink'); + }); + + it('all hybrid sub-blocks have valid types', () => { + const arch = getModelArchitecture(Model.DeepSeek_V4_Pro)!; + const validTypes = ['projection', 'activation', 'operation', 'attention']; + for (const spec of arch.alternatingLayers!) 
{ + const flow = getHybridAttentionSubBlocks(arch, spec); + for (const block of getAllBlocks(flow)) { + expect(validTypes).toContain(block.type); + expect(block.name.length).toBeGreaterThan(0); + } + } + }); +}); + /** Helper: get all blocks from a flow (flat list for easy assertions) */ function getAllBlocks(flow: SubBlockFlow) { if (flow.layout === 'sequential') return flow.blocks; diff --git a/packages/app/src/lib/model-architectures.ts b/packages/app/src/lib/model-architectures.ts index e7db84cd..83e02044 100644 --- a/packages/app/src/lib/model-architectures.ts +++ b/packages/app/src/lib/model-architectures.ts @@ -23,6 +23,12 @@ export interface AlternatingLayerSpec { count: number; /** Color key for visual distinction */ colorKey: 'attention' | 'ffn' | 'norm' | 'router' | 'expert'; + /** + * Sliding-window size (in tokens) for this layer type, when it includes a + * local sliding-window attention branch. Rendered as `window=N` in the + * diagram. Omit for layer types that use full / non-windowed attention. + */ + slidingWindow?: number; } /** @@ -63,6 +69,11 @@ export interface ModelArchitecture { denseFFNLayers?: number; /** Intermediate dimension of the dense FFN layers (differs from MoE expert FFN dim) */ denseFFNDim?: number; + /** + * Number of leading MoE layers that use hash routing (token-id → fixed experts) + * instead of the learned gate. Rendered as a separate stacked prefix block. + */ + hashRoutedLayers?: number; /** * Alternating layer type pattern (e.g., gpt-oss uses sliding_attention/full_attention). * Each entry describes one category of layer and how many of that type exist. @@ -70,6 +81,11 @@ export interface ModelArchitecture { alternatingLayers?: AlternatingLayerSpec[]; /** Sliding window size in tokens (for models using sliding/local attention) */ slidingWindow?: number; + /** + * Number of parallel residual streams for hyper-connections (mHC). When > 1, + * residual merges render as "mHC ×N" mixer nodes instead of a plain "+" add. 
+ */ + hyperConnections?: number; /** Context window size (in tokens) */ contextWindow?: number; /** Special architectural features */ @@ -93,6 +109,7 @@ export interface ModelArchitecture { * - https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct * - https://huggingface.co/deepseek-ai/DeepSeek-R1-0528 * - https://github.com/deepseek-ai/DeepSeek-V3 + * - https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro (config.json, inference/model.py, DeepSeek_V4.pdf) * - https://huggingface.co/moonshotai/Kimi-K2.5/blob/main/config.json * - https://huggingface.co/openai/gpt-oss-120b/blob/main/config.json * - https://huggingface.co/MiniMaxAI/MiniMax-M2/blob/main/config.json @@ -124,6 +141,72 @@ export const MODEL_ARCHITECTURES: Partial> = { developer: 'DeepSeek', sourceUrl: 'https://huggingface.co/deepseek-ai/DeepSeek-R1-0528', }, + [Model.DeepSeek_V4_Pro]: { + model: Model.DeepSeek_V4_Pro, + totalParams: 1600, // 1.6T + activeParams: 49, + architectureType: 'moe', + attentionType: 'Hybrid', + // Hybrid CSA/HCA is a bespoke compressed-attention stack, not the standard + // Q/K/V GQA layout — render it as static blocks, not the GQA drill-down. + attentionExpandable: false, + numLayers: 61, + hiddenSize: 7168, + numHeads: 128, + // Shared single-latent KV (MLA-lineage MQA): num_key_value_heads = 1. + numKVHeads: 1, + headDim: 512, + vocabSize: 129280, + ffnDim: 3072, // moe_intermediate_size + numExperts: 385, // 384 routed + 1 shared + activeExperts: 6, + hasSharedExpert: true, + // First 3 layers use hash-routed MoE (shown as a separate prefix block); the + // remaining 58 learned-router layers interleave two compressed-attention + // variants. Every layer also carries a 128-token sliding-window branch plus a + // learnable attention sink. Counts below are the learned-router layers: + // 29 HCA + 29 CSA + 3 hash-routed = 61 (the extra MTP block is SWA-only). 
+ hashRoutedLayers: 3, + alternatingLayers: [ + { + label: 'Heavily Compressed Attention', + description: + 'HCA (learned-router layers): the KV of every 128 tokens is consolidated into a single entry and attended densely, alongside a 128-token sliding window of uncompressed KV and a learnable attention sink.', + count: 29, + colorKey: 'attention', + slidingWindow: 128, + }, + { + label: 'Compressed Sparse Attention', + description: + 'CSA (learned-router layers): the KV of every 4 tokens is compressed to one entry, then a lightning indexer selects the top-1024 compressed blocks for sparse attention, alongside a 128-token sliding window and a learnable attention sink.', + count: 29, + colorKey: 'attention', + slidingWindow: 128, + }, + ], + slidingWindow: 128, + hyperConnections: 4, // mHC: 4 parallel residual streams (hc_mult) + contextWindow: 1048576, // 1M + features: [ + 'Hybrid CSA + HCA Attention', + 'Sliding window (128 tokens)', + 'Attention Sink', + 'MLA-style Shared-KV MQA', + 'Lightning Indexer (sparse top-k)', + 'Manifold-Constrained Hyper-Connections (mHC)', + 'sqrt-softplus Routing', + 'Auxiliary-loss-free Load Balancing', + 'Hash Routing (first 3 layers)', + 'Multi-Token Prediction', + 'YaRN RoPE (1M context)', + 'FP4 Experts + FP8 Mixed Precision', + 'Muon Optimizer', + ], + releaseDate: '2026-06-08', + developer: 'DeepSeek', + sourceUrl: 'https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro', + }, [Model.Llama3_3_70B]: { model: Model.Llama3_3_70B, totalParams: 70, @@ -182,6 +265,7 @@ export const MODEL_ARCHITECTURES: Partial> = { description: 'GQA with 128-token sliding window and learnable attention sink tokens', count: 18, colorKey: 'attention', + slidingWindow: 128, }, { label: 'Causal Grouped Query Attention', @@ -488,3 +572,67 @@ export function getFFNSubBlocks( ], }; } + +/** + * Hybrid attention sub-blocks (DeepSeek V4-style CSA / HCA layers). 
/**
 * Builds the drill-down flow for one hybrid attention layer (DeepSeek V4-style
 * CSA / HCA).
 *
 * The result is always a `parallel` flow with two KV-source branches that feed
 * a shared merge stage:
 *
 * - **Local** (left): a single "Sliding Window" block. Its detail shows the
 *   window size taken from `spec.slidingWindow`, falling back to
 *   `arch.slidingWindow`, and reads "local KV" when neither is set.
 * - **Compressed** (right): for a layer whose label contains "sparse"
 *   (case-insensitive) this is "Token Compression" followed by "Lightning
 *   Indexer"; for every other layer it is a single "Heavy Compression" block.
 * - **Merge**: "Shared-KV MQA + Sink" followed by "Output Projection". The sink
 *   is attached to the MQA block rather than drawn as its own branch, so the
 *   two branches read as inputs to one attention, not as two attentions.
 *
 * @param arch - Model-level architecture; supplies head counts, hidden size and
 *   the fallback sliding-window size.
 * @param spec - The alternating-layer entry being expanded; its `label` selects
 *   the compressed-branch variant and its `slidingWindow` overrides the
 *   model-level window.
 * @returns A `parallel` sub-block flow describing the layer.
 */
export function getHybridAttentionSubBlocks(
  arch: ModelArchitecture,
  spec: AlternatingLayerSpec,
): SubBlockFlow {
  // Per-layer window wins; the model-level value is only a fallback.
  const windowTokens = spec.slidingWindow ?? arch.slidingWindow;
  // The layer variant is inferred from its label: "…Sparse…" means CSA.
  const usesSparseSelection = /sparse/iu.test(spec.label);

  // Local branch: exactly one block, the sliding-window KV source.
  let windowDetail = 'local KV';
  if (windowTokens) {
    windowDetail = `last ${windowTokens} tokens`;
  }
  const localBranch: ArchSubBlock[] = [
    { name: 'Sliding Window', detail: windowDetail, type: 'attention' },
  ];

  // Compressed branch: two stages for the sparse variant, one stage otherwise.
  const compressedBranch: ArchSubBlock[] = [];
  if (usesSparseSelection) {
    compressedBranch.push(
      { name: 'Token Compression', detail: '1 entry / 4 tokens', type: 'operation' },
      { name: 'Lightning Indexer', detail: 'sparse top-1024', type: 'attention' },
    );
  } else {
    compressedBranch.push({
      name: 'Heavy Compression',
      detail: '1 entry / 128 tokens',
      type: 'attention',
    });
  }

  // Merge stage. Details are left undefined (but the key stays present) when
  // the corresponding dimension is missing or zero.
  let headsDetail: string | undefined;
  if (arch.numHeads) {
    headsDetail = `${arch.numHeads} heads · ${arch.numKVHeads ?? 1} KV`;
  }
  let projectionDetail: string | undefined;
  if (arch.hiddenSize) {
    projectionDetail = `→ ${arch.hiddenSize.toLocaleString()}`;
  }
  const sharedAttention: ArchSubBlock = {
    name: 'Shared-KV MQA + Sink',
    detail: headsDetail,
    type: 'attention',
  };
  const outputProjection: ArchSubBlock = {
    name: 'Output Projection',
    detail: projectionDetail,
    type: 'projection',
  };

  return {
    layout: 'parallel',
    leftLabel: 'Local',
    rightLabel: 'Compressed',
    leftPath: localBranch,
    rightPath: compressedBranch,
    mergeBlocks: [sharedAttention, outputProjection],
  };
}