Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion packages/cli/src/capture/assetCataloger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ export interface CatalogedAsset {
sectionClasses?: string;
/** Whether the image is above the fold (visible without scrolling) */
aboveFold?: boolean;
/** Element sits inside <header>, <nav>, or [role="banner"] — logo signal */
inBanner?: boolean;
/** Element sits inside <a> with site-root href ("/", "#", origin-only) — brand-home link */
inHomeLink?: boolean;
/** alt/aria-label/title contains the brand segment of document.title */
matchesTitleBrand?: boolean;
}

/**
Expand Down Expand Up @@ -62,6 +68,29 @@ export async function catalogAssets(page: Page): Promise<CatalogedAsset[]> {
var rect = el.getBoundingClientRect();
ctx.aboveFold = rect.top < window.innerHeight;
} catch(e) {}
// Logo signals — surfaced explicitly so the downloader can prefix
// logo-<hash> reliably. Real-AI-test on heygen.com + huly.io showed
// the prior class-substring detector caught 0 logos; these explicit
// structural signals catch the header logo across modern React/
// Tailwind builds where "logo" isn't in any className.
// 1. inBanner: element sits inside <header>, <nav>, or [role=banner].
ctx.inBanner = el.closest('header, nav, [role="banner"]') !== null;
// 2. inHomeLink: element sits inside an <a> whose href is the site
// root ("/", "#", "./" or origin-only URL) — the brand-home link.
var homeAnchor = el.closest('a[href]');
if (homeAnchor) {
var aHref = homeAnchor.getAttribute('href') || '';
ctx.inHomeLink = aHref === '/' || aHref === '#' || aHref === './' ||
/^https?:\\/\\/[^/]+\\/?$/.test(aHref);
}
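// 3. matchesTitleBrand: alt/aria-label/title contains the brand
// segment of the page title, i.e. the text before the first "-",
// "|" or "—" separator (the "alt=HeyGen" / "aria-label=Huly" pattern).
// The split does not require spaces around the separator, so a
// hyphenated brand such as "Coca-Cola" is cut at its hyphen.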
var titleBrand = (document.title || '').split(/[-|—]/)[0].trim();
if (desc && titleBrand.length > 1 && titleBrand.length < 30 &&
desc.toLowerCase().indexOf(titleBrand.toLowerCase()) !== -1) {
ctx.matchesTitleBrand = true;
}
return ctx;
}

Expand Down Expand Up @@ -92,12 +121,18 @@ export async function catalogAssets(page: Page): Promise<CatalogedAsset[]> {
if (notes && !entry.notes) {
entry.notes = notes;
}
// Merge rich context (first one wins)
// Merge rich context. Text fields: first-occurrence wins. Boolean
// signals (inBanner / inHomeLink / matchesTitleBrand): any positive
// sample wins — if ANY DOM occurrence of this URL is in a header,
// the URL is a header-context asset.
if (richCtx) {
if (richCtx.description && !entry.description) entry.description = richCtx.description;
if (richCtx.nearestHeading && !entry.nearestHeading) entry.nearestHeading = richCtx.nearestHeading;
if (richCtx.sectionClasses && !entry.sectionClasses) entry.sectionClasses = richCtx.sectionClasses;
if (richCtx.aboveFold !== undefined && entry.aboveFold === undefined) entry.aboveFold = richCtx.aboveFold;
if (richCtx.inBanner) entry.inBanner = true;
if (richCtx.inHomeLink) entry.inHomeLink = true;
if (richCtx.matchesTitleBrand) entry.matchesTitleBrand = true;
}
}

Expand Down Expand Up @@ -324,6 +359,10 @@ function deduplicateSrcsetVariants(assets: CatalogedAsset[]): CatalogedAsset[] {
if (a.notes && !existing.notes) {
existing.notes = a.notes;
}
// Boolean logo signals: any positive sample wins through the merge.
if (a.inBanner) existing.inBanner = true;
if (a.inHomeLink) existing.inHomeLink = true;
if (a.matchesTitleBrand) existing.matchesTitleBrand = true;
// Keep the URL with highest w= value (largest image)
const existingW = getWidthParam(existing.url);
const newW = getWidthParam(a.url);
Expand Down
60 changes: 54 additions & 6 deletions packages/cli/src/capture/assetDownloader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,22 @@

import { writeFileSync, mkdirSync } from "node:fs";
import { join, extname } from "node:path";
import { createHash } from "node:crypto";
import type { DesignTokens, DownloadedAsset } from "./types.js";
import type { CatalogedAsset } from "./assetCataloger.js";

/**
 * Builds a filename slug for an SVG from a digest of its content.
 *
 * The slug is `<prefix>-<hash>`, where `<hash>` is the first 8 hex characters
 * of the SHA-1 digest of `svgSource`, and `<prefix>` is `logo` when `isLogo`
 * is true and `svg` otherwise. Because the name is derived from the bytes
 * themselves rather than from a DOM label, identical content always maps to
 * the same slug and a name cannot end up attached to a different SVG body.
 *
 * @param svgSource - SVG markup as a string, or the raw file bytes.
 * @param isLogo - Whether the caller has evidence that the SVG is a logo.
 * @returns The slug without a file extension, e.g. `logo-a9993e36`.
 */
function svgContentHashSlug(svgSource: string | Buffer, isLogo: boolean): string {
  const prefix = isLogo ? "logo" : "svg";
  const hexDigest = createHash("sha1").update(svgSource).digest("hex");
  // Only the leading 8 hex characters are kept to keep filenames short.
  const shortHash = hexDigest.slice(0, 8);
  return `${prefix}-${shortHash}`;
}

export async function downloadAssets(
tokens: DesignTokens,
outputDir: string,
Expand All @@ -22,15 +35,20 @@
const assets: DownloadedAsset[] = [];
const downloadedUrls = new Set<string>();

// 1. ALL inline SVGs — save as files (logos get priority naming)
// 1. ALL inline SVGs — save as files. Names are content-hash based
// (`svg-<hash>.svg` or `logo-<hash>.svg`) so the filename can never
// drift from the SVG body. The DOM-derived `label` is unreliable —
// it has misassigned `heygen-logo.svg` to the Google partner SVG in
// past captures because aria-label / nearest-heading inference can
// pick up text from the wrong ancestor. Content-hash is invariant.
mkdirSync(join(outputDir, "assets", "svgs"), { recursive: true });
const usedSvgNames = new Set<string>();
for (let i = 0; i < tokens.svgs.length && i < 30; i++) {
const svg = tokens.svgs[i]!;
if (!svg.outerHTML || svg.outerHTML.length < 50) continue;
const label = svg.label?.replace(/[^a-zA-Z0-9-_ ]/g, "").trim();
let slug = label ? slugify(label) : svg.isLogo ? `logo-${i}` : `icon-${i}`;
// Deduplicate — two SVGs with same aria-label get suffixed
const slug = svgContentHashSlug(svg.outerHTML, !!svg.isLogo);
// Hash collisions are negligible for 8-char sha1 prefix over <30 SVGs,
// but suffix-dedupe anyway for safety + idempotent re-runs.
let finalSlug = slug;
let suffix = 2;
while (usedSvgNames.has(finalSlug)) {
Expand Down Expand Up @@ -135,8 +153,38 @@
if (result.status !== "fulfilled" || !result.value) continue;
const { url, isPoster, parsedUrl, ext, buffer, catalog } = result.value;
try {
// Generate human-readable name from catalog context
const slug = deriveAssetName(parsedUrl, catalog, isPoster, imgIdx, usedNames);
// SVGs use content-hash names because catalog-derived slugs
// mis-assigned brand names to the wrong SVG bodies (the same
// alignment failure that produced `heygen-logo.svg` containing
// the Google wordmark). Rasters keep the catalog-derived
// human-readable slug — they were not affected by the bug.
let slug: string;
if (ext === ".svg") {
// isLogo signals — broadened. The original `contexts` substring
// check never fired in practice because contexts hold HTML
// positions like 'img[src]' / 'video[poster]', not semantic
// labels. Real signals come from DOM structure + alt/aria text:
// 1. The cataloger now flags inBanner (inside <header>/<nav>/
// [role=banner]), inHomeLink (inside <a href="/">), and
// matchesTitleBrand (alt/aria matches document.title's
// brand segment) — see assetCataloger.ts getElementContext.
// 2. As a backstop, also check description / nearestHeading /
// sectionClasses for "logo" / "brand" / "wordmark" text.
const c = catalog;
const brandRe = /logo|brand|wordmark/i;
const isLogo = !!(
c?.inBanner ||
c?.inHomeLink ||
c?.matchesTitleBrand ||
c?.contexts?.some((s) => brandRe.test(s)) ||
(c?.description && brandRe.test(c.description)) ||
(c?.nearestHeading && brandRe.test(c.nearestHeading)) ||
(c?.sectionClasses && brandRe.test(c.sectionClasses))
);
slug = svgContentHashSlug(buffer, isLogo);
} else {
slug = deriveAssetName(parsedUrl, catalog, isPoster, imgIdx, usedNames);
}
const name = `${slug}${ext}`;
usedNames.add(slug);
const localPath = `assets/${name}`;
Expand Down Expand Up @@ -328,7 +376,7 @@
let current = url;
for (let hop = 0; hop <= MAX_FETCH_REDIRECTS; hop++) {
if (isPrivateUrl(current)) return null;
const res = await fetch(current, { ...init, redirect: "manual" });

Check warning

Code scanning / CodeQL

File data in outbound network request Medium

Outbound network request depends on
file data
.
if (res.status >= 300 && res.status < 400) {
const loc = res.headers.get("location");
if (!loc) return res;
Expand Down
102 changes: 85 additions & 17 deletions packages/cli/src/capture/contentExtractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import type { Page } from "puppeteer-core";
import { existsSync, readdirSync, statSync, readFileSync } from "node:fs";
import { join } from "node:path";
import sharp from "sharp";
import type { CatalogedAsset } from "./assetCataloger.js";
import type { DesignTokens } from "./types.js";

Expand Down Expand Up @@ -232,7 +233,12 @@ export async function captionImagesWithGemini(
}
progress("design", `${Object.keys(geminiCaptions).length} images captioned with Gemini`);

// Caption SVGs by sending source code as text (vision API rejects image/svg+xml).
// Caption SVGs by RENDERING each to PNG via sharp first, then sending the
// PNG bytes to the Vision API — same call shape as raster images.
// Previous implementation sent SVG path markup as TEXT, which produced
// pure hallucinations on wordmarks (`hubspot-logo.svg` → "VIVIENNE",
// `huly-logo.svg` → "Kube", `workday.svg` → "wrestling"). Vision models
// can't reliably mental-render path commands; they need actual pixels.
const svgFiles: Array<{ file: string; relPath: string }> = [];
const assetsDir = join(outputDir, "assets");
for (const f of readdirSync(assetsDir)) {
Expand All @@ -246,30 +252,63 @@ export async function captionImagesWithGemini(
}

if (svgFiles.length > 0) {
progress("design", `Captioning ${svgFiles.length} SVGs via code analysis...`);
progress("design", `Rasterizing + captioning ${svgFiles.length} SVGs via vision API...`);
const SVG_BATCH = 20;
const MAX_SVG_CHARS = 10_000;
const SVG_RENDER_SIZE = 256; // px — enough resolution for Gemini to read wordmarks, small enough to keep payload sub-MB
for (let i = 0; i < svgFiles.length; i += SVG_BATCH) {
const batch = svgFiles.slice(i, i + SVG_BATCH);
const results = await Promise.allSettled(
batch.map(async ({ relPath }) => {
const filePath = join(assetsDir, relPath);
let svgText = readFileSync(filePath, "utf-8");
if (svgText.length > MAX_SVG_CHARS) {
svgText = svgText.slice(0, MAX_SVG_CHARS) + "\n<!-- truncated -->";
let pngBase64: string;
try {
// Detect SVG fill polarity so we can pick a contrasting flatten
// background. White-glyph SVGs (huly's "✕ huly" wordmark uses
// fill="#fff") render invisible against white; dark-glyph SVGs
// render invisible against black. Choosing the background by
// dominant fill keeps both polarities readable for the vision API.
const svgSource = readFileSync(filePath, "utf-8");
const lightFillHits = (
svgSource.match(/fill\s*=\s*["'](#fff(fff)?|white|#f[ef][ef])["']/gi) || []
).length;
const darkFillHits = (
svgSource.match(/fill\s*=\s*["'](#000(000)?|black|#[0-3]{6}|#[0-3]{3})["']/gi) || []
).length;
const bg =
lightFillHits > darkFillHits
? { r: 32, g: 32, b: 32 } // dark slate behind light glyphs
: { r: 255, g: 255, b: 255 }; // white behind dark glyphs (default)
// sharp rasterizes SVG → PNG natively.
const pngBuffer = await sharp(filePath)
.resize({
width: SVG_RENDER_SIZE,
height: SVG_RENDER_SIZE,
fit: "inside",
withoutEnlargement: false,
})
.flatten({ background: bg })
.png()
.toBuffer();
pngBase64 = pngBuffer.toString("base64");
} catch {
// SVG rasterization can fail on exotic features (external fonts,
// foreignObject, filters with missing primitives). Skip caption
// rather than block — agent will fall back to contact-sheet view.
return { file: relPath, caption: "" };
}
const response = await ai.models.generateContent({
model,
contents: [
{
role: "user",
parts: [
{ inlineData: { mimeType: "image/png", data: pngBase64 } },
{
text:
"This SVG code is from a website. Describe what it renders in ONE short sentence " +
"for a video storyboard. Focus on: what shape/icon/illustration it is, its colors. " +
"Be factual.\n\n" +
svgText,
"Describe this SVG asset rendered from a website in ONE short sentence for a video storyboard. " +
"Focus on: what shape/icon/illustration/wordmark it is, its colors, any text it contains. " +
"If you see a wordmark, READ THE LETTERS LITERALLY — do not guess a brand from context. " +
"Be factual.",
},
],
},
Expand Down Expand Up @@ -334,13 +373,28 @@ export function generateAssetDescriptions(
const heading = catalogMatch?.nearestHeading || "";
const section = catalogMatch?.sectionClasses || "";
const aboveFold = catalogMatch?.aboveFold ? "above fold" : "";
// Logo signals — let the no-Gemini fallback still surface logos
// grep-ably even when Vision wasn't available to describe them.
const isLikelyLogo = !!(
catalogMatch?.inBanner ||
catalogMatch?.inHomeLink ||
catalogMatch?.matchesTitleBrand ||
/logo|brand|wordmark/i.test(desc) ||
/logo|brand|wordmark/i.test(section) ||
file.includes("logo")
);
const geminiCaption = geminiCaptions[file];
const cleanName = file.replace(/\.[^.]+$/, "").replace(/[-_]/g, " ");
const parts = [`${file} — ${sizeKb}KB`];
if (geminiCaption) {
// Even with Gemini's description, prepend the LOGO tag if
// structural signals fired — gives a stable grep target for
// agents searching for "the logo."
if (isLikelyLogo) parts.push("LOGO");
parts.push(geminiCaption);
captionedLines.push(parts.join(", "));
} else {
if (isLikelyLogo) parts.push("LOGO");
if (desc) parts.push(`"${desc.slice(0, 80)}"`);
if (heading) parts.push(`section: "${heading.slice(0, 60)}"`);
else if (section) parts.push(`in: ${section.split(" ").slice(0, 3).join(" ")}`);
Expand All @@ -358,11 +412,6 @@ export function generateAssetDescriptions(
const svgsPath = join(assetsPath, "svgs");
for (const file of readdirSync(svgsPath)) {
if (!file.endsWith(".svg")) continue;
const geminiCaption = geminiCaptions[`svgs/${file}`];
if (geminiCaption) {
svgLines.push(`svgs/${file} — ${geminiCaption}`);
continue;
}
const svgMatch = tokens.svgs.find(
(s) =>
s.label &&
Expand All @@ -373,9 +422,28 @@ export function generateAssetDescriptions(
.slice(0, 15),
),
);
// Filename prefix is now the most reliable logo signal: the
// capture pipeline names DOM-marked logos `logo-<hash>.svg` and
// everything else `svg-<hash>.svg`. Fall back to the tokens.svgs
// isLogo flag for legacy captures + a filename-includes-"logo"
// check for human-readable rasters.
//
// Compute this BEFORE the Gemini-caption branch so SVG logos that
// got Vision captions still receive the LOGO marker — without it
// an inline header `<svg>` named `logo-<hash>.svg` would land in
// asset-descriptions.md as plain text, defeating the LOGO grep.
const isLogo = file.startsWith("logo-") || svgMatch?.isLogo || file.includes("logo");
const geminiCaption = geminiCaptions[`svgs/${file}`];
if (geminiCaption) {
const prefix = isLogo ? "LOGO: " : "";
svgLines.push(`svgs/${file} — ${prefix}${geminiCaption}`);
continue;
}
const label = svgMatch?.label || file.replace(".svg", "").replace(/-/g, " ");
const isLogo = svgMatch?.isLogo || file.includes("logo");
svgLines.push(`svgs/${file} — ${isLogo ? "logo: " : "icon: "}${label}`);
// Use uppercase "LOGO:" so agents can grep for it as a single,
// unambiguous token. The lowercase "logo:" prefix was easy to miss
// since real Vision captions also use the word casually.
svgLines.push(`svgs/${file} — ${isLogo ? "LOGO: " : "icon: "}${label}`);
}
} catch {
/* no svgs dir */
Expand Down
13 changes: 9 additions & 4 deletions packages/cli/src/capture/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -579,14 +579,19 @@ export async function captureWebsite(
const lines = generateAssetDescriptions(outputDir, tokens, catalogedAssets, geminiCaptions);

if (lines.length > 0) {
const hasGeminiKey = !!(process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY);
const header = hasGeminiKey
? "# Asset Descriptions\n\nOne line per file. Read this instead of opening every image individually. Lines tagged `LOGO` are the brand-mark candidates — search for the brand name here BEFORE composing a logo from scratch.\n\n"
: "# Asset Descriptions\n\n⚠️ GEMINI_API_KEY not set — descriptions below are catalog-derived (alt text, headings, section context, filename) instead of Vision-generated. Lines tagged `LOGO` are the brand-mark candidates per DOM-structural signals (inside header/nav, inside home anchor, or alt-text matching the page title). To get richer Vision descriptions on the next capture, set GEMINI_API_KEY (or GOOGLE_API_KEY) and re-run.\n\nWhen the description is too weak to identify a captured logo by description alone, open the LOGO-tagged SVGs in a previewer or `sharp`-render them to PNG before referencing — the alternative (composing a fake logo) ships off-brand in the final video.\n\n";
writeFileSync(
join(outputDir, "extracted", "asset-descriptions.md"),
"# Asset Descriptions\n\nOne line per file. Read this instead of opening every image individually.\n\n" +
lines.map((l) => "- " + l).join("\n") +
"\n",
header + lines.map((l) => "- " + l).join("\n") + "\n",
"utf-8",
);
progress("design", `${lines.length} asset descriptions written`);
progress(
"design",
`${lines.length} asset descriptions written${hasGeminiKey ? "" : " (no Gemini key — catalog-fallback mode)"}`,
);
}
} catch {
/* non-critical */
Expand Down
Loading
Loading