From 912a9837666234dba08a978198e8251d2da41778 Mon Sep 17 00:00:00 2001 From: waleed Date: Sun, 14 Jun 2026 21:08:16 -0700 Subject: [PATCH 1/6] fix(uploads): authorize internal file URLs before download downloadFileFromUrl treated any URL containing /api/files/serve/ as trusted-internal and read the object straight from storage by key with no access check, while every other resolution path in the file calls verifyFileAccess. Reachable during workflow execution via file[] inputs (type: 'url'), letting an authenticated user read arbitrary storage objects across tenants by supplying a storage key. Thread the caller's userId into downloadFileFromUrl and run verifyFileAccess(key, userId, undefined, context, false) on the resolved key before downloadFile; fail closed when no userId is present. Update all callers (execution file inputs, tool file outputs, KB ingestion); webhook and chat inputs already thread userId via processExecutionFiles. --- .../sim/executor/utils/file-tool-processor.ts | 2 +- apps/sim/lib/execution/files.ts | 2 +- .../knowledge/documents/document-processor.ts | 45 ++++++++++++------- .../lib/uploads/utils/file-utils.server.ts | 35 +++++++++++++-- 4 files changed, 62 insertions(+), 22 deletions(-) diff --git a/apps/sim/executor/utils/file-tool-processor.ts b/apps/sim/executor/utils/file-tool-processor.ts index 1fa86ffc731..eabac4b35c8 100644 --- a/apps/sim/executor/utils/file-tool-processor.ts +++ b/apps/sim/executor/utils/file-tool-processor.ts @@ -138,7 +138,7 @@ export class FileToolProcessor { } if (!buffer && data.url) { - buffer = await downloadFileFromUrl(data.url) + buffer = await downloadFileFromUrl(data.url, { userId: context.userId }) } if (buffer) { diff --git a/apps/sim/lib/execution/files.ts b/apps/sim/lib/execution/files.ts index c0b69446fcf..e23582aba18 100644 --- a/apps/sim/lib/execution/files.ts +++ b/apps/sim/lib/execution/files.ts @@ -57,7 +57,7 @@ export async function processExecutionFile( if (file.type === 'url' && file.data) { const { downloadFileFromUrl } = await import('@/lib/uploads/utils/file-utils.server') - const buffer = await downloadFileFromUrl(file.data) + const buffer = await downloadFileFromUrl(file.data, { userId }) if (buffer.length > MAX_FILE_SIZE) { const fileSizeMB = (buffer.length / (1024 * 1024)).toFixed(2) diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index 33ee06eb11b..63f0bb7eae4 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -295,7 +295,7 @@ async function parseDocument( if (isPDF && (hasAzureMistralOCR || hasMistralOCR)) { if (hasAzureMistralOCR) { logger.info(`Using Azure Mistral OCR: ${filename}`) - return parseWithAzureMistralOCR(fileUrl, filename, mimeType) + return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId) } if (hasMistralOCR) { @@ -305,7 +305,7 @@ async function parseDocument( } logger.info(`Using file parser: ${filename}`) - return parseWithFileParser(fileUrl, filename, mimeType) + return parseWithFileParser(fileUrl, filename, mimeType, userId) } async function handleFileForOCR( @@ -321,7 +321,7 @@ async function handleFileForOCR( if (mimeType === 'application/pdf') { logger.info(`handleFileForOCR: Downloading external PDF to check page count`) try { - const buffer = await downloadFileWithTimeout(fileUrl) + const buffer = await downloadFileWithTimeout(fileUrl, userId) logger.info(`handleFileForOCR: Downloaded external PDF: ${buffer.length} bytes`) return { httpsUrl: fileUrl, buffer } } catch (error) { @@ -340,7 +340,7 @@ async function handleFileForOCR( logger.info(`Uploading "${filename}" to cloud storage for OCR`) - const buffer = await downloadFileWithTimeout(fileUrl) + const buffer = await downloadFileWithTimeout(fileUrl, userId) logger.info(`Downloaded ${filename}: ${buffer.length} bytes`) @@ -380,11 +380,11 @@ async function handleFileForOCR( } } -async function downloadFileWithTimeout(fileUrl: string): Promise { - return downloadFileFromUrl(fileUrl, TIMEOUTS.FILE_DOWNLOAD) +async function downloadFileWithTimeout(fileUrl: string, userId?: string): Promise { + return downloadFileFromUrl(fileUrl, { timeoutMs: TIMEOUTS.FILE_DOWNLOAD, userId }) } -async function downloadFileForBase64(fileUrl: string): Promise { +async function downloadFileForBase64(fileUrl: string, userId?: string): Promise { if (/^data:/i.test(fileUrl)) { const [, base64Data] = fileUrl.split(',') if (!base64Data) { @@ -393,7 +393,7 @@ async function downloadFileForBase64(fileUrl: string): Promise { return Buffer.from(base64Data, 'base64') } if (/^https?:\/\//i.test(fileUrl)) { - return downloadFileWithTimeout(fileUrl) + return downloadFileWithTimeout(fileUrl, userId) } throw new Error('Unsupported fileUrl scheme: only data: URIs and http(s):// URLs are allowed') } @@ -468,7 +468,12 @@ async function makeOCRRequest( } } -async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeType: string) { +async function parseWithAzureMistralOCR( + fileUrl: string, + filename: string, + mimeType: string, + userId?: string +) { validateOCRConfig( env.OCR_AZURE_API_KEY, env.OCR_AZURE_ENDPOINT, @@ -476,7 +481,7 @@ async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeT 'Azure Mistral OCR' ) - const fileBuffer = await downloadFileForBase64(fileUrl) + const fileBuffer = await downloadFileForBase64(fileUrl, userId) if (mimeType === 'application/pdf') { const pageCount = await getPdfPageCount(fileBuffer) @@ -485,7 +490,7 @@ async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeT `PDF has ${pageCount} pages, exceeds Azure OCR limit of ${MISTRAL_MAX_PAGES}. ` + `Falling back to file parser.` ) - return parseWithFileParser(fileUrl, filename, mimeType) + return parseWithFileParser(fileUrl, filename, mimeType, userId) } logger.info(`Azure Mistral OCR: PDF page count for ${filename}: ${pageCount}`) } @@ -529,7 +534,7 @@ async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeT }) logger.info(`Falling back to file parser: ${filename}`) - return parseWithFileParser(fileUrl, filename, mimeType) + return parseWithFileParser(fileUrl, filename, mimeType, userId) } } @@ -589,7 +594,7 @@ async function parseWithMistralOCR( }) logger.info(`Falling back to file parser: ${filename}`) - return parseWithFileParser(fileUrl, filename, mimeType) + return parseWithFileParser(fileUrl, filename, mimeType, userId) } } @@ -773,7 +778,12 @@ async function processMistralOCRInBatches( } } -async function parseWithFileParser(fileUrl: string, filename: string, mimeType: string) { +async function parseWithFileParser( + fileUrl: string, + filename: string, + mimeType: string, + userId?: string +) { try { let content: string let metadata: FileParseMetadata = {} @@ -781,7 +791,7 @@ async function parseWithFileParser(fileUrl: string, filename: string, mimeType: if (/^data:/i.test(fileUrl)) { content = await parseDataURI(fileUrl, filename, mimeType) } else if (/^https?:\/\//i.test(fileUrl)) { - const result = await parseHttpFile(fileUrl, filename, mimeType) + const result = await parseHttpFile(fileUrl, filename, mimeType, userId) content = result.content metadata = result.metadata || {} } else { @@ -820,9 +830,10 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string) async function parseHttpFile( fileUrl: string, filename: string, - mimeType?: string + mimeType?: string, + userId?: string ): Promise<{ content: string; metadata?: FileParseMetadata }> { - const buffer = await downloadFileWithTimeout(fileUrl) + const buffer = await downloadFileWithTimeout(fileUrl, userId) const extension = resolveParserExtension(filename, mimeType) const result = await parseBuffer(buffer, extension) diff --git a/apps/sim/lib/uploads/utils/file-utils.server.ts b/apps/sim/lib/uploads/utils/file-utils.server.ts index f0fb7a606a6..752733066ae 100644 --- a/apps/sim/lib/uploads/utils/file-utils.server.ts +++ b/apps/sim/lib/uploads/utils/file-utils.server.ts @@ -137,20 +137,49 @@ export async function resolveFileInputToUrl( return { error: { status: 400, message: 'File input is required' } } } +/** + * Options for {@link downloadFileFromUrl}. + */ +export interface DownloadFileFromUrlOptions { + /** Download timeout for external URLs. Defaults to the max execution timeout. */ + timeoutMs?: number + /** Hard cap on the number of bytes read from the source. */ + maxBytes?: number + /** + * Principal the download is performed on behalf of. Required to authorize + * internal (`/api/files/serve/...`) URLs: the resolved storage key is checked + * with {@link verifyFileAccess} before any bytes are read. Without it, internal + * URLs are rejected (fail closed) so a `/api/files/serve/` substring can never + * be treated as implicitly trusted. + */ + userId?: string +} + /** * Download a file from a URL (internal or external) - * For internal URLs, uses direct storage access (server-side only) + * For internal URLs, uses direct storage access (server-side only) after + * authorizing the resolved storage key against `userId` * For external URLs, validates DNS/SSRF and uses secure fetch with IP pinning */ export async function downloadFileFromUrl( fileUrl: string, - timeoutMs = getMaxExecutionTimeout(), - maxBytes?: number + options: DownloadFileFromUrlOptions = {} ): Promise { + const { timeoutMs = getMaxExecutionTimeout(), maxBytes, userId } = options const { parseInternalFileUrl } = await import('./file-utils') if (isInternalFileUrl(fileUrl)) { const { key, context } = parseInternalFileUrl(fileUrl) + + if (!userId) { + throw new Error('Access denied: internal file URL requires an authenticated user') + } + + const hasAccess = await verifyFileAccess(key, userId, undefined, context, false) + if (!hasAccess) { + throw new Error('Access denied: file not found or insufficient permissions') + } + const { downloadFile } = await import('@/lib/uploads/core/storage-service') return downloadFile({ key, context, maxBytes }) } From 41ae71164e3537bdfd1c87c621a34446d5649e51 Mon Sep 17 00:00:00 2001 From: waleed Date: Mon, 15 Jun 2026 11:41:35 -0700 Subject: [PATCH 2/6] chore(uploads): log denied internal file downloads for rollout telemetry --- apps/sim/lib/uploads/utils/file-utils.server.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/sim/lib/uploads/utils/file-utils.server.ts b/apps/sim/lib/uploads/utils/file-utils.server.ts index 752733066ae..99b132d6c94 100644 --- a/apps/sim/lib/uploads/utils/file-utils.server.ts +++ b/apps/sim/lib/uploads/utils/file-utils.server.ts @@ -1,6 +1,6 @@ 'use server' -import type { Logger } from '@sim/logger' +import { createLogger, type Logger } from '@sim/logger' import { getErrorMessage } from '@sim/utils/errors' import { getMaxExecutionTimeout } from '@/lib/core/execution-limits' import { @@ -25,6 +25,8 @@ import { import { verifyFileAccess } from '@/app/api/files/authorization' import type { UserFile } from '@/executor/types' +const logger = createLogger('FileUtilsServer') + /** * Result type for file input resolution */ @@ -172,11 +174,13 @@ export async function downloadFileFromUrl( const { key, context } = parseInternalFileUrl(fileUrl) if (!userId) { + logger.warn('Internal file download denied: no userId provided', { key, context }) throw new Error('Access denied: internal file URL requires an authenticated user') } const hasAccess = await verifyFileAccess(key, userId, undefined, context, false) if (!hasAccess) { + logger.warn('Internal file download denied: access check failed', { key, context, userId }) throw new Error('Access denied: file not found or insufficient permissions') } From e9b2e11ccd04d68a61835a7b108758dc9854fb18 Mon Sep 17 00:00:00 2001 From: waleed Date: Mon, 15 Jun 2026 11:47:31 -0700 Subject: [PATCH 3/6] fix(uploads): derive internal file context from key, not query param Cursor Bugbot flagged a context-spoofing bypass: downloadFileFromUrl resolved context via parseInternalFileUrl, which honors a caller-controlled ?context= query param. An attacker could label a private storage key with a world-readable context (profile-pictures/og-images/workspace-logos) so verifyFileAccess short-circuits to granted while downloadFile still reads the private object. Infer context from the key only (inferContextFromKey), mirroring how /api/files/serve resolves it; ignore the query param. Also move the userId guard ahead of key resolution so auth failures surface first. --- .../sim/lib/uploads/utils/file-utils.server.ts | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/apps/sim/lib/uploads/utils/file-utils.server.ts b/apps/sim/lib/uploads/utils/file-utils.server.ts index 99b132d6c94..637eba912f5 100644 --- a/apps/sim/lib/uploads/utils/file-utils.server.ts +++ b/apps/sim/lib/uploads/utils/file-utils.server.ts @@ -168,16 +168,26 @@ export async function downloadFileFromUrl( options: DownloadFileFromUrlOptions = {} ): Promise { const { timeoutMs = getMaxExecutionTimeout(), maxBytes, userId } = options - const { parseInternalFileUrl } = await import('./file-utils') if (isInternalFileUrl(fileUrl)) { - const { key, context } = parseInternalFileUrl(fileUrl) - if (!userId) { - logger.warn('Internal file download denied: no userId provided', { key, context }) + logger.warn('Internal file download denied: no userId provided', { fileUrl }) throw new Error('Access denied: internal file URL requires an authenticated user') } + const key = extractStorageKey(fileUrl) + if (!key) { + logger.warn('Internal file download denied: could not resolve storage key', { fileUrl }) + throw new Error('Access denied: could not resolve internal file key') + } + + // Derive the storage context from the key itself, never from a caller-controlled + // `?context=` query param. Trusting the param lets a private key be labeled with a + // world-readable context (e.g. profile-pictures), so verifyFileAccess short-circuits + // to granted while downloadFile still reads the private object. This mirrors how + // /api/files/serve resolves context (inferContextFromKey only). + const context = inferContextFromKey(key) + const hasAccess = await verifyFileAccess(key, userId, undefined, context, false) if (!hasAccess) { logger.warn('Internal file download denied: access check failed', { key, context, userId }) From a4bba94e783c93dabfa81c441808222b3ece4700 Mon Sep 17 00:00:00 2001 From: waleed Date: Mon, 15 Jun 2026 11:51:01 -0700 Subject: [PATCH 4/6] docs(uploads): move context-derivation rationale into TSDoc --- .../sim/lib/uploads/utils/file-utils.server.ts | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/apps/sim/lib/uploads/utils/file-utils.server.ts b/apps/sim/lib/uploads/utils/file-utils.server.ts index 637eba912f5..b5460c8440a 100644 --- a/apps/sim/lib/uploads/utils/file-utils.server.ts +++ b/apps/sim/lib/uploads/utils/file-utils.server.ts @@ -158,10 +158,17 @@ export interface DownloadFileFromUrlOptions { } /** - * Download a file from a URL (internal or external) + * Download a file from a URL (internal or external). + * * For internal URLs, uses direct storage access (server-side only) after - * authorizing the resolved storage key against `userId` - * For external URLs, validates DNS/SSRF and uses secure fetch with IP pinning + * authorizing the resolved storage key against `userId`. Context is derived + * from the key via {@link inferContextFromKey}, never from a caller-controlled + * `?context=` query param — trusting the param would let a private key be + * labeled with a world-readable context (e.g. profile-pictures) so + * {@link verifyFileAccess} short-circuits to granted while the private object is + * still read. This mirrors how `/api/files/serve` resolves context. + * + * For external URLs, validates DNS/SSRF and uses secure fetch with IP pinning. */ export async function downloadFileFromUrl( fileUrl: string, @@ -181,11 +188,6 @@ export async function downloadFileFromUrl( throw new Error('Access denied: could not resolve internal file key') } - // Derive the storage context from the key itself, never from a caller-controlled - // `?context=` query param. Trusting the param lets a private key be labeled with a - // world-readable context (e.g. profile-pictures), so verifyFileAccess short-circuits - // to granted while downloadFile still reads the private object. This mirrors how - // /api/files/serve resolves context (inferContextFromKey only). const context = inferContextFromKey(key) const hasAccess = await verifyFileAccess(key, userId, undefined, context, false) From 9f47c1487cd3598545464ab08172f2df2a94067e Mon Sep 17 00:00:00 2001 From: waleed Date: Mon, 15 Jun 2026 12:02:04 -0700 Subject: [PATCH 5/6] fix(uploads): match internal file marker in URL path only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit isInternalFileUrl matched the /api/files/serve/ substring anywhere in the string, so a crafted URL could carry it in a query string or fragment and skip DNS/SSRF validation. Match it in the path component only. The raw path is checked without URL normalization on purpose: the files parse route relies on traversal sequences surviving this check (an absolute https://host/api/files/serve/../.. URL must classify as internal so the '..' check rejects it, rather than being normalized to /etc/... and waved through as external). Host is intentionally not gated — cross-tenant reads are prevented at the storage sink by verifyFileAccess, and host-allowlisting would break self-hosted/multi-domain deployments. Adds unit tests. --- apps/sim/lib/uploads/utils/file-utils.test.ts | 37 ++++++++++++++++++- apps/sim/lib/uploads/utils/file-utils.ts | 27 +++++++++++++- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/apps/sim/lib/uploads/utils/file-utils.test.ts b/apps/sim/lib/uploads/utils/file-utils.test.ts index f5e3c45ebde..63793e0e144 100644 --- a/apps/sim/lib/uploads/utils/file-utils.test.ts +++ b/apps/sim/lib/uploads/utils/file-utils.test.ts @@ -2,7 +2,42 @@ * @vitest-environment node */ import { describe, expect, it } from 'vitest' -import { isAbortError, isNetworkError } from '@/lib/uploads/utils/file-utils' +import { isAbortError, isInternalFileUrl, isNetworkError } from '@/lib/uploads/utils/file-utils' + +describe('isInternalFileUrl', () => { + it('classifies relative serve paths as internal', () => { + expect(isInternalFileUrl('/api/files/serve/kb/123-file.pdf')).toBe(true) + expect(isInternalFileUrl('/api/files/serve/workspace/ws-1/file.txt?context=workspace')).toBe( + true + ) + }) + + it('classifies absolute serve URLs as internal regardless of host', () => { + expect(isInternalFileUrl('https://www.sim.ai/api/files/serve/kb/x.pdf')).toBe(true) + expect(isInternalFileUrl('http://localhost:3000/api/files/serve/blob/kb/x')).toBe(true) + // Host is not used to gate (self-hosted/multi-domain); the storage sink authorizes. + expect(isInternalFileUrl('https://other-host/api/files/serve/workspace/v/x')).toBe(true) + }) + + it('does not match the marker outside the path (query/fragment)', () => { + expect(isInternalFileUrl('https://evil.com/x?next=/api/files/serve/secret')).toBe(false) + expect(isInternalFileUrl('https://evil.com/page#/api/files/serve/secret')).toBe(false) + expect(isInternalFileUrl('https://evil.com/redirect?u=/api/files/serve/kb/x')).toBe(false) + }) + + it('preserves traversal sequences so they survive downstream rejection', () => { + // Must stay internal (not normalized away) so the parse route applies its `..` check. + expect(isInternalFileUrl('https://attacker.com/api/files/serve/../../../etc/passwd')).toBe(true) + expect(isInternalFileUrl('/api/files/serve/../../app.js')).toBe(true) + }) + + it('returns false for non-internal and non-string inputs', () => { + expect(isInternalFileUrl('https://example.com/file.pdf')).toBe(false) + expect(isInternalFileUrl('data:text/plain;base64,abc')).toBe(false) + // @ts-expect-error verifying runtime guard + expect(isInternalFileUrl(undefined)).toBe(false) + }) +}) describe('isAbortError', () => { it('returns true for AbortError-named errors', () => { diff --git a/apps/sim/lib/uploads/utils/file-utils.ts b/apps/sim/lib/uploads/utils/file-utils.ts index d11fdf1b70f..e99d83c0564 100644 --- a/apps/sim/lib/uploads/utils/file-utils.ts +++ b/apps/sim/lib/uploads/utils/file-utils.ts @@ -534,10 +534,33 @@ export function extractStorageKey(filePath: string): string { } /** - * Check if a URL is an internal file serve URL + * Whether a URL targets the internal file-serve endpoint (`/api/files/serve/`). + * + * The marker is matched only in the URL's path component, so it cannot be + * smuggled through a query string or fragment (e.g. + * `https://evil.com/x?next=/api/files/serve/...`) to skip DNS/SSRF validation. + * + * The raw path is inspected without URL normalization on purpose: callers such + * as the files parse route rely on traversal sequences (`..`) surviving this + * check so they are rejected downstream rather than collapsed away. A path-only + * marker still classifies any host as internal (e.g. + * `https://other-host/api/files/serve/`); cross-tenant reads are prevented + * at the storage sink by {@link verifyFileAccess}, not by host matching, which + * would break self-hosted and multi-domain deployments. */ export function isInternalFileUrl(fileUrl: string): boolean { - return fileUrl.includes('/api/files/serve/') + if (typeof fileUrl !== 'string') { + return false + } + + let path = fileUrl + const scheme = /^[a-z][a-z0-9+.-]*:\/\/[^/?#]*/i.exec(path) + if (scheme) { + path = path.slice(scheme[0].length) + } + path = path.split(/[?#]/, 1)[0] + + return path.startsWith('/api/files/serve/') } /** From bb1d384d523a0d3a839bc8e60861888c137ce68c Mon Sep 17 00:00:00 2001 From: Vikhyath Mondreti Date: Mon, 15 Jun 2026 16:34:53 -0700 Subject: [PATCH 6/6] consolidate access, billing principals --- apps/sim/lib/knowledge/documents/service.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/apps/sim/lib/knowledge/documents/service.ts b/apps/sim/lib/knowledge/documents/service.ts index 307db69bc7c..bdfe3e5d557 100644 --- a/apps/sim/lib/knowledge/documents/service.ts +++ b/apps/sim/lib/knowledge/documents/service.ts @@ -515,7 +515,6 @@ export async function processDocumentAsync( // KB config + workspace billing + doc tags in one JOIN (was 3 SELECTs). const contextRows = await db .select({ - userId: knowledgeBase.userId, workspaceId: knowledgeBase.workspaceId, chunkingConfig: knowledgeBase.chunkingConfig, embeddingModel: knowledgeBase.embeddingModel, @@ -644,7 +643,12 @@ export async function processDocumentAsync( kbConfig.maxSize, kbConfig.overlap, kbConfig.minSize, - ctx.userId, + // Authorize the source file (and run OCR/processing) as the billed + // actor — the uploader when known, else the workspace billed account — + // the same principal embeddings are billed to. Using the KB owner here + // would authorize an attacker-supplied internal fileUrl against the + // owner, letting a KB write-member ingest a file only the owner can read. + billingUserId, ctx.workspaceId, rawConfig?.strategy, rawConfig?.strategyOptions