diff --git a/apps/sim/executor/utils/file-tool-processor.ts b/apps/sim/executor/utils/file-tool-processor.ts index 1fa86ffc731..eabac4b35c8 100644 --- a/apps/sim/executor/utils/file-tool-processor.ts +++ b/apps/sim/executor/utils/file-tool-processor.ts @@ -138,7 +138,7 @@ export class FileToolProcessor { } if (!buffer && data.url) { - buffer = await downloadFileFromUrl(data.url) + buffer = await downloadFileFromUrl(data.url, { userId: context.userId }) } if (buffer) { diff --git a/apps/sim/lib/execution/files.ts b/apps/sim/lib/execution/files.ts index c0b69446fcf..e23582aba18 100644 --- a/apps/sim/lib/execution/files.ts +++ b/apps/sim/lib/execution/files.ts @@ -57,7 +57,7 @@ export async function processExecutionFile( if (file.type === 'url' && file.data) { const { downloadFileFromUrl } = await import('@/lib/uploads/utils/file-utils.server') - const buffer = await downloadFileFromUrl(file.data) + const buffer = await downloadFileFromUrl(file.data, { userId }) if (buffer.length > MAX_FILE_SIZE) { const fileSizeMB = (buffer.length / (1024 * 1024)).toFixed(2) diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index 33ee06eb11b..63f0bb7eae4 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -295,7 +295,7 @@ async function parseDocument( if (isPDF && (hasAzureMistralOCR || hasMistralOCR)) { if (hasAzureMistralOCR) { logger.info(`Using Azure Mistral OCR: ${filename}`) - return parseWithAzureMistralOCR(fileUrl, filename, mimeType) + return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId) } if (hasMistralOCR) { @@ -305,7 +305,7 @@ async function parseDocument( } logger.info(`Using file parser: ${filename}`) - return parseWithFileParser(fileUrl, filename, mimeType) + return parseWithFileParser(fileUrl, filename, mimeType, userId) } async function handleFileForOCR( @@ -321,7 +321,7 @@ async function handleFileForOCR( if (mimeType === 'application/pdf') { logger.info(`handleFileForOCR: Downloading external PDF to check page count`) try { - const buffer = await downloadFileWithTimeout(fileUrl) + const buffer = await downloadFileWithTimeout(fileUrl, userId) logger.info(`handleFileForOCR: Downloaded external PDF: ${buffer.length} bytes`) return { httpsUrl: fileUrl, buffer } } catch (error) { @@ -340,7 +340,7 @@ async function handleFileForOCR( logger.info(`Uploading "${filename}" to cloud storage for OCR`) - const buffer = await downloadFileWithTimeout(fileUrl) + const buffer = await downloadFileWithTimeout(fileUrl, userId) logger.info(`Downloaded ${filename}: ${buffer.length} bytes`) @@ -380,11 +380,11 @@ async function handleFileForOCR( } } -async function downloadFileWithTimeout(fileUrl: string): Promise { - return downloadFileFromUrl(fileUrl, TIMEOUTS.FILE_DOWNLOAD) +async function downloadFileWithTimeout(fileUrl: string, userId?: string): Promise { + return downloadFileFromUrl(fileUrl, { timeoutMs: TIMEOUTS.FILE_DOWNLOAD, userId }) } -async function downloadFileForBase64(fileUrl: string): Promise { +async function downloadFileForBase64(fileUrl: string, userId?: string): Promise { if (/^data:/i.test(fileUrl)) { const [, base64Data] = fileUrl.split(',') if (!base64Data) { @@ -393,7 +393,7 @@ async function downloadFileForBase64(fileUrl: string): Promise { return Buffer.from(base64Data, 'base64') } if (/^https?:\/\//i.test(fileUrl)) { - return downloadFileWithTimeout(fileUrl) + return downloadFileWithTimeout(fileUrl, userId) } throw new Error('Unsupported fileUrl scheme: only data: URIs and http(s):// URLs are allowed') } @@ -468,7 +468,12 @@ async function makeOCRRequest( } } -async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeType: string) { +async function parseWithAzureMistralOCR( + fileUrl: string, + filename: string, + mimeType: string, + userId?: string +) { validateOCRConfig( env.OCR_AZURE_API_KEY, env.OCR_AZURE_ENDPOINT, @@ -476,7 +481,7 @@ async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeT 'Azure Mistral OCR' ) - const fileBuffer = await downloadFileForBase64(fileUrl) + const fileBuffer = await downloadFileForBase64(fileUrl, userId) if (mimeType === 'application/pdf') { const pageCount = await getPdfPageCount(fileBuffer) @@ -485,7 +490,7 @@ async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeT `PDF has ${pageCount} pages, exceeds Azure OCR limit of ${MISTRAL_MAX_PAGES}. ` + `Falling back to file parser.` ) - return parseWithFileParser(fileUrl, filename, mimeType) + return parseWithFileParser(fileUrl, filename, mimeType, userId) } logger.info(`Azure Mistral OCR: PDF page count for ${filename}: ${pageCount}`) } @@ -529,7 +534,7 @@ async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeT }) logger.info(`Falling back to file parser: ${filename}`) - return parseWithFileParser(fileUrl, filename, mimeType) + return parseWithFileParser(fileUrl, filename, mimeType, userId) } } @@ -589,7 +594,7 @@ async function parseWithMistralOCR( }) logger.info(`Falling back to file parser: ${filename}`) - return parseWithFileParser(fileUrl, filename, mimeType) + return parseWithFileParser(fileUrl, filename, mimeType, userId) } } @@ -773,7 +778,12 @@ async function processMistralOCRInBatches( } } -async function parseWithFileParser(fileUrl: string, filename: string, mimeType: string) { +async function parseWithFileParser( + fileUrl: string, + filename: string, + mimeType: string, + userId?: string +) { try { let content: string let metadata: FileParseMetadata = {} @@ -781,7 +791,7 @@ async function parseWithFileParser(fileUrl: string, filename: string, mimeType: if (/^data:/i.test(fileUrl)) { content = await parseDataURI(fileUrl, filename, mimeType) } else if (/^https?:\/\//i.test(fileUrl)) { - const result = await parseHttpFile(fileUrl, filename, mimeType) + const result = await parseHttpFile(fileUrl, filename, mimeType, userId) content = result.content metadata = result.metadata || {} } else { @@ -820,9 +830,10 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string) async function parseHttpFile( fileUrl: string, filename: string, - mimeType?: string + mimeType?: string, + userId?: string ): Promise<{ content: string; metadata?: FileParseMetadata }> { - const buffer = await downloadFileWithTimeout(fileUrl) + const buffer = await downloadFileWithTimeout(fileUrl, userId) const extension = resolveParserExtension(filename, mimeType) const result = await parseBuffer(buffer, extension) diff --git a/apps/sim/lib/knowledge/documents/service.ts b/apps/sim/lib/knowledge/documents/service.ts index 307db69bc7c..bdfe3e5d557 100644 --- a/apps/sim/lib/knowledge/documents/service.ts +++ b/apps/sim/lib/knowledge/documents/service.ts @@ -515,7 +515,6 @@ export async function processDocumentAsync( // KB config + workspace billing + doc tags in one JOIN (was 3 SELECTs). const contextRows = await db .select({ - userId: knowledgeBase.userId, workspaceId: knowledgeBase.workspaceId, chunkingConfig: knowledgeBase.chunkingConfig, embeddingModel: knowledgeBase.embeddingModel, @@ -644,7 +643,12 @@ export async function processDocumentAsync( kbConfig.maxSize, kbConfig.overlap, kbConfig.minSize, - ctx.userId, + // Authorize the source file (and run OCR/processing) as the billed + // actor — the uploader when known, else the workspace billed account — + // the same principal embeddings are billed to. Using the KB owner here + // would authorize an attacker-supplied internal fileUrl against the + // owner, letting a KB write-member ingest a file only the owner can read. + billingUserId, ctx.workspaceId, rawConfig?.strategy, rawConfig?.strategyOptions diff --git a/apps/sim/lib/uploads/utils/file-utils.server.ts b/apps/sim/lib/uploads/utils/file-utils.server.ts index f0fb7a606a6..b5460c8440a 100644 --- a/apps/sim/lib/uploads/utils/file-utils.server.ts +++ b/apps/sim/lib/uploads/utils/file-utils.server.ts @@ -1,6 +1,6 @@ 'use server' -import type { Logger } from '@sim/logger' +import { createLogger, type Logger } from '@sim/logger' import { getErrorMessage } from '@sim/utils/errors' import { getMaxExecutionTimeout } from '@/lib/core/execution-limits' import { @@ -25,6 +25,8 @@ import { import { verifyFileAccess } from '@/app/api/files/authorization' import type { UserFile } from '@/executor/types' +const logger = createLogger('FileUtilsServer') + /** * Result type for file input resolution */ @@ -138,19 +140,62 @@ export async function resolveFileInputToUrl( } /** - * Download a file from a URL (internal or external) - * For internal URLs, uses direct storage access (server-side only) - * For external URLs, validates DNS/SSRF and uses secure fetch with IP pinning + * Options for {@link downloadFileFromUrl}. + */ +export interface DownloadFileFromUrlOptions { + /** Download timeout for external URLs. Defaults to the max execution timeout. */ + timeoutMs?: number + /** Hard cap on the number of bytes read from the source. */ + maxBytes?: number + /** + * Principal the download is performed on behalf of. Required to authorize + * internal (`/api/files/serve/...`) URLs: the resolved storage key is checked + * with {@link verifyFileAccess} before any bytes are read. Without it, internal + * URLs are rejected (fail closed) so a `/api/files/serve/` substring can never + * be treated as implicitly trusted. + */ + userId?: string +} + +/** + * Download a file from a URL (internal or external). + * + * For internal URLs, uses direct storage access (server-side only) after + * authorizing the resolved storage key against `userId`. Context is derived + * from the key via {@link inferContextFromKey}, never from a caller-controlled + * `?context=` query param — trusting the param would let a private key be + * labeled with a world-readable context (e.g. profile-pictures) so + * {@link verifyFileAccess} short-circuits to granted while the private object is + * still read. This mirrors how `/api/files/serve` resolves context. + * + * For external URLs, validates DNS/SSRF and uses secure fetch with IP pinning. */ export async function downloadFileFromUrl( fileUrl: string, - timeoutMs = getMaxExecutionTimeout(), - maxBytes?: number + options: DownloadFileFromUrlOptions = {} ): Promise { - const { parseInternalFileUrl } = await import('./file-utils') + const { timeoutMs = getMaxExecutionTimeout(), maxBytes, userId } = options if (isInternalFileUrl(fileUrl)) { - const { key, context } = parseInternalFileUrl(fileUrl) + if (!userId) { + logger.warn('Internal file download denied: no userId provided', { fileUrl }) + throw new Error('Access denied: internal file URL requires an authenticated user') + } + + const key = extractStorageKey(fileUrl) + if (!key) { + logger.warn('Internal file download denied: could not resolve storage key', { fileUrl }) + throw new Error('Access denied: could not resolve internal file key') + } + + const context = inferContextFromKey(key) + + const hasAccess = await verifyFileAccess(key, userId, undefined, context, false) + if (!hasAccess) { + logger.warn('Internal file download denied: access check failed', { key, context, userId }) + throw new Error('Access denied: file not found or insufficient permissions') + } + const { downloadFile } = await import('@/lib/uploads/core/storage-service') return downloadFile({ key, context, maxBytes }) } diff --git a/apps/sim/lib/uploads/utils/file-utils.test.ts b/apps/sim/lib/uploads/utils/file-utils.test.ts index f5e3c45ebde..63793e0e144 100644 --- a/apps/sim/lib/uploads/utils/file-utils.test.ts +++ b/apps/sim/lib/uploads/utils/file-utils.test.ts @@ -2,7 +2,42 @@ * @vitest-environment node */ import { describe, expect, it } from 'vitest' -import { isAbortError, isNetworkError } from '@/lib/uploads/utils/file-utils' +import { isAbortError, isInternalFileUrl, isNetworkError } from '@/lib/uploads/utils/file-utils' + +describe('isInternalFileUrl', () => { + it('classifies relative serve paths as internal', () => { + expect(isInternalFileUrl('/api/files/serve/kb/123-file.pdf')).toBe(true) + expect(isInternalFileUrl('/api/files/serve/workspace/ws-1/file.txt?context=workspace')).toBe( + true + ) + }) + + it('classifies absolute serve URLs as internal regardless of host', () => { + expect(isInternalFileUrl('https://www.sim.ai/api/files/serve/kb/x.pdf')).toBe(true) + expect(isInternalFileUrl('http://localhost:3000/api/files/serve/blob/kb/x')).toBe(true) + // Host is not used to gate (self-hosted/multi-domain); the storage sink authorizes. + expect(isInternalFileUrl('https://other-host/api/files/serve/workspace/v/x')).toBe(true) + }) + + it('does not match the marker outside the path (query/fragment)', () => { + expect(isInternalFileUrl('https://evil.com/x?next=/api/files/serve/secret')).toBe(false) + expect(isInternalFileUrl('https://evil.com/page#/api/files/serve/secret')).toBe(false) + expect(isInternalFileUrl('https://evil.com/redirect?u=/api/files/serve/kb/x')).toBe(false) + }) + + it('preserves traversal sequences so they survive downstream rejection', () => { + // Must stay internal (not normalized away) so the parse route applies its `..` check. + expect(isInternalFileUrl('https://attacker.com/api/files/serve/../../../etc/passwd')).toBe(true) + expect(isInternalFileUrl('/api/files/serve/../../app.js')).toBe(true) + }) + + it('returns false for non-internal and non-string inputs', () => { + expect(isInternalFileUrl('https://example.com/file.pdf')).toBe(false) + expect(isInternalFileUrl('data:text/plain;base64,abc')).toBe(false) + // @ts-expect-error verifying runtime guard + expect(isInternalFileUrl(undefined)).toBe(false) + }) +}) describe('isAbortError', () => { it('returns true for AbortError-named errors', () => { diff --git a/apps/sim/lib/uploads/utils/file-utils.ts b/apps/sim/lib/uploads/utils/file-utils.ts index d11fdf1b70f..e99d83c0564 100644 --- a/apps/sim/lib/uploads/utils/file-utils.ts +++ b/apps/sim/lib/uploads/utils/file-utils.ts @@ -534,10 +534,33 @@ export function extractStorageKey(filePath: string): string { } /** - * Check if a URL is an internal file serve URL + * Whether a URL targets the internal file-serve endpoint (`/api/files/serve/`). + * + * The marker is matched only in the URL's path component, so it cannot be + * smuggled through a query string or fragment (e.g. + * `https://evil.com/x?next=/api/files/serve/...`) to skip DNS/SSRF validation. + * + * The raw path is inspected without URL normalization on purpose: callers such + * as the files parse route rely on traversal sequences (`..`) surviving this + * check so they are rejected downstream rather than collapsed away. A path-only + * marker still classifies any host as internal (e.g. + * `https://other-host/api/files/serve/`); cross-tenant reads are prevented + * at the storage sink by {@link verifyFileAccess}, not by host matching, which + * would break self-hosted and multi-domain deployments. */ export function isInternalFileUrl(fileUrl: string): boolean { - return fileUrl.includes('/api/files/serve/') + if (typeof fileUrl !== 'string') { + return false + } + + let path = fileUrl + const scheme = /^[a-z][a-z0-9+.-]*:\/\/[^/?#]*/i.exec(path) + if (scheme) { + path = path.slice(scheme[0].length) + } + path = path.split(/[?#]/, 1)[0] + + return path.startsWith('/api/files/serve/') } /**