Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/sim/executor/utils/file-tool-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ export class FileToolProcessor {
}

if (!buffer && data.url) {
buffer = await downloadFileFromUrl(data.url)
buffer = await downloadFileFromUrl(data.url, { userId: context.userId })
}

if (buffer) {
Expand Down
2 changes: 1 addition & 1 deletion apps/sim/lib/execution/files.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ export async function processExecutionFile(

if (file.type === 'url' && file.data) {
const { downloadFileFromUrl } = await import('@/lib/uploads/utils/file-utils.server')
const buffer = await downloadFileFromUrl(file.data)
const buffer = await downloadFileFromUrl(file.data, { userId })

if (buffer.length > MAX_FILE_SIZE) {
const fileSizeMB = (buffer.length / (1024 * 1024)).toFixed(2)
Expand Down
45 changes: 28 additions & 17 deletions apps/sim/lib/knowledge/documents/document-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ async function parseDocument(
if (isPDF && (hasAzureMistralOCR || hasMistralOCR)) {
if (hasAzureMistralOCR) {
logger.info(`Using Azure Mistral OCR: ${filename}`)
return parseWithAzureMistralOCR(fileUrl, filename, mimeType)
return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId)
}

if (hasMistralOCR) {
Expand All @@ -305,7 +305,7 @@ async function parseDocument(
}

logger.info(`Using file parser: ${filename}`)
return parseWithFileParser(fileUrl, filename, mimeType)
return parseWithFileParser(fileUrl, filename, mimeType, userId)
}

async function handleFileForOCR(
Expand All @@ -321,7 +321,7 @@ async function handleFileForOCR(
if (mimeType === 'application/pdf') {
logger.info(`handleFileForOCR: Downloading external PDF to check page count`)
try {
const buffer = await downloadFileWithTimeout(fileUrl)
const buffer = await downloadFileWithTimeout(fileUrl, userId)
logger.info(`handleFileForOCR: Downloaded external PDF: ${buffer.length} bytes`)
return { httpsUrl: fileUrl, buffer }
} catch (error) {
Expand All @@ -340,7 +340,7 @@ async function handleFileForOCR(

logger.info(`Uploading "${filename}" to cloud storage for OCR`)

const buffer = await downloadFileWithTimeout(fileUrl)
const buffer = await downloadFileWithTimeout(fileUrl, userId)

logger.info(`Downloaded ${filename}: ${buffer.length} bytes`)

Expand Down Expand Up @@ -380,11 +380,11 @@ async function handleFileForOCR(
}
}

async function downloadFileWithTimeout(fileUrl: string): Promise<Buffer> {
return downloadFileFromUrl(fileUrl, TIMEOUTS.FILE_DOWNLOAD)
async function downloadFileWithTimeout(fileUrl: string, userId?: string): Promise<Buffer> {
return downloadFileFromUrl(fileUrl, { timeoutMs: TIMEOUTS.FILE_DOWNLOAD, userId })
}

async function downloadFileForBase64(fileUrl: string): Promise<Buffer> {
async function downloadFileForBase64(fileUrl: string, userId?: string): Promise<Buffer> {
if (/^data:/i.test(fileUrl)) {
const [, base64Data] = fileUrl.split(',')
if (!base64Data) {
Expand All @@ -393,7 +393,7 @@ async function downloadFileForBase64(fileUrl: string): Promise<Buffer> {
return Buffer.from(base64Data, 'base64')
}
if (/^https?:\/\//i.test(fileUrl)) {
return downloadFileWithTimeout(fileUrl)
return downloadFileWithTimeout(fileUrl, userId)
}
throw new Error('Unsupported fileUrl scheme: only data: URIs and http(s):// URLs are allowed')
}
Expand Down Expand Up @@ -468,15 +468,20 @@ async function makeOCRRequest(
}
}

async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeType: string) {
async function parseWithAzureMistralOCR(
fileUrl: string,
filename: string,
mimeType: string,
userId?: string
) {
validateOCRConfig(
env.OCR_AZURE_API_KEY,
env.OCR_AZURE_ENDPOINT,
env.OCR_AZURE_MODEL_NAME,
'Azure Mistral OCR'
)

const fileBuffer = await downloadFileForBase64(fileUrl)
const fileBuffer = await downloadFileForBase64(fileUrl, userId)

if (mimeType === 'application/pdf') {
const pageCount = await getPdfPageCount(fileBuffer)
Expand All @@ -485,7 +490,7 @@ async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeT
`PDF has ${pageCount} pages, exceeds Azure OCR limit of ${MISTRAL_MAX_PAGES}. ` +
`Falling back to file parser.`
)
return parseWithFileParser(fileUrl, filename, mimeType)
return parseWithFileParser(fileUrl, filename, mimeType, userId)
}
logger.info(`Azure Mistral OCR: PDF page count for ${filename}: ${pageCount}`)
}
Expand Down Expand Up @@ -529,7 +534,7 @@ async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeT
})

logger.info(`Falling back to file parser: ${filename}`)
return parseWithFileParser(fileUrl, filename, mimeType)
return parseWithFileParser(fileUrl, filename, mimeType, userId)
}
}

Expand Down Expand Up @@ -589,7 +594,7 @@ async function parseWithMistralOCR(
})

logger.info(`Falling back to file parser: ${filename}`)
return parseWithFileParser(fileUrl, filename, mimeType)
return parseWithFileParser(fileUrl, filename, mimeType, userId)
}
}

Expand Down Expand Up @@ -773,15 +778,20 @@ async function processMistralOCRInBatches(
}
}

async function parseWithFileParser(fileUrl: string, filename: string, mimeType: string) {
async function parseWithFileParser(
fileUrl: string,
filename: string,
mimeType: string,
userId?: string
) {
try {
let content: string
let metadata: FileParseMetadata = {}

if (/^data:/i.test(fileUrl)) {
content = await parseDataURI(fileUrl, filename, mimeType)
} else if (/^https?:\/\//i.test(fileUrl)) {
const result = await parseHttpFile(fileUrl, filename, mimeType)
const result = await parseHttpFile(fileUrl, filename, mimeType, userId)
content = result.content
metadata = result.metadata || {}
} else {
Expand Down Expand Up @@ -820,9 +830,10 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string)
async function parseHttpFile(
fileUrl: string,
filename: string,
mimeType?: string
mimeType?: string,
userId?: string
): Promise<{ content: string; metadata?: FileParseMetadata }> {
const buffer = await downloadFileWithTimeout(fileUrl)
const buffer = await downloadFileWithTimeout(fileUrl, userId)

const extension = resolveParserExtension(filename, mimeType)
const result = await parseBuffer(buffer, extension)
Expand Down
8 changes: 6 additions & 2 deletions apps/sim/lib/knowledge/documents/service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,6 @@ export async function processDocumentAsync(
// KB config + workspace billing + doc tags in one JOIN (was 3 SELECTs).
const contextRows = await db
.select({
userId: knowledgeBase.userId,
workspaceId: knowledgeBase.workspaceId,
chunkingConfig: knowledgeBase.chunkingConfig,
embeddingModel: knowledgeBase.embeddingModel,
Expand Down Expand Up @@ -644,7 +643,12 @@ export async function processDocumentAsync(
kbConfig.maxSize,
kbConfig.overlap,
kbConfig.minSize,
ctx.userId,
// Authorize the source file (and run OCR/processing) as the billed
// actor — the uploader when known, else the workspace billed account —
// the same principal embeddings are billed to. Using the KB owner here
// would authorize an attacker-supplied internal fileUrl against the
// owner, letting a KB write-member ingest a file only the owner can read.
billingUserId,
ctx.workspaceId,
rawConfig?.strategy,
rawConfig?.strategyOptions
Expand Down
61 changes: 53 additions & 8 deletions apps/sim/lib/uploads/utils/file-utils.server.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
'use server'

import type { Logger } from '@sim/logger'
import { createLogger, type Logger } from '@sim/logger'
import { getErrorMessage } from '@sim/utils/errors'
import { getMaxExecutionTimeout } from '@/lib/core/execution-limits'
import {
Expand All @@ -25,6 +25,8 @@ import {
import { verifyFileAccess } from '@/app/api/files/authorization'
import type { UserFile } from '@/executor/types'

const logger = createLogger('FileUtilsServer')

/**
* Result type for file input resolution
*/
Expand Down Expand Up @@ -138,19 +140,62 @@ export async function resolveFileInputToUrl(
}

/**
* Download a file from a URL (internal or external)
* For internal URLs, uses direct storage access (server-side only)
* For external URLs, validates DNS/SSRF and uses secure fetch with IP pinning
* Options for {@link downloadFileFromUrl}.
*/
export interface DownloadFileFromUrlOptions {
/** Download timeout for external URLs. Defaults to the max execution timeout. */
timeoutMs?: number
/** Hard cap on the number of bytes read from the source. */
maxBytes?: number
/**
* Principal the download is performed on behalf of. Required to authorize
* internal (`/api/files/serve/...`) URLs: the resolved storage key is checked
* with {@link verifyFileAccess} before any bytes are read. Without it, internal
* URLs are rejected (fail closed) so a `/api/files/serve/` substring can never
* be treated as implicitly trusted.
*/
userId?: string
}

/**
* Download a file from a URL (internal or external).
*
* For internal URLs, uses direct storage access (server-side only) after
* authorizing the resolved storage key against `userId`. Context is derived
* from the key via {@link inferContextFromKey}, never from a caller-controlled
* `?context=` query param — trusting the param would let a private key be
* labeled with a world-readable context (e.g. profile-pictures) so
* {@link verifyFileAccess} short-circuits to granted while the private object is
* still read. This mirrors how `/api/files/serve` resolves context.
*
* For external URLs, validates DNS/SSRF and uses secure fetch with IP pinning.
*/
export async function downloadFileFromUrl(
fileUrl: string,
timeoutMs = getMaxExecutionTimeout(),
maxBytes?: number
options: DownloadFileFromUrlOptions = {}
): Promise<Buffer> {
const { parseInternalFileUrl } = await import('./file-utils')
const { timeoutMs = getMaxExecutionTimeout(), maxBytes, userId } = options

if (isInternalFileUrl(fileUrl)) {
const { key, context } = parseInternalFileUrl(fileUrl)
if (!userId) {
logger.warn('Internal file download denied: no userId provided', { fileUrl })
throw new Error('Access denied: internal file URL requires an authenticated user')
}

const key = extractStorageKey(fileUrl)
if (!key) {
logger.warn('Internal file download denied: could not resolve storage key', { fileUrl })
throw new Error('Access denied: could not resolve internal file key')
}

const context = inferContextFromKey(key)

const hasAccess = await verifyFileAccess(key, userId, undefined, context, false)
Comment thread
waleedlatif1 marked this conversation as resolved.
Comment thread
waleedlatif1 marked this conversation as resolved.
if (!hasAccess) {
logger.warn('Internal file download denied: access check failed', { key, context, userId })
throw new Error('Access denied: file not found or insufficient permissions')
}

const { downloadFile } = await import('@/lib/uploads/core/storage-service')
return downloadFile({ key, context, maxBytes })
Comment thread
waleedlatif1 marked this conversation as resolved.
}
Expand Down
37 changes: 36 additions & 1 deletion apps/sim/lib/uploads/utils/file-utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,42 @@
* @vitest-environment node
*/
import { describe, expect, it } from 'vitest'
import { isAbortError, isNetworkError } from '@/lib/uploads/utils/file-utils'
import { isAbortError, isInternalFileUrl, isNetworkError } from '@/lib/uploads/utils/file-utils'

describe('isInternalFileUrl', () => {
it('classifies relative serve paths as internal', () => {
expect(isInternalFileUrl('/api/files/serve/kb/123-file.pdf')).toBe(true)
expect(isInternalFileUrl('/api/files/serve/workspace/ws-1/file.txt?context=workspace')).toBe(
true
)
})

it('classifies absolute serve URLs as internal regardless of host', () => {
expect(isInternalFileUrl('https://www.sim.ai/api/files/serve/kb/x.pdf')).toBe(true)
expect(isInternalFileUrl('http://localhost:3000/api/files/serve/blob/kb/x')).toBe(true)
// Host is not used to gate (self-hosted/multi-domain); the storage sink authorizes.
expect(isInternalFileUrl('https://other-host/api/files/serve/workspace/v/x')).toBe(true)
})

it('does not match the marker outside the path (query/fragment)', () => {
expect(isInternalFileUrl('https://evil.com/x?next=/api/files/serve/secret')).toBe(false)
expect(isInternalFileUrl('https://evil.com/page#/api/files/serve/secret')).toBe(false)
expect(isInternalFileUrl('https://evil.com/redirect?u=/api/files/serve/kb/x')).toBe(false)
})

it('preserves traversal sequences so they survive downstream rejection', () => {
// Must stay internal (not normalized away) so the parse route applies its `..` check.
expect(isInternalFileUrl('https://attacker.com/api/files/serve/../../../etc/passwd')).toBe(true)
expect(isInternalFileUrl('/api/files/serve/../../app.js')).toBe(true)
})

it('returns false for non-internal and non-string inputs', () => {
expect(isInternalFileUrl('https://example.com/file.pdf')).toBe(false)
expect(isInternalFileUrl('data:text/plain;base64,abc')).toBe(false)
// @ts-expect-error verifying runtime guard
expect(isInternalFileUrl(undefined)).toBe(false)
})
})

describe('isAbortError', () => {
it('returns true for AbortError-named errors', () => {
Expand Down
27 changes: 25 additions & 2 deletions apps/sim/lib/uploads/utils/file-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -534,10 +534,33 @@ export function extractStorageKey(filePath: string): string {
}

/**
* Check if a URL is an internal file serve URL
* Whether a URL targets the internal file-serve endpoint (`/api/files/serve/`).
*
* The marker is matched only in the URL's path component, so it cannot be
* smuggled through a query string or fragment (e.g.
* `https://evil.com/x?next=/api/files/serve/...`) to skip DNS/SSRF validation.
*
* The raw path is inspected without URL normalization on purpose: callers such
* as the files parse route rely on traversal sequences (`..`) surviving this
* check so they are rejected downstream rather than collapsed away. A path-only
* marker still classifies any host as internal (e.g.
* `https://other-host/api/files/serve/<key>`); cross-tenant reads are prevented
* at the storage sink by {@link verifyFileAccess}, not by host matching, which
* would break self-hosted and multi-domain deployments.
*/
export function isInternalFileUrl(fileUrl: string): boolean {
return fileUrl.includes('/api/files/serve/')
if (typeof fileUrl !== 'string') {
return false
}

let path = fileUrl
const scheme = /^[a-z][a-z0-9+.-]*:\/\/[^/?#]*/i.exec(path)
if (scheme) {
path = path.slice(scheme[0].length)
}
path = path.split(/[?#]/, 1)[0]

return path.startsWith('/api/files/serve/')
}

/**
Expand Down
Loading