Skip to content

Commit 13cf392

Browse files
committed
fix(files): never dedup external URL fetches by path filename
External URL fetches in the file parse route were keyed on the path filename, so two distinct URLs whose paths ended in `image.png` (e.g. every Slack clipboard paste) collided in the workspace cache and returned stale bytes from a prior fetch. Extracts the fetch + save flow into `fetchExternalUrlToWorkspace` so the broken dedup pattern can't be reintroduced. The helper always downloads; `uploadWorkspaceFile` handles name collisions on save by suffixing (`image.png` -> `image (1).png` -> ...).
1 parent a14d374 commit 13cf392

5 files changed

Lines changed: 454 additions & 154 deletions

File tree

apps/sim/app/api/files/parse/route.test.ts

Lines changed: 49 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@ const {
3131
mockFsWriteFile,
3232
mockJoin,
3333
actualPath,
34-
mockFileExistsInWorkspace,
35-
mockListWorkspaceFiles,
3634
mockUploadWorkspaceFile,
3735
} = vi.hoisted(() => {
3836
// eslint-disable-next-line @typescript-eslint/no-require-imports
@@ -62,9 +60,19 @@ const {
6260
return actualPath.join(...args)
6361
}),
6462
actualPath,
65-
mockFileExistsInWorkspace: vi.fn().mockResolvedValue(false),
66-
mockListWorkspaceFiles: vi.fn().mockResolvedValue([]),
67-
mockUploadWorkspaceFile: vi.fn().mockResolvedValue({}),
63+
mockUploadWorkspaceFile: vi
64+
.fn()
65+
.mockImplementation(
66+
async (workspaceId: string, _userId: string, _buffer: Buffer, fileName: string) => ({
67+
id: 'wf_test',
68+
name: fileName,
69+
size: 0,
70+
type: 'application/octet-stream',
71+
url: `/api/files/serve/${workspaceId}/${fileName}`,
72+
key: `${workspaceId}/${fileName}`,
73+
context: 'workspace',
74+
})
75+
),
6876
}
6977
})
7078

@@ -110,9 +118,7 @@ vi.mock('@/lib/uploads/contexts/execution', () => ({
110118
uploadExecutionFile: vi.fn(),
111119
}))
112120

113-
vi.mock('@/lib/uploads/contexts/workspace', () => ({
114-
fileExistsInWorkspace: mockFileExistsInWorkspace,
115-
listWorkspaceFiles: mockListWorkspaceFiles,
121+
vi.mock('@/lib/uploads/contexts/workspace/workspace-file-manager', () => ({
116122
uploadWorkspaceFile: mockUploadWorkspaceFile,
117123
}))
118124

@@ -190,9 +196,7 @@ describe('File Parse API Route', () => {
190196
mockFsStat.mockResolvedValue({ isFile: () => true, size: 17 })
191197
mockFsReadFile.mockResolvedValue(Buffer.from('test file content'))
192198
mockIsSupportedFileType.mockReturnValue(true)
193-
mockFileExistsInWorkspace.mockResolvedValue(false)
194-
mockListWorkspaceFiles.mockResolvedValue([])
195-
mockUploadWorkspaceFile.mockResolvedValue({})
199+
mockUploadWorkspaceFile.mockClear()
196200
mockParseFile.mockResolvedValue({
197201
content: 'parsed content',
198202
metadata: { pageCount: 1 },
@@ -384,34 +388,32 @@ describe('File Parse API Route', () => {
384388
)
385389
})
386390

387-
it('should preserve the full download cap when an external URL reuses a workspace file', async () => {
391+
it('should never dedup external URL fetches by path filename — two URLs sharing image.png both download', async () => {
388392
inputValidationMockFns.mockValidateUrlWithDNS.mockResolvedValue({
389393
isValid: true,
390394
resolvedIP: '203.0.113.10',
391395
})
392-
inputValidationMockFns.mockSecureFetchWithPinnedIP.mockResolvedValue(
393-
new Response('file content', {
394-
status: 200,
395-
headers: { 'content-type': 'text/plain' },
396-
})
397-
)
398-
mockFileExistsInWorkspace.mockResolvedValueOnce(false).mockResolvedValueOnce(true)
399-
mockListWorkspaceFiles.mockResolvedValueOnce([
400-
{ name: 'file2.txt', key: 'workspace-file2.txt' },
401-
])
402-
403-
mockParseBuffer
404-
.mockResolvedValueOnce({
405-
content: 'a'.repeat(4 * 1024 * 1024),
406-
metadata: { pageCount: 1 },
407-
})
408-
.mockResolvedValueOnce({
409-
content: 'second file',
410-
metadata: { pageCount: 1 },
411-
})
396+
inputValidationMockFns.mockSecureFetchWithPinnedIP
397+
.mockResolvedValueOnce(
398+
new Response('first image bytes', {
399+
status: 200,
400+
headers: { 'content-type': 'image/png' },
401+
})
402+
)
403+
.mockResolvedValueOnce(
404+
new Response('second image bytes — different content', {
405+
status: 200,
406+
headers: { 'content-type': 'image/png' },
407+
})
408+
)
409+
mockIsSupportedFileType.mockReturnValue(false)
410+
permissionsMockFns.mockGetUserEntityPermissions.mockResolvedValue('write')
412411

413412
const req = createMockRequest('POST', {
414-
filePath: ['https://example.com/file1.txt', 'https://example.com/file2.txt'],
413+
filePath: [
414+
'https://files.slack.com/files-pri/T07-FAAA/download/image.png',
415+
'https://files.slack.com/files-pri/T07-FBBB/download/image.png',
416+
],
415417
workspaceId: 'workspace-id',
416418
})
417419

@@ -420,9 +422,21 @@ describe('File Parse API Route', () => {
420422

421423
expect(response.status).toBe(200)
422424
expect(data.results).toHaveLength(2)
423-
expect(storageServiceMockFns.mockDownloadFile).toHaveBeenCalledWith(
424-
expect.objectContaining({ key: 'workspace-file2.txt', maxBytes: 100 * 1024 * 1024 })
425+
expect(inputValidationMockFns.mockSecureFetchWithPinnedIP).toHaveBeenCalledTimes(2)
426+
expect(inputValidationMockFns.mockSecureFetchWithPinnedIP).toHaveBeenNthCalledWith(
427+
1,
428+
'https://files.slack.com/files-pri/T07-FAAA/download/image.png',
429+
'203.0.113.10',
430+
expect.any(Object)
431+
)
432+
expect(inputValidationMockFns.mockSecureFetchWithPinnedIP).toHaveBeenNthCalledWith(
433+
2,
434+
'https://files.slack.com/files-pri/T07-FBBB/download/image.png',
435+
'203.0.113.10',
436+
expect.any(Object)
425437
)
438+
expect(mockUploadWorkspaceFile).toHaveBeenCalledTimes(2)
439+
expect(storageServiceMockFns.mockDownloadFile).not.toHaveBeenCalled()
426440
})
427441

428442
it('should stop multi-file parsing once the combined parsed output is too large', async () => {

apps/sim/app/api/files/parse/route.ts

Lines changed: 34 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,15 @@ import { type NextRequest, NextResponse } from 'next/server'
1010
import { fileParseContract } from '@/lib/api/contracts/storage-transfer'
1111
import { getValidationErrorMessage, parseRequest } from '@/lib/api/server'
1212
import { checkInternalAuth } from '@/lib/auth/hybrid'
13-
import {
14-
secureFetchWithPinnedIP,
15-
validateUrlWithDNS,
16-
} from '@/lib/core/security/input-validation.server'
1713
import { sanitizeUrlForLog } from '@/lib/core/utils/logging'
18-
import {
19-
assertKnownSizeWithinLimit,
20-
DEFAULT_MAX_ERROR_BODY_BYTES,
21-
isPayloadSizeLimitError,
22-
readResponseTextWithLimit,
23-
readResponseToBufferWithLimit,
24-
} from '@/lib/core/utils/stream-limits'
14+
import { assertKnownSizeWithinLimit, isPayloadSizeLimitError } from '@/lib/core/utils/stream-limits'
2515
import { isSupportedFileType, parseFile } from '@/lib/file-parsers'
2616
import { isUsingCloudStorage, type StorageContext, StorageService } from '@/lib/uploads'
2717
import { uploadExecutionFile } from '@/lib/uploads/contexts/execution'
18+
import {
19+
ExternalUrlValidationError,
20+
fetchExternalUrlToWorkspace,
21+
} from '@/lib/uploads/contexts/workspace'
2822
import { UPLOAD_DIR_SERVER } from '@/lib/uploads/core/setup.server'
2923
import { getFileMetadataByKey } from '@/lib/uploads/server/metadata'
3024
import {
@@ -36,7 +30,6 @@ import {
3630
inferContextFromKey,
3731
isInternalFileUrl,
3832
} from '@/lib/uploads/utils/file-utils'
39-
import { getUserEntityPermissions } from '@/lib/workspaces/permissions/utils'
4033
import { verifyFileAccess } from '@/app/api/files/authorization'
4134
import type { UserFile } from '@/executor/types'
4235
import '@/lib/uploads/core/setup.server'
@@ -453,9 +446,16 @@ function validateFilePath(filePath: string): { isValid: boolean; error?: string
453446
}
454447

455448
/**
456-
* Handle external URL
457-
* If workspaceId is provided, checks if file already exists and saves to workspace if not
458-
* If executionContext is provided, also stores the file in execution storage and returns UserFile
449+
* Handle external URL.
450+
*
451+
* Always fetches the URL fresh — there is no filename-based dedup. Distinct URLs
452+
* commonly share a path tail (e.g. every Slack clipboard paste is `image.png`),
453+
* so keying a cache by filename returns stale bytes. `fetchExternalUrlToWorkspace`
454+
* delegates to `uploadWorkspaceFile`, which suffix-disambiguates collisions on save.
455+
*
456+
* Workspace save is skipped when the URL already points at our execution-files
457+
* bucket (re-uploading our own bytes is wasteful and would generate `image (1).png`
458+
* style aliases for files we already own).
459459
*/
460460
async function handleExternalUrl(
461461
url: string,
@@ -470,23 +470,6 @@ async function handleExternalUrl(
470470
): Promise<ParseResult> {
471471
try {
472472
logger.info('Fetching external URL:', url)
473-
logger.info('WorkspaceId for URL save:', workspaceId)
474-
475-
const urlValidation = await validateUrlWithDNS(url, 'fileUrl')
476-
if (!urlValidation.isValid) {
477-
logger.warn(`Blocked external URL request: ${urlValidation.error}`)
478-
return {
479-
success: false,
480-
error: urlValidation.error || 'Invalid external URL',
481-
filePath: url,
482-
}
483-
}
484-
485-
const urlPath = new URL(url).pathname
486-
const filename = urlPath.split('/').pop() || 'download'
487-
const extension = path.extname(filename).toLowerCase().substring(1)
488-
489-
logger.info(`Extracted filename: ${filename}, workspaceId: ${workspaceId}`)
490473

491474
const {
492475
S3_EXECUTION_FILES_CONFIG,
@@ -511,104 +494,27 @@ async function handleExternalUrl(
511494
isExecutionFile = false
512495
}
513496

514-
// Only apply workspace deduplication if:
515-
// 1. WorkspaceId is provided
516-
// 2. URL is NOT from execution files bucket/container
517-
const shouldCheckWorkspace = workspaceId && !isExecutionFile
518-
519-
if (shouldCheckWorkspace) {
520-
const permission = await getUserEntityPermissions(userId, 'workspace', workspaceId)
521-
if (permission === null) {
522-
logger.warn('User does not have workspace access for file parse', {
523-
userId,
524-
workspaceId,
525-
filename,
526-
})
527-
return {
528-
success: false,
529-
error: 'File not found',
530-
filePath: url,
531-
}
532-
}
533-
534-
const { fileExistsInWorkspace, listWorkspaceFiles } = await import(
535-
'@/lib/uploads/contexts/workspace'
536-
)
537-
const exists = await fileExistsInWorkspace(workspaceId, filename)
538-
539-
if (exists) {
540-
logger.info(`File ${filename} already exists in workspace, using existing file`)
541-
const workspaceFiles = await listWorkspaceFiles(workspaceId)
542-
const existingFile = workspaceFiles.find((f) => f.name === filename)
543-
544-
if (existingFile) {
545-
const storageFilePath = `/api/files/serve/${existingFile.key}`
546-
return handleCloudFile(
547-
storageFilePath,
548-
fileType,
549-
'workspace',
550-
userId,
551-
executionContext,
552-
maxDownloadBytes,
553-
maxParsedOutputBytes
554-
)
555-
}
556-
}
557-
}
558-
559-
const response = await secureFetchWithPinnedIP(url, urlValidation.resolvedIP!, {
560-
timeout: DOWNLOAD_TIMEOUT_MS,
561-
maxResponseBytes: maxDownloadBytes,
562-
signal,
563-
...(headers && Object.keys(headers).length > 0 && { headers }),
564-
})
565-
if (!response.ok) {
566-
await readResponseTextWithLimit(response, {
567-
maxBytes: DEFAULT_MAX_ERROR_BODY_BYTES,
568-
label: 'file download error response',
569-
signal,
570-
}).catch(() => '')
571-
throw new Error(`Failed to fetch URL: ${response.status} ${response.statusText}`)
572-
}
573-
574-
const buffer = await readResponseToBufferWithLimit(response, {
575-
maxBytes: maxDownloadBytes,
576-
label: 'file download',
497+
const { filename, buffer, mimeType } = await fetchExternalUrlToWorkspace({
498+
url,
499+
userId,
500+
workspaceId: workspaceId || undefined,
501+
saveToWorkspace: Boolean(workspaceId) && !isExecutionFile,
502+
headers,
577503
signal,
504+
maxDownloadBytes,
505+
timeoutMs: DOWNLOAD_TIMEOUT_MS,
578506
})
507+
const extension = path.extname(filename).toLowerCase().substring(1)
579508

580509
logger.info(`Downloaded file from URL: ${url}, size: ${buffer.length} bytes`)
581510

582511
let userFile: UserFile | undefined
583-
const mimeType = response.headers.get('content-type') || getMimeTypeFromExtension(extension)
584-
585512
if (executionContext) {
586513
try {
587514
userFile = await uploadExecutionFile(executionContext, buffer, filename, mimeType, userId)
588515
logger.info(`Stored file in execution storage: ${filename}`, { key: userFile.key })
589516
} catch (uploadError) {
590-
logger.warn(`Failed to store file in execution storage:`, uploadError)
591-
// Continue without userFile - parsing can still work
592-
}
593-
}
594-
595-
if (shouldCheckWorkspace) {
596-
try {
597-
const permission = await getUserEntityPermissions(userId, 'workspace', workspaceId)
598-
if (permission !== 'admin' && permission !== 'write') {
599-
logger.warn('User does not have write permission for workspace file save', {
600-
userId,
601-
workspaceId,
602-
filename,
603-
permission,
604-
})
605-
} else {
606-
const { uploadWorkspaceFile } = await import('@/lib/uploads/contexts/workspace')
607-
await uploadWorkspaceFile(workspaceId, userId, buffer, filename, mimeType)
608-
logger.info(`Saved URL file to workspace storage: ${filename}`)
609-
}
610-
} catch (saveError) {
611-
logger.warn(`Failed to save URL file to workspace:`, saveError)
517+
logger.warn('Failed to store file in execution storage:', uploadError)
612518
}
613519
}
614520

@@ -657,6 +563,15 @@ async function handleExternalUrl(
657563
}
658564
}
659565

566+
if (error instanceof ExternalUrlValidationError) {
567+
logger.warn(`Blocked external URL request: ${error.message}`)
568+
return {
569+
success: false,
570+
error: error.message,
571+
filePath: url,
572+
}
573+
}
574+
660575
return {
661576
success: false,
662577
error: `Error fetching URL: ${(error as Error).message}`,

0 commit comments

Comments
 (0)