Skip to content

Commit b27dca7

Browse files
committed
fix(socket): recover from denied handshake to clear stuck "Reconnecting"
Accepting an invite switches the active org and immediately redirects into the workspace, so the socket bootstraps under a just-rotated session. A transient token-mint failure during that window left the realtime socket stuck showing "Reconnecting..." until a manual page reload. Follow the documented socket.io pattern and branch the connect_error handler on socket.active. When socket.active is true, socket.io will reconnect on its own, so we only show the reconnecting state. When socket.active is false, the server denied the handshake (e.g. the auth middleware rejected a null or expired token); socket.io destroys the socket and does not auto-reconnect, so we now retry socket.connect() with capped exponential backoff, which re-runs the auth callback to mint a fresh token. This recovers from a transient failure on the next attempt and limits a genuine logout to MAX_AUTH_RETRY_ATTEMPTS retries before latching authFailed for a manual reload. The connect handler now clears isReconnecting and authFailed and resets the retry counter, so a healthy socket never shows "Reconnecting...". Replaces the prior error-message sniffing and socket teardown/rebuild.
1 parent 05408fd commit b27dca7

1 file changed

Lines changed: 70 additions & 36 deletions

File tree

apps/sim/app/workspace/providers/socket-provider.tsx

Lines changed: 70 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,9 @@ import {
1111
useState,
1212
} from 'react'
1313
import { createLogger } from '@sim/logger'
14+
import { getErrorMessage } from '@sim/utils/errors'
1415
import { generateId } from '@sim/utils/id'
16+
import { backoffWithJitter } from '@sim/utils/retry'
1517
import { useParams } from 'next/navigation'
1618
import type { Socket } from 'socket.io-client'
1719
import { getSocketUrl } from '@/lib/core/utils/urls'
@@ -28,6 +30,9 @@ import { useWorkflowRegistry as useWorkflowRegistryStore } from '@/stores/workfl
2830

2931
const logger = createLogger('SocketContext')
3032

33+
/** Cap on auto-retries after an auth failure before latching for a manual reload, so a genuine logout stops re-minting tokens. */
34+
const MAX_AUTH_RETRY_ATTEMPTS = 10
35+
3136
const TAB_SESSION_ID_KEY = 'sim_tab_session_id'
3237

3338
function getTabSessionId(): string {
@@ -162,6 +167,8 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
162167
const explicitWorkflowIdRef = useRef<string | null>(explicitWorkflowId)
163168
const joinControllerRef = useRef(new SocketJoinController())
164169
const joinRetryTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null)
170+
const authRetryTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null)
171+
const authRetryAttemptRef = useRef(0)
165172

166173
const params = useParams()
167174
const urlWorkflowId = params?.workflowId as string | undefined
@@ -213,6 +220,48 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
213220
}
214221
}, [])
215222

223+
const clearAuthRetryTimeout = useCallback(() => {
224+
if (authRetryTimeoutRef.current !== null) {
225+
clearTimeout(authRetryTimeoutRef.current)
226+
authRetryTimeoutRef.current = null
227+
}
228+
}, [])
229+
230+
/**
231+
* Recover from a server-denied handshake (token/auth rejection). Socket.IO does
232+
* not auto-reconnect after a namespace middleware rejection — the socket is
233+
* destroyed and `socket.active` is `false` — so we re-run `connect()` with
234+
* backoff, which re-invokes the auth callback to mint a fresh token. This covers
235+
* both a transient mint failure (recovers on the next attempt) and a real 401;
236+
* after {@link MAX_AUTH_RETRY_ATTEMPTS} we latch `authFailed` for a manual reload
237+
* instead of re-minting forever.
238+
*/
239+
const scheduleAuthRetry = useCallback(
240+
(socketInstance: Socket) => {
241+
clearAuthRetryTimeout()
242+
const attempt = authRetryAttemptRef.current
243+
244+
if (attempt >= MAX_AUTH_RETRY_ATTEMPTS) {
245+
setIsReconnecting(false)
246+
setAuthFailed(true)
247+
logger.warn('Socket auth retries exhausted; latching until manual reload', {
248+
attempts: attempt,
249+
})
250+
return
251+
}
252+
253+
setIsReconnecting(true)
254+
const delay = backoffWithJitter(attempt + 1, null, { baseMs: 1000, maxMs: 30000 })
255+
authRetryTimeoutRef.current = setTimeout(() => {
256+
authRetryTimeoutRef.current = null
257+
authRetryAttemptRef.current = attempt + 1
258+
logger.info('Retrying socket connection after denied handshake', { attempt })
259+
socketInstance.connect()
260+
}, delay)
261+
},
262+
[clearAuthRetryTimeout]
263+
)
264+
216265
const resetVisibleWorkflowState = useCallback((workflowId?: string | null) => {
217266
if (workflowId) {
218267
useOperationQueueStore.getState().cancelOperationsForWorkflow(workflowId)
@@ -326,11 +375,6 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
326375
useEffect(() => {
327376
if (!user?.id) return
328377

329-
if (authFailed) {
330-
logger.info('Socket initialization skipped - auth failed, waiting for retry')
331-
return
332-
}
333-
334378
if (initializedRef.current || socket || isConnecting) {
335379
logger.info('Socket already exists or is connecting, skipping initialization')
336380
return
@@ -360,22 +404,23 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
360404
timeout: 10000,
361405
auth: async (cb) => {
362406
try {
363-
const freshToken = await generateSocketToken()
364-
cb({ token: freshToken })
407+
cb({ token: await generateSocketToken() })
365408
} catch (error) {
366-
logger.error('Failed to generate fresh token for connection:', error)
367-
if (error instanceof Error && error.message === 'Authentication required') {
368-
// True auth failure - pass null token, server will reject with "Authentication required"
369-
cb({ token: null })
370-
}
371-
// For server errors, don't call cb - connection will timeout and Socket.IO will retry
409+
logger.warn('Failed to mint socket token; handshake will be denied and retried', {
410+
error: getErrorMessage(error),
411+
})
412+
cb({ token: null })
372413
}
373414
},
374415
})
375416

376417
socketInstance.on('connect', () => {
377418
setIsConnected(true)
378419
setIsConnecting(false)
420+
setIsReconnecting(false)
421+
setAuthFailed(false)
422+
authRetryAttemptRef.current = 0
423+
clearAuthRetryTimeout()
379424
setCurrentSocketId(socketInstance.id ?? null)
380425
logger.info('Socket connected successfully', {
381426
socketId: socketInstance.id,
@@ -406,24 +451,10 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
406451
setIsConnecting(false)
407452
logger.error('Socket connection error:', { message: error.message })
408453

409-
// Check if this is an authentication failure
410-
const isAuthError =
411-
error.message?.includes('Token validation failed') ||
412-
error.message?.includes('Authentication failed') ||
413-
error.message?.includes('Authentication required')
414-
415-
if (isAuthError) {
416-
logger.warn(
417-
'Authentication failed - stopping reconnection attempts. User may need to refresh/re-login.'
418-
)
419-
socketInstance.disconnect()
420-
setSocket(null)
421-
setAuthFailed(true)
422-
setIsReconnecting(false)
423-
initializedRef.current = false
424-
} else if (socketInstance.active) {
425-
// Temporary failure, will auto-reconnect
454+
if (socketInstance.active) {
426455
setIsReconnecting(true)
456+
} else {
457+
scheduleAuthRetry(socketInstance)
427458
}
428459
})
429460

@@ -722,6 +753,7 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
722753

723754
return () => {
724755
clearJoinRetryTimeout()
756+
clearAuthRetryTimeout()
725757
positionUpdateTimeouts.current.forEach((timeoutId) => {
726758
clearTimeout(timeoutId)
727759
})
@@ -735,7 +767,7 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
735767
socketRef.current = null
736768
}
737769
}
738-
}, [user?.id, authFailed])
770+
}, [user?.id])
739771

740772
const hydrationPhase = useWorkflowRegistryStore((s) => s.hydration.phase)
741773

@@ -770,19 +802,21 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
770802
}, [])
771803

772804
/**
773-
* Retry socket connection after auth failure.
774-
* Call this when user has re-authenticated (e.g., after login redirect).
805+
* Manually retry after auth retries were exhausted and `authFailed` latched.
806+
* Resets the backoff counter and reconnects the existing socket, which re-runs
807+
* the auth callback to mint a fresh token (e.g. after re-authenticating).
775808
*/
776809
const retryConnection = useCallback(() => {
777810
if (!authFailed) {
778811
logger.info('retryConnection called but no auth failure - ignoring')
779812
return
780813
}
781814
logger.info('Retrying socket connection after auth failure')
815+
clearAuthRetryTimeout()
816+
authRetryAttemptRef.current = 0
782817
setAuthFailed(false)
783-
// initializedRef.current was already reset in connect_error handler
784-
// Effect will re-run and attempt connection
785-
}, [authFailed])
818+
socketRef.current?.connect()
819+
}, [authFailed, clearAuthRetryTimeout])
786820

787821
const emitWorkflowOperation = useCallback(
788822
(workflowId: string, operation: string, target: string, payload: any, operationId?: string) => {

0 commit comments

Comments
 (0)