@@ -11,7 +11,9 @@ import {
1111 useState ,
1212} from 'react'
1313import { createLogger } from '@sim/logger'
14+ import { getErrorMessage } from '@sim/utils/errors'
1415import { generateId } from '@sim/utils/id'
16+ import { backoffWithJitter } from '@sim/utils/retry'
1517import { useParams } from 'next/navigation'
1618import type { Socket } from 'socket.io-client'
1719import { getSocketUrl } from '@/lib/core/utils/urls'
@@ -28,6 +30,9 @@ import { useWorkflowRegistry as useWorkflowRegistryStore } from '@/stores/workfl
2830
2931const logger = createLogger ( 'SocketContext' )
3032
/**
 * Cap on automatic reconnect attempts after an auth failure. Once the cap is
 * reached the provider stops retrying and latches until a manual reload, so a
 * genuine logout stops re-minting tokens.
 */
const MAX_AUTH_RETRY_ATTEMPTS = 10
35+
3136const TAB_SESSION_ID_KEY = 'sim_tab_session_id'
3237
3338function getTabSessionId ( ) : string {
@@ -162,6 +167,8 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
162167 const explicitWorkflowIdRef = useRef < string | null > ( explicitWorkflowId )
163168 const joinControllerRef = useRef ( new SocketJoinController ( ) )
164169 const joinRetryTimeoutRef = useRef < ReturnType < typeof setTimeout > | null > ( null )
170+ const authRetryTimeoutRef = useRef < ReturnType < typeof setTimeout > | null > ( null )
171+ const authRetryAttemptRef = useRef ( 0 )
165172
166173 const params = useParams ( )
167174 const urlWorkflowId = params ?. workflowId as string | undefined
@@ -213,6 +220,48 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
213220 }
214221 } , [ ] )
215222
/**
 * Cancel the pending auth-retry timer, if one is scheduled, and clear the ref
 * that tracks it. Does nothing when no timer is pending.
 */
const clearAuthRetryTimeout = useCallback(() => {
  const pendingTimer = authRetryTimeoutRef.current
  if (pendingTimer === null) return

  clearTimeout(pendingTimer)
  authRetryTimeoutRef.current = null
}, [])
229+
/**
 * Schedule a reconnect after the server denied the handshake (token/auth
 * rejection). Socket.IO does not auto-reconnect after a namespace middleware
 * rejection — the socket is destroyed and `socket.active` is `false` — so this
 * re-runs `connect()` after a backoff delay, which re-invokes the auth callback
 * to mint a fresh token. That covers both a transient mint failure (recovers on
 * a later attempt) and a real 401.
 *
 * Once {@link MAX_AUTH_RETRY_ATTEMPTS} retries have been used, no further retry
 * is scheduled: `authFailed` is latched for a manual reload instead of
 * re-minting forever.
 *
 * @param socketInstance - The socket whose handshake was denied; `connect()` is
 *   called on this same instance when the timer fires.
 */
const scheduleAuthRetry = useCallback(
  (socketInstance: Socket) => {
    // Only one retry timer may be pending at a time.
    clearAuthRetryTimeout()
    const attemptsSoFar = authRetryAttemptRef.current
    const retriesExhausted = attemptsSoFar >= MAX_AUTH_RETRY_ATTEMPTS

    if (!retriesExhausted) {
      setIsReconnecting(true)
      const retryDelayMs = backoffWithJitter(attemptsSoFar + 1, null, {
        baseMs: 1000,
        maxMs: 30000,
      })

      const reconnect = () => {
        authRetryTimeoutRef.current = null
        // The counter advances only when the retry actually fires, not when it is scheduled.
        authRetryAttemptRef.current = attemptsSoFar + 1
        logger.info('Retrying socket connection after denied handshake', {
          attempt: attemptsSoFar,
        })
        socketInstance.connect()
      }

      authRetryTimeoutRef.current = setTimeout(reconnect, retryDelayMs)
      return
    }

    // Retry budget spent: stop reconnecting and surface the auth failure.
    setIsReconnecting(false)
    setAuthFailed(true)
    logger.warn('Socket auth retries exhausted; latching until manual reload', {
      attempts: attemptsSoFar,
    })
  },
  [clearAuthRetryTimeout]
)
264+
216265 const resetVisibleWorkflowState = useCallback ( ( workflowId ?: string | null ) => {
217266 if ( workflowId ) {
218267 useOperationQueueStore . getState ( ) . cancelOperationsForWorkflow ( workflowId )
@@ -326,11 +375,6 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
326375 useEffect ( ( ) => {
327376 if ( ! user ?. id ) return
328377
329- if ( authFailed ) {
330- logger . info ( 'Socket initialization skipped - auth failed, waiting for retry' )
331- return
332- }
333-
334378 if ( initializedRef . current || socket || isConnecting ) {
335379 logger . info ( 'Socket already exists or is connecting, skipping initialization' )
336380 return
@@ -360,22 +404,23 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
360404 timeout : 10000 ,
361405 auth : async ( cb ) => {
362406 try {
363- const freshToken = await generateSocketToken ( )
364- cb ( { token : freshToken } )
407+ cb ( { token : await generateSocketToken ( ) } )
365408 } catch ( error ) {
366- logger . error ( 'Failed to generate fresh token for connection:' , error )
367- if ( error instanceof Error && error . message === 'Authentication required' ) {
368- // True auth failure - pass null token, server will reject with "Authentication required"
369- cb ( { token : null } )
370- }
371- // For server errors, don't call cb - connection will timeout and Socket.IO will retry
409+ logger . warn ( 'Failed to mint socket token; handshake will be denied and retried' , {
410+ error : getErrorMessage ( error ) ,
411+ } )
412+ cb ( { token : null } )
372413 }
373414 } ,
374415 } )
375416
376417 socketInstance . on ( 'connect' , ( ) => {
377418 setIsConnected ( true )
378419 setIsConnecting ( false )
420+ setIsReconnecting ( false )
421+ setAuthFailed ( false )
422+ authRetryAttemptRef . current = 0
423+ clearAuthRetryTimeout ( )
379424 setCurrentSocketId ( socketInstance . id ?? null )
380425 logger . info ( 'Socket connected successfully' , {
381426 socketId : socketInstance . id ,
@@ -406,24 +451,10 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
406451 setIsConnecting ( false )
407452 logger . error ( 'Socket connection error:' , { message : error . message } )
408453
409- // Check if this is an authentication failure
410- const isAuthError =
411- error . message ?. includes ( 'Token validation failed' ) ||
412- error . message ?. includes ( 'Authentication failed' ) ||
413- error . message ?. includes ( 'Authentication required' )
414-
415- if ( isAuthError ) {
416- logger . warn (
417- 'Authentication failed - stopping reconnection attempts. User may need to refresh/re-login.'
418- )
419- socketInstance . disconnect ( )
420- setSocket ( null )
421- setAuthFailed ( true )
422- setIsReconnecting ( false )
423- initializedRef . current = false
424- } else if ( socketInstance . active ) {
425- // Temporary failure, will auto-reconnect
454+ if ( socketInstance . active ) {
426455 setIsReconnecting ( true )
456+ } else {
457+ scheduleAuthRetry ( socketInstance )
427458 }
428459 } )
429460
@@ -722,6 +753,7 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
722753
723754 return ( ) => {
724755 clearJoinRetryTimeout ( )
756+ clearAuthRetryTimeout ( )
725757 positionUpdateTimeouts . current . forEach ( ( timeoutId ) => {
726758 clearTimeout ( timeoutId )
727759 } )
@@ -735,7 +767,7 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
735767 socketRef . current = null
736768 }
737769 }
738- } , [ user ?. id , authFailed ] )
770+ } , [ user ?. id ] )
739771
740772 const hydrationPhase = useWorkflowRegistryStore ( ( s ) => s . hydration . phase )
741773
@@ -770,19 +802,21 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
770802 } , [ ] )
771803
/**
 * Manual retry for when automatic auth retries were exhausted and `authFailed`
 * latched. Cancels any pending retry timer, resets the backoff counter, clears
 * the `authFailed` flag, and reconnects the existing socket, which re-runs the
 * auth callback to mint a fresh token (e.g. after re-authenticating).
 *
 * Calls made while `authFailed` is not set are logged and ignored.
 */
const retryConnection = useCallback(() => {
  if (authFailed) {
    logger.info('Retrying socket connection after auth failure')
    clearAuthRetryTimeout()
    authRetryAttemptRef.current = 0
    setAuthFailed(false)
    // No-op when there is no current socket instance.
    socketRef.current?.connect()
    return
  }

  logger.info('retryConnection called but no auth failure - ignoring')
}, [authFailed, clearAuthRetryTimeout])
786820
787821 const emitWorkflowOperation = useCallback (
788822 ( workflowId : string , operation : string , target : string , payload : any , operationId ?: string ) => {
0 commit comments