Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'

import HttpError from '@/errors/http'
import RetriableError, { DEFAULT_DELAY_MS } from '@/errors/retriable-error'
import StepError from '@/errors/step'
import createHttpClient, { type IHttpClient } from '@/helpers/http-client'

import m365ExcelApp from '../..'
import { EXCEL_504_ERROR_CODE } from '../../common/interceptors/request-error-handler'

function mockAxiosAdapterToThrowOnce(
status: AxiosResponse['status'],
Expand Down Expand Up @@ -90,6 +92,12 @@ describe('M365 request error handlers', () => {
tenantKey: 'test-tenant',
},
},
step: {
position: 1,
},
app: {
name: 'M365 Excel',
},
} as unknown as IGlobalVariable
http = createHttpClient({
$,
Expand Down Expand Up @@ -211,6 +219,70 @@ describe('M365 request error handlers', () => {
)
})

it('throws RetriableError with EXCEL_504 errorCode on 504 (live run)', async () => {
  // The shared `$` fixture carries no execution.testRun flag, so this request
  // goes down the live-run branch of the 504 handler.
  mockAxiosAdapterToThrowOnce(504)

  // Settle to the rejection reason; a successful response fails the test.
  const rejection = await http.get('/test-url').then(
    () => expect.unreachable(),
    (reason) => reason,
  )

  expect(rejection).toBeInstanceOf(RetriableError)
  expect(rejection.delayType).toEqual('step')
  expect(rejection.delayInMs).toEqual(DEFAULT_DELAY_MS)
  expect(rejection.errorCode).toEqual(EXCEL_504_ERROR_CODE)
  // The error message must already contain the user-friendly text.
  expect(rejection.message).toContain('Excel request timed out')

  // The handler is expected to log a warning regardless of the run type.
  expect(mocks.logWarning).toHaveBeenCalledWith(
    expect.stringContaining('HTTP 504'),
    expect.objectContaining({ event: 'm365-http-504' }),
  )
})

it('throws StepError immediately on 504 (test run)', async () => {
  // Dedicated globals with execution.testRun switched on, so the handler
  // takes its test-run branch.
  const testRunGlobals = {
    auth: {
      data: {
        tenantKey: 'test-tenant',
      },
    },
    step: {
      position: 1,
    },
    app: {
      name: 'M365 Excel',
    },
    execution: {
      testRun: true,
    },
  } as unknown as IGlobalVariable

  // Separate client bound to the test-run globals; same error handler as the
  // app itself.
  const testRunHttp = createHttpClient({
    $: testRunGlobals,
    baseURL: 'http://localhost/mock-m365-graph-api',
    beforeRequest: [],
    requestErrorHandler: m365ExcelApp.requestErrorHandler,
  })

  mockAxiosAdapterToThrowOnce(504)

  // Settle to the rejection reason; a successful response fails the test.
  const rejection = await testRunHttp.get('/test-url').then(
    () => expect.unreachable(),
    (reason) => reason,
  )

  expect(rejection).toBeInstanceOf(StepError)
  expect(rejection.message).toContain('Excel request timed out')
  expect(rejection.message).toContain('long-running formulas')

  // The warning log is emitted before the run type is inspected.
  expect(mocks.logWarning).toHaveBeenCalledWith(
    expect.stringContaining('HTTP 504'),
    expect.objectContaining({ event: 'm365-http-504' }),
  )
})

it('throws a RetriableError with default step delay on ETIMEDOUT', async () => {
mockAxiosAdapterToThrowNetworkErrorOnce('ETIMEDOUT')
await expect(http.get('/test-url')).rejects.toThrow(RetriableError)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { IApp } from '@plumber/types'

import RetriableError from '@/errors/retriable-error'
import StepError from '@/errors/step'
import logger from '@/helpers/logger'
import { parseRetryAfterToMs } from '@/helpers/parse-retry-after-to-ms'

Expand Down Expand Up @@ -98,6 +99,48 @@ const handle500and502and503: ThrowingHandler = function ($, error) {
})
}

//
// Handle 504 Gateway Timeout - typically caused by long-running formulas.
// Test runs: fail immediately with helpful message.
// Live runs: throw a retriable error tagged with EXCEL_504_ERROR_CODE; once
// retries are exhausted the failure carries the same helpful message.
//

/** Error code attached to the RetriableError thrown for HTTP 504 responses. */
export const EXCEL_504_ERROR_CODE = 'EXCEL_504'
/**
 * Attempt cap intended for errors carrying EXCEL_504_ERROR_CODE.
 * NOTE(review): the original note said "retry up to 3 times"; whether 3 counts
 * total attempts or retries depends on the consumer of this constant - confirm.
 */
export const EXCEL_504_MAX_ATTEMPTS = 3
/**
 * User-facing name and suggested solution shared by both errors thrown for
 * HTTP 504 (the test-run StepError and the live-run RetriableError).
 */
const EXCEL_504_ERROR_MESSAGE = {
name: 'Excel request timed out',
solution:
'Your Excel file most likely has long-running formulas. Please either simplify the formulas or set the calculation options (under the Formulas tab) to manual instead of automatic.',
}

/**
 * Handles an HTTP 504 response from MS Graph. Never returns normally.
 *
 * A warning is always logged first. After that:
 * - when `$.execution.testRun` is truthy, a StepError is thrown so the run
 *   fails right away;
 * - otherwise a RetriableError tagged with EXCEL_504_ERROR_CODE is thrown,
 *   using a step-level delay of the default duration.
 *
 * Both errors carry the same user-facing name and solution text.
 *
 * @param $ - Global variables of the current step execution.
 * @param error - The HTTP error; its `response.config` is read for logging.
 * @throws StepError on test runs.
 * @throws RetriableError on every other run.
 */
const handle504: ThrowingHandler = function ($, error) {
  const logContext = {
    event: 'm365-http-504',
    tenant: $.auth?.data?.tenantKey as string,
    baseUrl: error.response.config.baseURL,
    url: error.response.config.url,
    flowId: $.flow?.id,
    stepId: $.step?.id,
    executionId: $.execution?.id,
  }
  logger.warn('Received HTTP 504 from MS Graph', logContext)

  // Live runs: hand a retriable error to the worker. It already carries the
  // user-friendly message, so nothing needs rewriting once retries run out.
  if (!$.execution?.testRun) {
    throw new RetriableError({
      error: EXCEL_504_ERROR_MESSAGE,
      delayType: 'step',
      delayInMs: 'default',
      errorCode: EXCEL_504_ERROR_CODE,
    })
  }

  // Test runs: no retrying, surface the user-friendly message immediately.
  // NOTE(review): StepError is built from name and solution only - confirm
  // its constructor does not also require the step position / app name.
  throw new StepError(
    EXCEL_504_ERROR_MESSAGE.name,
    EXCEL_504_ERROR_MESSAGE.solution,
  )
}

//
// Handle exceeding bandwidth limit
//
Expand Down Expand Up @@ -131,6 +174,8 @@ const errorHandler: IApp['requestErrorHandler'] = async function ($, error) {
case 502: // Bad gateway
case 503: // Transient error
return handle500and502and503($, error)
case 504: // Gateway timeout - likely due to long-running formulas
return handle504($, error)
case 509: // Bandwidth limit reached
return handle509($, error)
default:
Expand Down
11 changes: 10 additions & 1 deletion packages/backend/src/errors/retriable-error.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ interface RetriableErrorParams {
error: ConstructorParameters<typeof BaseError>[0]
delayInMs: number | 'default'
delayType: 'step' | 'group' | 'queue'
/** Optional error code for custom retry behavior (e.g., different max attempts) */
errorCode?: string
}

/**
Expand Down Expand Up @@ -35,11 +37,18 @@ interface RetriableErrorParams {
/**
 * Error signalling that the failed operation may be attempted again after a
 * delay.
 *
 * - `delayInMs`: how long to wait, in milliseconds. Passing `'default'` to the
 *   constructor stores DEFAULT_DELAY_MS.
 * - `delayType`: one of `'step'`, `'group'` or `'queue'`.
 * - `errorCode`: optional tag that lets consumers apply custom retry behavior
 *   (e.g. a different max attempts); left undefined when not supplied.
 */
export default class RetriableError extends BaseError {
  delayInMs: number
  delayType: RetriableErrorParams['delayType']
  errorCode?: string

  constructor({
    error,
    delayInMs,
    delayType,
    errorCode,
  }: RetriableErrorParams) {
    super(error)

    // Normalise the 'default' sentinel into a concrete number of milliseconds.
    this.delayInMs = delayInMs === 'default' ? DEFAULT_DELAY_MS : delayInMs
    this.delayType = delayType
    this.errorCode = errorCode
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,25 @@ import { UnrecoverableError } from '@taskforcesh/bullmq-pro'
import { type Span } from 'dd-trace'
import get from 'lodash.get'

import {
EXCEL_504_ERROR_CODE,
EXCEL_504_MAX_ATTEMPTS,
} from '@/apps/m365-excel/common/interceptors/request-error-handler'
import HttpError from '@/errors/http'
import RetriableError from '@/errors/retriable-error'
import StepError from '@/errors/step'
import ExecutionStep from '@/models/execution-step'

import { MAXIMUM_JOB_ATTEMPTS } from '../default-job-configuration'
import { parseRetryAfterToMs } from '../parse-retry-after-to-ms'

/**
 * Map of error codes to their custom max attempts.
 *
 * Declared after all imports so that no import statement follows executable
 * code. Error codes without an entry here are expected to use
 * MAXIMUM_JOB_ATTEMPTS instead.
 *
 * When max attempts is reached, the job fails with the error message
 * that was already set in errorDetails (from RetriableError).
 */
const ERROR_CODE_MAX_ATTEMPTS: Record<string, number> = {
  [EXCEL_504_ERROR_CODE]: EXCEL_504_MAX_ATTEMPTS,
}

/**
Expand All @@ -33,16 +46,17 @@ function handleRetriableError(
executionError: RetriableError,
context: HandleFailedStepAndThrowParams['context'],
): never {
const { delayType, delayInMs } = executionError
const { delayType, delayInMs, errorCode } = executionError
const { worker, job, isQueueDelayable } = context

// Use custom max attempts if errorCode has one configured
const maxAttempts = errorCode
? ERROR_CODE_MAX_ATTEMPTS[errorCode] ?? MAXIMUM_JOB_ATTEMPTS
: MAXIMUM_JOB_ATTEMPTS

switch (delayType) {
case 'queue':
checkIfAttemptsExhausted(
job.attemptsStarted,
MAXIMUM_JOB_ATTEMPTS,
executionError,
)
checkIfAttemptsExhausted(job.attemptsStarted, maxAttempts, executionError)
if (isQueueDelayable) {
worker.rateLimit(delayInMs)
throw WorkerPro.RateLimitError()
Expand All @@ -56,11 +70,7 @@ function handleRetriableError(
// off a small alert.
throw executionError
case 'group': {
checkIfAttemptsExhausted(
job.attemptsStarted,
MAXIMUM_JOB_ATTEMPTS,
executionError,
)
checkIfAttemptsExhausted(job.attemptsStarted, maxAttempts, executionError)
const groupId = job.opts?.group?.id
if (groupId) {
worker.rateLimitGroup(job, delayInMs)
Expand All @@ -72,7 +82,9 @@ function handleRetriableError(
throw executionError
}
case 'step':
// Finally, OK to pass this through to our worker's retry handler
// Check if max attempts reached for custom error codes
checkIfAttemptsExhausted(job.attemptsStarted, maxAttempts, executionError)
// OK to pass this through to our worker's retry handler
throw executionError
}
}
Expand Down Expand Up @@ -168,12 +180,21 @@ export function handleFailedStepAndThrow(
throw new UnrecoverableError(JSON.stringify(errorDetails))
} catch (finalError) {
// Update span and execution status as necessary.
// Use custom max attempts if the error has an errorCode configured
const errorCode =
executionError instanceof RetriableError
? executionError.errorCode
: undefined
const effectiveMaxAttempts = errorCode
? ERROR_CODE_MAX_ATTEMPTS[errorCode] ?? MAXIMUM_JOB_ATTEMPTS
: MAXIMUM_JOB_ATTEMPTS

const isRetriable =
!(finalError instanceof UnrecoverableError) &&
// -1 is needed because BullMQ only increments attemptsMade _after_ the
// job processor finishes, but we're currently still inside the job
// processor.
job.attemptsMade < MAXIMUM_JOB_ATTEMPTS - 1
job.attemptsMade < effectiveMaxAttempts - 1

span?.addTags({
willRetry: isRetriable ? 'true' : 'false',
Expand Down
Loading