Skip to content

Commit c171550

Browse files
authored
fix: gracefully handle JIT config failures and terminate unconfigured instance (#4990)
This pull request enhances the robustness and reliability of the GitHub Actions runner scaling logic by improving error handling and retry mechanisms for GitHub API calls. It introduces the `@octokit/plugin-retry` plugin to automatically retry failed API requests, adds detailed logging for retry attempts, and ensures that failures in creating JIT configs for individual runner instances do not halt the entire scaling process. Additionally, new tests are added to verify handling of various API failure scenarios. **GitHub API client improvements:** * Added `@octokit/plugin-retry` to dependencies (`package.json`) and integrated it into the Octokit client initialization to enable automatic retries for failed GitHub API requests. [[1]](diffhunk://#diff-37d09418dae74ded5678eabfa3b60993ee491e2fd9e49e11142f639b078ac9b2R41) [[2]](diffhunk://#diff-cf7cdd79fe0ed0e3a2e8928c0c7667a096c47c47abdb2354ddadee67e80a226dR21) [[3]](diffhunk://#diff-cf7cdd79fe0ed0e3a2e8928c0c7667a096c47c47abdb2354ddadee67e80a226dL29-R30) * Configured the retry plugin to log detailed warnings on each retry attempt, including the HTTP method, URL, error message, and status code. **Error handling and resilience in JIT config creation:** * Updated `createJitConfig` in `scale-up.ts` to catch and log errors for individual runner instances when creating JIT configs, allowing the process to continue for remaining instances and logging a summary of failed attempts at the end. [[1]](diffhunk://#diff-fbc68af2a40bf14ad13a80b13958c0b52d1d0fde5f0009416a693fb4b691ceaeR537-R542) [[2]](diffhunk://#diff-fbc68af2a40bf14ad13a80b13958c0b52d1d0fde5f0009416a693fb4b691ceaeR582-R596) * Instances that failed to generate a configuration, will now be terminated to avoid generating waste. **Testing improvements:** * Added comprehensive tests to `scale-up.test.ts` to verify correct behavior when GitHub API calls fail for some instances, including retryable errors (e.g., 5xx), non-retryable errors (e.g., 4xx), and partial failures, ensuring only successful JIT configs are stored.
1 parent 7755b7f commit c171550

File tree

5 files changed

+302
-45
lines changed

5 files changed

+302
-45
lines changed

lambdas/functions/control-plane/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
"@middy/core": "^6.4.5",
3939
"@octokit/auth-app": "8.2.0",
4040
"@octokit/core": "7.0.6",
41+
"@octokit/plugin-retry": "8.0.3",
4142
"@octokit/plugin-throttling": "11.0.3",
4243
"@octokit/rest": "22.0.1",
4344
"cron-parser": "^5.4.0"

lambdas/functions/control-plane/src/github/auth.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ type StrategyOptions = {
1919
import { createSign, randomUUID } from 'node:crypto';
2020
import { request } from '@octokit/request';
2121
import { Octokit } from '@octokit/rest';
22+
import { retry } from '@octokit/plugin-retry';
2223
import { throttling } from '@octokit/plugin-throttling';
2324
import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
2425
import { getParameter } from '@aws-github-runner/aws-ssm-util';
@@ -27,7 +28,7 @@ import { EndpointDefaults } from '@octokit/types';
2728
const logger = createChildLogger('gh-auth');
2829

2930
export async function createOctokitClient(token: string, ghesApiUrl = ''): Promise<Octokit> {
30-
const CustomOctokit = Octokit.plugin(throttling);
31+
const CustomOctokit = Octokit.plugin(retry, throttling);
3132
const ocktokitOptions: OctokitOptions = {
3233
auth: token,
3334
};
@@ -39,6 +40,17 @@ export async function createOctokitClient(token: string, ghesApiUrl = ''): Promi
3940
return new CustomOctokit({
4041
...ocktokitOptions,
4142
userAgent: process.env.USER_AGENT || 'github-aws-runners',
43+
retry: {
44+
onRetry: (retryCount: number, error: Error, request: { method: string; url: string }) => {
45+
logger.warn('GitHub API request retry attempt', {
46+
retryCount,
47+
method: request.method,
48+
url: request.url,
49+
error: error.message,
50+
status: (error as Error & { status?: number }).status,
51+
});
52+
},
53+
},
4254
throttle: {
4355
onRateLimit: (retryAfter: number, options: Required<EndpointDefaults>) => {
4456
logger.warn(

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,172 @@ describe('scaleUp with GHES', () => {
343343
],
344344
});
345345
});
346+
347+
it('should create JIT config for all remaining instances even when GitHub API fails for one instance', async () => {
348+
process.env.RUNNERS_MAXIMUM_COUNT = '5';
349+
mockCreateRunner.mockImplementation(async () => {
350+
return ['i-instance-1', 'i-instance-2', 'i-instance-3'];
351+
});
352+
mockListRunners.mockImplementation(async () => {
353+
return [];
354+
});
355+
356+
mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(({ name }) => {
357+
if (name === 'unit-test-i-instance-2') {
358+
// Simulate a 503 Service Unavailable error from GitHub
359+
const error = new Error('Service Unavailable') as Error & {
360+
status: number;
361+
response: { status: number; data: { message: string } };
362+
};
363+
error.status = 503;
364+
error.response = {
365+
status: 503,
366+
data: { message: 'Service temporarily unavailable' },
367+
};
368+
throw error;
369+
}
370+
return {
371+
data: {
372+
runner: { id: 9876543210 },
373+
encoded_jit_config: `TEST_JIT_CONFIG_${name}`,
374+
},
375+
headers: {},
376+
};
377+
});
378+
379+
await scaleUpModule.scaleUp(TEST_DATA);
380+
381+
expect(mockOctokit.actions.generateRunnerJitconfigForOrg).toHaveBeenCalledWith({
382+
org: TEST_DATA_SINGLE.repositoryOwner,
383+
name: 'unit-test-i-instance-1',
384+
runner_group_id: 1,
385+
labels: ['label1', 'label2'],
386+
});
387+
388+
expect(mockOctokit.actions.generateRunnerJitconfigForOrg).toHaveBeenCalledWith({
389+
org: TEST_DATA_SINGLE.repositoryOwner,
390+
name: 'unit-test-i-instance-2',
391+
runner_group_id: 1,
392+
labels: ['label1', 'label2'],
393+
});
394+
395+
expect(mockOctokit.actions.generateRunnerJitconfigForOrg).toHaveBeenCalledWith({
396+
org: TEST_DATA_SINGLE.repositoryOwner,
397+
name: 'unit-test-i-instance-3',
398+
runner_group_id: 1,
399+
labels: ['label1', 'label2'],
400+
});
401+
402+
expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, {
403+
Name: '/github-action-runners/default/runners/config/i-instance-1',
404+
Value: 'TEST_JIT_CONFIG_unit-test-i-instance-1',
405+
Type: 'SecureString',
406+
Tags: [{ Key: 'InstanceId', Value: 'i-instance-1' }],
407+
});
408+
409+
expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, {
410+
Name: '/github-action-runners/default/runners/config/i-instance-3',
411+
Value: 'TEST_JIT_CONFIG_unit-test-i-instance-3',
412+
Type: 'SecureString',
413+
Tags: [{ Key: 'InstanceId', Value: 'i-instance-3' }],
414+
});
415+
416+
expect(mockSSMClient).not.toHaveReceivedCommandWith(PutParameterCommand, {
417+
Name: '/github-action-runners/default/runners/config/i-instance-2',
418+
});
419+
});
420+
421+
it('should handle retryable errors with error handling logic', async () => {
422+
process.env.RUNNERS_MAXIMUM_COUNT = '5';
423+
mockCreateRunner.mockImplementation(async () => {
424+
return ['i-instance-1', 'i-instance-2'];
425+
});
426+
mockListRunners.mockImplementation(async () => {
427+
return [];
428+
});
429+
430+
mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(({ name }) => {
431+
if (name === 'unit-test-i-instance-1') {
432+
const error = new Error('Internal Server Error') as Error & {
433+
status: number;
434+
response: { status: number; data: { message: string } };
435+
};
436+
error.status = 500;
437+
error.response = {
438+
status: 500,
439+
data: { message: 'Internal server error' },
440+
};
441+
throw error;
442+
}
443+
return {
444+
data: {
445+
runner: { id: 9876543210 },
446+
encoded_jit_config: `TEST_JIT_CONFIG_${name}`,
447+
},
448+
headers: {},
449+
};
450+
});
451+
452+
await scaleUpModule.scaleUp(TEST_DATA);
453+
454+
expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, {
455+
Name: '/github-action-runners/default/runners/config/i-instance-2',
456+
Value: 'TEST_JIT_CONFIG_unit-test-i-instance-2',
457+
Type: 'SecureString',
458+
Tags: [{ Key: 'InstanceId', Value: 'i-instance-2' }],
459+
});
460+
461+
expect(mockSSMClient).not.toHaveReceivedCommandWith(PutParameterCommand, {
462+
Name: '/github-action-runners/default/runners/config/i-instance-1',
463+
});
464+
});
465+
466+
it('should handle non-retryable 4xx errors gracefully', async () => {
467+
process.env.RUNNERS_MAXIMUM_COUNT = '5';
468+
mockCreateRunner.mockImplementation(async () => {
469+
return ['i-instance-1', 'i-instance-2'];
470+
});
471+
mockListRunners.mockImplementation(async () => {
472+
return [];
473+
});
474+
475+
mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(({ name }) => {
476+
if (name === 'unit-test-i-instance-1') {
477+
// 404 is not retryable - will fail immediately
478+
const error = new Error('Not Found') as Error & {
479+
status: number;
480+
response: { status: number; data: { message: string } };
481+
};
482+
error.status = 404;
483+
error.response = {
484+
status: 404,
485+
data: { message: 'Resource not found' },
486+
};
487+
throw error;
488+
}
489+
return {
490+
data: {
491+
runner: { id: 9876543210 },
492+
encoded_jit_config: `TEST_JIT_CONFIG_${name}`,
493+
},
494+
headers: {},
495+
};
496+
});
497+
498+
await scaleUpModule.scaleUp(TEST_DATA);
499+
500+
expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, {
501+
Name: '/github-action-runners/default/runners/config/i-instance-2',
502+
Value: 'TEST_JIT_CONFIG_unit-test-i-instance-2',
503+
Type: 'SecureString',
504+
Tags: [{ Key: 'InstanceId', Value: 'i-instance-2' }],
505+
});
506+
507+
expect(mockSSMClient).not.toHaveReceivedCommandWith(PutParameterCommand, {
508+
Name: '/github-action-runners/default/runners/config/i-instance-1',
509+
});
510+
});
511+
346512
it.each(RUNNER_TYPES)(
347513
'calls create start runner config of 40' + ' instances (ssm rate limit condition) to test time delay ',
348514
async (type: RunnerType) => {

0 commit comments

Comments
 (0)