Skip to content

Commit 4d2b910

Browse files
committed
feat: mark orphan runners before removing them
1 parent a0f7e5c commit 4d2b910

File tree

6 files changed

+154
-32
lines changed

6 files changed

+154
-32
lines changed

lambdas/functions/control-plane/jest.config.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ const config: Config = {
66
...defaultConfig,
77
coverageThreshold: {
88
global: {
9-
statements: 97.79,
10-
branches: 96.13,
11-
functions: 95.4,
12-
lines: 98.06,
9+
statements: 97.99,
10+
branches: 96.04,
11+
functions: 97.53,
12+
lines: 98.27,
1313
},
1414
},
1515
};

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ export interface RunnerList {
99
type?: string;
1010
repo?: string;
1111
org?: string;
12+
orphan?: boolean;
1213
}
1314

1415
export interface RunnerInfo {

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,23 @@ describe('list instances', () => {
6868
launchTime: new Date('2020-10-10T14:48:00.000+09:00'),
6969
type: 'Org',
7070
owner: 'CoderToCat',
71+
orphan: false,
72+
});
73+
});
74+
75+
it('check orphan tag.', async () => {
76+
const instances: DescribeInstancesResult = mockRunningInstances;
77+
instances.Reservations![0].Instances![0].Tags!.push({ Key: 'ghr:orphan', Value: 'true' });
78+
mockEC2Client.on(DescribeInstancesCommand).resolves(instances);
79+
80+
const resp = await listEC2Runners();
81+
expect(resp.length).toBe(1);
82+
expect(resp).toContainEqual({
83+
instanceId: instances.Reservations![0].Instances![0].InstanceId!,
84+
launchTime: instances.Reservations![0].Instances![0].LaunchTime!,
85+
type: 'Org',
86+
owner: 'CoderToCat',
87+
orphan: true,
7188
});
7289
});
7390

@@ -115,6 +132,23 @@ describe('list instances', () => {
115132
});
116133
});
117134

135+
it('filters instances on environment and orphan', async () => {
136+
mockRunningInstances.Reservations![0].Instances![0].Tags!.push({
137+
Key: 'ghr:orphan',
138+
Value: 'true',
139+
});
140+
mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances);
141+
await listEC2Runners({ environment: ENVIRONMENT, orphan: true });
142+
expect(mockEC2Client).toHaveReceivedCommandWith(DescribeInstancesCommand, {
143+
Filters: [
144+
{ Name: 'instance-state-name', Values: ['running', 'pending'] },
145+
{ Name: 'tag:ghr:environment', Values: [ENVIRONMENT] },
146+
{ Name: 'tag:ghr:orphan', Values: ['true'] },
147+
{ Name: 'tag:ghr:Application', Values: ['github-action-runner'] },
148+
],
149+
});
150+
});
151+
118152
it('No instances, undefined reservations list.', async () => {
119153
const noInstances: DescribeInstancesResult = {
120154
Reservations: undefined,

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) {
9090
type: i.Tags?.find((e) => e.Key === 'ghr:Type')?.Value as string,
9191
repo: i.Tags?.find((e) => e.Key === 'ghr:Repo')?.Value as string,
9292
org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string,
93+
orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true',
9394
});
9495
}
9596
}

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 88 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import nock from 'nock';
55

66
import { RunnerInfo, RunnerList } from '../aws/runners.d';
77
import * as ghAuth from '../gh-auth/gh-auth';
8-
import { listEC2Runners, terminateRunner } from './../aws/runners';
8+
import { listEC2Runners, terminateRunner, tag } from './../aws/runners';
99
import { githubCache } from './cache';
1010
import { newestFirstStrategy, oldestFirstStrategy, scaleDown } from './scale-down';
1111

@@ -42,6 +42,7 @@ const mockedAppAuth = mocked(ghAuth.createGithubAppAuth, { shallow: false });
4242
const mockedInstallationAuth = mocked(ghAuth.createGithubInstallationAuth, { shallow: false });
4343
const mockCreateClient = mocked(ghAuth.createOctoClient, { shallow: false });
4444
const mockListRunners = mocked(listEC2Runners);
45+
const mockTagRunners = mocked(tag);
4546
const mockTerminateRunners = mocked(terminateRunner);
4647

4748
export interface TestData {
@@ -172,7 +173,7 @@ describe('Scale down runners', () => {
172173
describe.each(runnerTypes)('For %s runners.', (type) => {
173174
it('Should not call terminate when no runners online.', async () => {
174175
// setup
175-
mockListRunners.mockResolvedValue([]);
176+
mockAwsRunners([]);
176177

177178
// act
178179
await scaleDown();
@@ -197,7 +198,8 @@ describe('Scale down runners', () => {
197198

198199
mockGitHubRunners(runners);
199200
mockListRunners.mockResolvedValue(runners);
200-
// act
201+
mockAwsRunners(runners);
202+
201203
await scaleDown();
202204

203205
// assert
@@ -220,7 +222,7 @@ describe('Scale down runners', () => {
220222
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES - 1, true, false, false)];
221223

222224
mockGitHubRunners(runners);
223-
mockListRunners.mockResolvedValue(runners);
225+
mockAwsRunners(runners);
224226

225227
// act
226228
await scaleDown();
@@ -235,7 +237,7 @@ describe('Scale down runners', () => {
235237
const runners = [createRunnerTestData('booting-1', type, MINIMUM_BOOT_TIME - 1, false, false, false)];
236238

237239
mockGitHubRunners(runners);
238-
mockListRunners.mockResolvedValue(runners);
240+
mockAwsRunners(runners);
239241

240242
// act
241243
await scaleDown();
@@ -250,7 +252,7 @@ describe('Scale down runners', () => {
250252
const runners = [createRunnerTestData('busy-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)];
251253

252254
mockGitHubRunners(runners);
253-
mockListRunners.mockResolvedValue(runners);
255+
mockAwsRunners(runners);
254256

255257
// act
256258
await scaleDown();
@@ -274,7 +276,7 @@ describe('Scale down runners', () => {
274276
];
275277

276278
mockGitHubRunners(runners);
277-
mockListRunners.mockResolvedValue(runners);
279+
mockAwsRunners(runners);
278280
mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => {
279281
return { status: 500 };
280282
});
@@ -293,10 +295,31 @@ describe('Scale down runners', () => {
293295

294296
it(`Should terminate orphan.`, async () => {
295297
// setup
296-
const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)];
298+
const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, false, false);
299+
const idleRunner = createRunnerTestData('idle-1', type, MINIMUM_BOOT_TIME + 1, true, false, false);
300+
const runners = [orphanRunner, idleRunner];
297301

298-
mockGitHubRunners([]);
299-
mockListRunners.mockResolvedValue(runners);
302+
mockGitHubRunners([idleRunner]);
303+
mockAwsRunners(runners);
304+
305+
// act
306+
await scaleDown();
307+
308+
// assert
309+
checkTerminated(runners);
310+
checkNonTerminated(runners);
311+
312+
expect(mockTagRunners).toHaveBeenCalledWith(orphanRunner.instanceId, [
313+
{
314+
Key: 'orphan',
315+
Value: 'true',
316+
},
317+
]);
318+
expect(mockTagRunners).not.toHaveBeenCalledWith(idleRunner.instanceId, expect.anything());
319+
320+
// next cycle, update test data set orphan to true and terminate should be true
321+
orphanRunner.orphan = true;
322+
orphanRunner.shouldBeTerminated = true;
300323

301324
// act
302325
await scaleDown();
@@ -306,21 +329,60 @@ describe('Scale down runners', () => {
306329
checkNonTerminated(runners);
307330
});
308331

309-
it(`Should orphan termination failure is not resulting in an exception..`, async () => {
332+
it(`Should ignore errors when termination orphan fails.`, async () => {
310333
// setup
311-
const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)];
334+
const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true);
335+
const runners = [orphanRunner];
312336

313337
mockGitHubRunners([]);
314-
mockListRunners.mockResolvedValue(runners);
315-
mockTerminateRunners.mockRejectedValue(new Error('Termination failed'));
338+
mockAwsRunners(runners);
339+
mockTerminateRunners.mockImplementation(() => {
340+
throw new Error('Failed to terminate');
341+
});
316342

317-
// act and ensure no exception is thrown
318-
await expect(scaleDown()).resolves.not.toThrow();
343+
// act
344+
await scaleDown();
345+
346+
// assert
347+
checkTerminated(runners);
348+
checkNonTerminated(runners);
349+
});
350+
351+
describe('When orphan termination fails', () => {
352+
it(`Should not throw in case of list runner exception.`, async () => {
353+
// setup
354+
const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)];
355+
356+
mockGitHubRunners([]);
357+
mockListRunners.mockRejectedValueOnce(new Error('Failed to list runners'));
358+
mockAwsRunners(runners);
359+
360+
// ac
361+
await scaleDown();
362+
363+
// assert
364+
checkNonTerminated(runners);
365+
});
366+
367+
it(`Should not throw in case of terminate runner exception.`, async () => {
368+
// setup
369+
const runners = [createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, true, true)];
370+
371+
mockGitHubRunners([]);
372+
mockAwsRunners(runners);
373+
mockTerminateRunners.mockRejectedValue(new Error('Failed to terminate'));
374+
375+
// act and ensure no exception is thrown
376+
await scaleDown();
377+
378+
// assert
379+
checkNonTerminated(runners);
380+
});
319381
});
320382

321383
it(`Should not terminate instance in case de-register fails.`, async () => {
322384
// setup
323-
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, true, false)];
385+
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)];
324386

325387
mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => {
326388
return { status: 500 };
@@ -330,7 +392,7 @@ describe('Scale down runners', () => {
330392
});
331393

332394
mockGitHubRunners(runners);
333-
mockListRunners.mockResolvedValue(runners);
395+
mockAwsRunners(runners);
334396

335397
// act and should resolve
336398
await expect(scaleDown()).resolves.not.toThrow();
@@ -352,7 +414,7 @@ describe('Scale down runners', () => {
352414
});
353415

354416
mockGitHubRunners(runners);
355-
mockListRunners.mockResolvedValue(runners);
417+
mockAwsRunners(runners);
356418

357419
// act
358420
await expect(scaleDown()).resolves.not.toThrow();
@@ -383,7 +445,7 @@ describe('Scale down runners', () => {
383445
];
384446

385447
mockGitHubRunners(runners);
386-
mockListRunners.mockResolvedValue(runners);
448+
mockAwsRunners(runners);
387449

388450
// act
389451
await scaleDown();
@@ -502,6 +564,12 @@ describe('Scale down runners', () => {
502564
});
503565
});
504566

567+
function mockAwsRunners(runners: RunnerTestItem[]) {
568+
mockListRunners.mockImplementation(async (filter) => {
569+
return runners.filter((r) => !filter?.orphan || filter?.orphan === r.orphan);
570+
});
571+
}
572+
505573
function checkNonTerminated(runners: RunnerTestItem[]) {
506574
const notTerminated = runners.filter((r) => !r.shouldBeTerminated);
507575
for (const toTerminate of notTerminated) {

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { createChildLogger } from '@terraform-aws-github-runner/aws-powertools-u
33
import moment from 'moment';
44

55
import { createGithubAppAuth, createGithubInstallationAuth, createOctoClient } from '../gh-auth/gh-auth';
6-
import { bootTimeExceeded, listEC2Runners, terminateRunner } from './../aws/runners';
6+
import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners';
77
import { RunnerInfo, RunnerList } from './../aws/runners.d';
88
import { GhRunners, githubCache } from './cache';
99
import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config';
@@ -184,8 +184,7 @@ async function evaluateAndRemoveRunners(
184184
}
185185
} else {
186186
if (bootTimeExceeded(ec2Runner)) {
187-
logger.info(`Runner '${ec2Runner.instanceId}' is orphaned and will be removed.`);
188-
terminateOrphan(ec2Runner.instanceId);
187+
markOrphan(ec2Runner.instanceId);
189188
} else {
190189
logger.debug(`Runner ${ec2Runner.instanceId} has not yet booted.`);
191190
}
@@ -194,11 +193,26 @@ async function evaluateAndRemoveRunners(
194193
}
195194
}
196195

197-
async function terminateOrphan(instanceId: string): Promise<void> {
196+
async function markOrphan(instanceId: string): Promise<void> {
198197
try {
199-
await terminateRunner(instanceId);
198+
await tag(instanceId, [{ Key: 'orphan', Value: 'true' }]);
199+
logger.info(`Runner '${instanceId}' marked as orphan.`);
200200
} catch (e) {
201-
logger.debug(`Orphan runner '${instanceId}' cannot be removed.`);
201+
logger.warn(`Orphan runner '${instanceId}' cannot be marked.`);
202+
}
203+
}
204+
205+
async function terminateOrphan(environment: string): Promise<void> {
206+
try {
207+
const orphanRunners = await listEC2Runners({ environment, orphan: true });
208+
209+
for (const runner of orphanRunners) {
210+
await terminateRunner(runner.instanceId).catch((e) => {
211+
logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e });
212+
});
213+
}
214+
} catch (e) {
215+
logger.warn(`Failure during orphan runner termination.`, { error: e });
202216
}
203217
}
204218

@@ -221,14 +235,18 @@ async function listRunners(environment: string) {
221235
}
222236

223237
function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] {
224-
return ec2runners.filter((ec2Runner) => ec2Runner.type) as RunnerInfo[];
238+
return ec2runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan) as RunnerInfo[];
225239
}
226240

227241
export async function scaleDown(): Promise<void> {
228242
githubCache.reset();
229-
const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig];
230243
const environment = process.env.ENVIRONMENT;
244+
const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig];
245+
246+
// first runners marked to be orphan.
247+
await terminateOrphan(environment);
231248

249+
// next scale down idle runners with respect to config and mark potential orphans
232250
const ec2Runners = await listRunners(environment);
233251
const activeEc2RunnersCount = ec2Runners.length;
234252
logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`);

0 commit comments

Comments
 (0)