refactor(core): Move queue recovery to scaling service (no-changelog) (#10368)

This commit is contained in:
Iván Ovejero
2024-08-13 15:06:47 +02:00
committed by GitHub
parent 5ac65b36bc
commit 56c4692c94
7 changed files with 196 additions and 227 deletions

View File

@@ -7,6 +7,9 @@ import type { Job, JobData, JobOptions, JobQueue } from '../types';
import { ApplicationError } from 'n8n-workflow';
import { mockInstance } from '@test/mocking';
import { GlobalConfig } from '@n8n/config';
import { InstanceSettings } from 'n8n-core';
import type { OrchestrationService } from '@/services/orchestration.service';
import Container from 'typedi';
import type { JobProcessor } from '../job-processor';
const queue = mock<JobQueue>({
@@ -34,9 +37,27 @@ describe('ScalingService', () => {
},
});
// Shared fixtures: each test starts with a fresh `ScalingService` on a `main` instance.
const instanceSettings = Container.get(InstanceSettings);
// Multi-main setup is disabled, so `setupQueue` does not subscribe to the
// `leader-takeover` / `leader-stepdown` events.
const orchestrationService = mock<OrchestrationService>({ isMultiMainSetupEnabled: false });
const jobProcessor = mock<JobProcessor>();
let scalingService: ScalingService;
beforeEach(() => {
// Reset call history on all mocks so assertions only see calls made by the current test.
jest.clearAllMocks();
config.set('generic.instanceType', 'main');
// Arguments are positional and must follow the `ScalingService` constructor order.
scalingService = new ScalingService(
// presumably the logger — the first constructor parameter is not visible here, TODO confirm
mock(),
// activeExecutions
mock(),
jobProcessor,
globalConfig,
// executionRepository
mock(),
instanceSettings,
orchestrationService,
);
});
// Cancel any recovery timer scheduled during a test so it does not outlive that test.
afterEach(() => {
scalingService.stopQueueRecovery();
});
describe('setupQueue', () => {
@@ -44,7 +65,6 @@ describe('ScalingService', () => {
/**
* Arrange
*/
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
const { prefix, settings } = globalConfig.queue.bull;
const Bull = jest.mocked(BullModule.default);
@@ -72,7 +92,15 @@ describe('ScalingService', () => {
* Arrange
*/
config.set('generic.instanceType', 'worker');
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
const scalingService = new ScalingService(
mock(),
mock(),
mock(),
globalConfig,
mock(),
instanceSettings,
orchestrationService,
);
await scalingService.setupQueue();
const concurrency = 5;
@@ -91,7 +119,6 @@ describe('ScalingService', () => {
/**
* Arrange
*/
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
await scalingService.setupQueue();
/**
@@ -102,14 +129,13 @@ describe('ScalingService', () => {
});
describe('stop', () => {
it('should pause the queue and check for running jobs', async () => {
it('should pause the queue, check for running jobs, and stop queue recovery', async () => {
/**
* Arrange
*/
const jobProcessor = mock<JobProcessor>();
const scalingService = new ScalingService(mock(), mock(), jobProcessor, globalConfig);
await scalingService.setupQueue();
jobProcessor.getRunningJobIds.mockReturnValue([]);
const stopQueueRecoverySpy = jest.spyOn(scalingService, 'stopQueueRecovery');
const getRunningJobsCountSpy = jest.spyOn(scalingService, 'getRunningJobsCount');
/**
@@ -121,6 +147,7 @@ describe('ScalingService', () => {
* Assert
*/
expect(queue.pause).toHaveBeenCalledWith(true, true);
expect(stopQueueRecoverySpy).toHaveBeenCalled();
expect(getRunningJobsCountSpy).toHaveBeenCalled();
});
});
@@ -130,7 +157,6 @@ describe('ScalingService', () => {
/**
* Arrange
*/
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
await scalingService.setupQueue();
/**
@@ -150,7 +176,6 @@ describe('ScalingService', () => {
/**
* Arrange
*/
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
await scalingService.setupQueue();
queue.add.mockResolvedValue(mock<Job>({ id: '456' }));
@@ -173,7 +198,6 @@ describe('ScalingService', () => {
/**
* Arrange
*/
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
await scalingService.setupQueue();
const jobId = '123';
queue.getJob.mockResolvedValue(mock<Job>({ id: jobId }));
@@ -196,7 +220,6 @@ describe('ScalingService', () => {
/**
* Arrange
*/
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
await scalingService.setupQueue();
queue.getJobs.mockResolvedValue([mock<Job>({ id: '123' })]);
@@ -217,7 +240,6 @@ describe('ScalingService', () => {
/**
* Arrange
*/
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
await scalingService.setupQueue();
// @ts-expect-error - Untyped but possible Redis response
queue.getJobs.mockResolvedValue([mock<Job>(), null]);
@@ -239,7 +261,6 @@ describe('ScalingService', () => {
/**
* Arrange
*/
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
await scalingService.setupQueue();
const job = mock<Job>({ isActive: jest.fn().mockResolvedValue(true) });
@@ -259,7 +280,6 @@ describe('ScalingService', () => {
/**
* Arrange
*/
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
await scalingService.setupQueue();
const job = mock<Job>({ isActive: jest.fn().mockResolvedValue(false) });
@@ -279,7 +299,6 @@ describe('ScalingService', () => {
/**
* Arrange
*/
const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
await scalingService.setupQueue();
const job = mock<Job>({
isActive: jest.fn().mockImplementation(() => {
@@ -298,4 +317,42 @@ describe('ScalingService', () => {
expect(result).toBe(false);
});
});
describe('scheduleQueueRecovery', () => {
	// Both cases exercise `setupQueue`, which decides whether to start the recovery
	// cycle based on the leader status of the instance.
	it('if leader, should schedule queue recovery', async () => {
		// Arrange
		instanceSettings.markAsLeader();
		const recoverySpy = jest.spyOn(scalingService, 'scheduleQueueRecovery');

		// Act
		await scalingService.setupQueue();

		// Assert
		expect(recoverySpy).toHaveBeenCalled();
	});

	it('if follower, should not schedule queue recovery', async () => {
		// Arrange
		instanceSettings.markAsFollower();
		const recoverySpy = jest.spyOn(scalingService, 'scheduleQueueRecovery');

		// Act
		await scalingService.setupQueue();

		// Assert
		expect(recoverySpy).toHaveBeenCalledTimes(0);
	});
});
});

View File

@@ -1,16 +1,28 @@
import Container, { Service } from 'typedi';
import { ApplicationError, BINARY_ENCODING, sleep } from 'n8n-workflow';
import { ApplicationError, BINARY_ENCODING, sleep, jsonStringify } from 'n8n-workflow';
import { ActiveExecutions } from '@/ActiveExecutions';
import config from '@/config';
import { Logger } from '@/Logger';
import { MaxStalledCountError } from '@/errors/max-stalled-count.error';
import { HIGHEST_SHUTDOWN_PRIORITY } from '@/constants';
import { HIGHEST_SHUTDOWN_PRIORITY, Time } from '@/constants';
import { OnShutdown } from '@/decorators/OnShutdown';
import { JOB_TYPE_NAME, QUEUE_NAME } from './constants';
import { JobProcessor } from './job-processor';
import type { JobQueue, Job, JobData, JobOptions, JobMessage, JobStatus, JobId } from './types';
import type {
JobQueue,
Job,
JobData,
JobOptions,
JobMessage,
JobStatus,
JobId,
QueueRecoveryContext,
} from './types';
import type { IExecuteResponsePromiseData } from 'n8n-workflow';
import { GlobalConfig } from '@n8n/config';
import { ExecutionRepository } from '@/databases/repositories/execution.repository';
import { InstanceSettings } from 'n8n-core';
import { OrchestrationService } from '@/services/orchestration.service';
@Service()
export class ScalingService {
@@ -23,6 +35,9 @@ export class ScalingService {
private readonly activeExecutions: ActiveExecutions,
private readonly jobProcessor: JobProcessor,
private readonly globalConfig: GlobalConfig,
private readonly executionRepository: ExecutionRepository,
private readonly instanceSettings: InstanceSettings,
private readonly orchestrationService: OrchestrationService,
) {}
// #region Lifecycle
@@ -43,6 +58,14 @@ export class ScalingService {
this.registerListeners();
if (this.instanceSettings.isLeader) this.scheduleQueueRecovery();
if (this.orchestrationService.isMultiMainSetupEnabled) {
this.orchestrationService.multiMainSetup
.on('leader-takeover', () => this.scheduleQueueRecovery())
.on('leader-stepdown', () => this.stopQueueRecovery());
}
this.logger.debug('[ScalingService] Queue setup completed');
}
@@ -64,6 +87,10 @@ export class ScalingService {
this.logger.debug('[ScalingService] Queue paused');
this.stopQueueRecovery();
this.logger.debug('[ScalingService] Queue recovery stopped');
let count = 0;
while (this.getRunningJobsCount() !== 0) {
@@ -230,4 +257,86 @@ export class ScalingService {
throw new ApplicationError('This method must be called on a `worker` instance');
}
// #region Queue recovery
/**
 * Settings and state of the periodic queue recovery check.
 *
 * `batchSize` and `waitMs` are read once from config when the service is created.
 * The `executions.queueRecovery.interval` setting is expressed in minutes and is
 * converted to milliseconds here.
 */
private readonly queueRecoveryContext: QueueRecoveryContext = {
	batchSize: config.getEnv('executions.queueRecovery.batchSize'),
	// Use the shared time constant (also used when logging the wait) instead of `60 * 1000`.
	waitMs: config.getEnv('executions.queueRecovery.interval') * Time.minutes.toMilliseconds,
};
/**
 * Schedule the next queue recovery check.
 *
 * When the timer fires, `recoverFromQueue` runs and the check is rescheduled with
 * the wait time it returns. If the check throws, the error is logged and the check
 * is rescheduled with the default wait time.
 *
 * @param waitMs Time (in milliseconds) until the check runs. Defaults to the configured wait time.
 */
scheduleQueueRecovery(waitMs = this.queueRecoveryContext.waitMs) {
	// Drop a check that is still pending, so that repeated calls never leave
	// more than one recovery timer alive at the same time.
	clearTimeout(this.queueRecoveryContext.timeout);

	this.queueRecoveryContext.timeout = setTimeout(async () => {
		try {
			const nextWaitMs = await this.recoverFromQueue();
			this.scheduleQueueRecovery(nextWaitMs);
		} catch (error) {
			this.logger.error('[ScalingService] Failed to recover dangling executions from queue', {
				msg: this.toErrorMsg(error),
			});
			this.logger.error('[ScalingService] Retrying...');

			this.scheduleQueueRecovery();
		}
	}, waitMs);

	// Log the delay actually used for this cycle; it can be shorter than the
	// configured one when the previous cycle asked for a faster follow-up.
	const wait = [waitMs / Time.minutes.toMilliseconds, 'min'].join(' ');

	this.logger.debug(`[ScalingService] Scheduled queue recovery check for next ${wait}`);
}
/** Cancel the pending queue recovery check, if one is scheduled. */
stopQueueRecovery() {
	const { timeout: pendingCheck } = this.queueRecoveryContext;

	clearTimeout(pendingCheck);
}
/**
 * Run one queue recovery cycle: executions stored in the DB as `new` or `running`
 * that have no `active` or `waiting` job in the queue are marked as `crashed`.
 *
 * @returns Time (in milliseconds) to wait before the next recovery cycle.
 */
private async recoverFromQueue() {
	const { waitMs, batchSize } = this.queueRecoveryContext;

	// Shared exit for every path that finds nothing to recover.
	const finishWithoutDangling = () => {
		this.logger.debug('[ScalingService] Completed queue recovery check, no dangling executions');
		return waitMs;
	};

	const storedIds = await this.executionRepository.getInProgressExecutionIds(batchSize);
	if (storedIds.length === 0) return finishWithoutDangling();

	const queuedJobs = await this.findJobsByStatus(['active', 'waiting']);
	const queuedIds = new Set(queuedJobs.map(({ data }) => data.executionId));

	// NOTE(review): an empty queue is treated as "nothing to recover" even though
	// in-progress executions were found in the DB — confirm this is intentional.
	if (queuedIds.size === 0) return finishWithoutDangling();

	const danglingIds = storedIds.filter((storedId) => !queuedIds.has(storedId));
	if (danglingIds.length === 0) return finishWithoutDangling();

	await this.executionRepository.markAsCrashed(danglingIds);

	this.logger.info(
		'[ScalingService] Completed queue recovery check, recovered dangling executions',
		{ danglingIds },
	);

	// A full batch means more dangling executions may exist beyond the ones
	// checked in this cycle, so the next cycle is brought forward.
	const isBatchFull = storedIds.length >= batchSize;

	return isBatchFull ? waitMs / 2 : waitMs;
}
/**
 * Turn a caught value into a loggable message: the `message` of an `Error`,
 * otherwise the value serialized with circular references replaced.
 */
private toErrorMsg(error: unknown) {
	if (error instanceof Error) return error.message;

	return jsonStringify(error, { replaceCircularRefs: true });
}
// #endregion
}

View File

@@ -53,3 +53,14 @@ export type RunningJob = {
};
export type RunningJobSummary = Omit<RunningJob, 'run'>;
/** Settings and state shared by the queue recovery cycles. */
export type QueueRecoveryContext = {
	/** How many in-progress executions a single cycle checks. */
	batchSize: number;

	/** Delay (in milliseconds) before the next cycle runs. */
	waitMs: number;

	/** Timeout ID of the next scheduled recovery cycle; unset until one is scheduled. */
	timeout?: NodeJS.Timeout;
};