Mirror of https://github.com/Abdulazizzn/n8n-enterprise-unlocked.git (synced 2025-12-17 18:12:04 +00:00)
refactor(core): Move queue recovery to scaling service (no-changelog) (#10368)
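In short: the periodic queue-recovery check (marking executions that the database still holds as `new` or `running` but that no longer have a job in the queue as `crashed`) moves out of `ExecutionRecoveryService` and its `init()` / `shutdown()` lifecycle into `ScalingService`. The scaling service now schedules the check from `setupQueue()` when the instance is the leader, and in multi-main setups starts or stops it on `leader-takeover` / `leader-stepdown`. The sketch below is a simplified, self-contained rendering of the self-rescheduling timer pattern used in the hunks that follow. Names mirror the diff, but the numeric defaults are placeholders: the real service reads batch size and interval from the `executions.queueRecovery.*` config and compares in-progress execution IDs from the DB against active/waiting job IDs in the queue.

// Simplified sketch of the self-rescheduling recovery timer added to ScalingService below.
// Names follow the diff; the numeric defaults are placeholders, since the real service
// reads them from the executions.queueRecovery.* config.
type QueueRecoveryContext = {
	timeout?: NodeJS.Timeout;
	batchSize: number;
	waitMs: number;
};

class QueueRecoverySketch {
	private readonly queueRecoveryContext: QueueRecoveryContext = {
		batchSize: 100, // placeholder
		waitMs: 30 * 60 * 1000, // placeholder: 30 min
	};

	/** Schedule the next recovery cycle; each cycle reschedules itself. */
	scheduleQueueRecovery(waitMs = this.queueRecoveryContext.waitMs) {
		this.queueRecoveryContext.timeout = setTimeout(async () => {
			try {
				const nextWaitMs = await this.recoverFromQueue();
				this.scheduleQueueRecovery(nextWaitMs);
			} catch {
				this.scheduleQueueRecovery(); // on failure, retry at the default interval
			}
		}, waitMs);
	}

	/** Called on shutdown and on leader-stepdown. */
	stopQueueRecovery() {
		clearTimeout(this.queueRecoveryContext.timeout);
	}

	/**
	 * Stand-in for the real check: compare in-progress execution IDs stored in the DB
	 * against active/waiting job IDs in the queue, mark the dangling ones as crashed,
	 * and return the wait until the next cycle (halved when a full batch was used,
	 * since more dangling executions may remain).
	 */
	private async recoverFromQueue(): Promise<number> {
		return this.queueRecoveryContext.waitMs;
	}
}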
@@ -30,7 +30,6 @@ import type { IWorkflowExecutionDataProcess } from '@/Interfaces';
 import { ExecutionService } from '@/executions/execution.service';
 import { OwnershipService } from '@/services/ownership.service';
 import { WorkflowRunner } from '@/WorkflowRunner';
-import { ExecutionRecoveryService } from '@/executions/execution-recovery.service';
 import { EventService } from '@/events/event.service';
 
 // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-var-requires
@@ -305,7 +304,6 @@ export class Start extends BaseCommand {
 await this.server.start();
 
 Container.get(PruningService).init();
-Container.get(ExecutionRecoveryService).init();
 
 if (config.getEnv('executions.mode') === 'regular') {
 await this.runEnqueuedExecutions();
@@ -9,8 +9,6 @@ import { createExecution } from '@test-integration/db/executions';
 import * as testDb from '@test-integration/testDb';
 
 import { mock } from 'jest-mock-extended';
-import { OrchestrationService } from '@/services/orchestration.service';
-import config from '@/config';
 import { ExecutionRecoveryService } from '@/executions/execution-recovery.service';
 import { ExecutionRepository } from '@/databases/repositories/execution.repository';
 import { Push } from '@/push';
@@ -28,20 +26,17 @@ describe('ExecutionRecoveryService', () => {
 const instanceSettings = new InstanceSettings();
 
 let executionRecoveryService: ExecutionRecoveryService;
-let orchestrationService: OrchestrationService;
 let executionRepository: ExecutionRepository;
 
 beforeAll(async () => {
 await testDb.init();
 executionRepository = Container.get(ExecutionRepository);
-orchestrationService = Container.get(OrchestrationService);
 
 executionRecoveryService = new ExecutionRecoveryService(
 mock(),
 instanceSettings,
 push,
 executionRepository,
-orchestrationService,
 mock(),
 );
 });
@@ -53,74 +48,12 @@ describe('ExecutionRecoveryService', () => {
 afterEach(async () => {
 jest.restoreAllMocks();
 await testDb.truncate(['Execution', 'ExecutionData', 'Workflow']);
-executionRecoveryService.shutdown();
 });
 
 afterAll(async () => {
 await testDb.terminate();
 });
 
-describe('scheduleQueueRecovery', () => {
-describe('queue mode', () => {
-it('if leader, should schedule queue recovery', () => {
-/**
-* Arrange
-*/
-config.set('executions.mode', 'queue');
-const scheduleSpy = jest.spyOn(executionRecoveryService, 'scheduleQueueRecovery');
-
-/**
-* Act
-*/
-executionRecoveryService.init();
-
-/**
-* Assert
-*/
-expect(scheduleSpy).toHaveBeenCalled();
-});
-
-it('if follower, should do nothing', () => {
-/**
-* Arrange
-*/
-config.set('executions.mode', 'queue');
-instanceSettings.markAsFollower();
-const scheduleSpy = jest.spyOn(executionRecoveryService, 'scheduleQueueRecovery');
-
-/**
-* Act
-*/
-executionRecoveryService.init();
-
-/**
-* Assert
-*/
-expect(scheduleSpy).not.toHaveBeenCalled();
-});
-});
-
-describe('regular mode', () => {
-it('should do nothing', () => {
-/**
-* Arrange
-*/
-config.set('executions.mode', 'regular');
-const scheduleSpy = jest.spyOn(executionRecoveryService, 'scheduleQueueRecovery');
-
-/**
-* Act
-*/
-executionRecoveryService.init();
-
-/**
-* Assert
-*/
-expect(scheduleSpy).not.toHaveBeenCalled();
-});
-});
-});
-
 describe('recoverFromLogs', () => {
 describe('if follower', () => {
 test('should do nothing', async () => {
@@ -1,6 +1,6 @@
-import Container, { Service } from 'typedi';
+import { Service } from 'typedi';
 import { Push } from '@/push';
-import { jsonStringify, sleep } from 'n8n-workflow';
+import { sleep } from 'n8n-workflow';
 import { ExecutionRepository } from '@db/repositories/execution.repository';
 import { getWorkflowHooksMain } from '@/WorkflowExecuteAdditionalData'; // @TODO: Dependency cycle
 import type { DateTime } from 'luxon';
@@ -12,10 +12,6 @@ import { NodeCrashedError } from '@/errors/node-crashed.error';
 import { WorkflowCrashedError } from '@/errors/workflow-crashed.error';
 import { ARTIFICIAL_TASK_DATA } from '@/constants';
 import { Logger } from '@/Logger';
-import config from '@/config';
-import { OnShutdown } from '@/decorators/OnShutdown';
-import type { QueueRecoverySettings } from './execution.types';
-import { OrchestrationService } from '@/services/orchestration.service';
 import { EventService } from '@/events/event.service';
 
 /**
@@ -28,34 +24,9 @@ export class ExecutionRecoveryService {
 private readonly instanceSettings: InstanceSettings,
 private readonly push: Push,
 private readonly executionRepository: ExecutionRepository,
-private readonly orchestrationService: OrchestrationService,
 private readonly eventService: EventService,
 ) {}
 
-/**
-* @important Requires `OrchestrationService` to be initialized on queue mode.
-*/
-init() {
-if (config.getEnv('executions.mode') === 'regular') return;
-
-const { isLeader } = this.instanceSettings;
-if (isLeader) this.scheduleQueueRecovery();
-
-const { isMultiMainSetupEnabled } = this.orchestrationService;
-if (isMultiMainSetupEnabled) {
-this.orchestrationService.multiMainSetup
-.on('leader-takeover', () => this.scheduleQueueRecovery())
-.on('leader-stepdown', () => this.stopQueueRecovery());
-}
-}
-
-private readonly queueRecoverySettings: QueueRecoverySettings = {
-batchSize: config.getEnv('executions.queueRecovery.batchSize'),
-waitMs: config.getEnv('executions.queueRecovery.interval') * 60 * 1000,
-};
-
-private isShuttingDown = false;
-
 /**
 * Recover key properties of a truncated execution using event logs.
 */
@@ -82,89 +53,10 @@ export class ExecutionRecoveryService {
 return amendedExecution;
 }
 
-/**
-* Schedule a cycle to mark dangling executions as crashed in queue mode.
-*/
-scheduleQueueRecovery(waitMs = this.queueRecoverySettings.waitMs) {
-if (!this.shouldScheduleQueueRecovery()) return;
-
-this.queueRecoverySettings.timeout = setTimeout(async () => {
-try {
-const nextWaitMs = await this.recoverFromQueue();
-this.scheduleQueueRecovery(nextWaitMs);
-} catch (error) {
-const msg = this.toErrorMsg(error);
-
-this.logger.error('[Recovery] Failed to recover dangling executions from queue', { msg });
-this.logger.error('[Recovery] Retrying...');
-
-this.scheduleQueueRecovery();
-}
-}, waitMs);
-
-const wait = [this.queueRecoverySettings.waitMs / (60 * 1000), 'min'].join(' ');
-
-this.logger.debug(`[Recovery] Scheduled queue recovery check for next ${wait}`);
-}
-
-stopQueueRecovery() {
-clearTimeout(this.queueRecoverySettings.timeout);
-}
-
-@OnShutdown()
-shutdown() {
-this.isShuttingDown = true;
-this.stopQueueRecovery();
-}
-
 // ----------------------------------
 // private
 // ----------------------------------
 
-/**
-* Mark in-progress executions as `crashed` if stored in DB as `new` or `running`
-* but absent from the queue. Return time until next recovery cycle.
-*/
-private async recoverFromQueue() {
-const { waitMs, batchSize } = this.queueRecoverySettings;
-
-const storedIds = await this.executionRepository.getInProgressExecutionIds(batchSize);
-
-if (storedIds.length === 0) {
-this.logger.debug('[Recovery] Completed queue recovery check, no dangling executions');
-return waitMs;
-}
-
-const { ScalingService } = await import('@/scaling/scaling.service');
-
-const runningJobs = await Container.get(ScalingService).findJobsByStatus(['active', 'waiting']);
-
-const queuedIds = new Set(runningJobs.map((job) => job.data.executionId));
-
-if (queuedIds.size === 0) {
-this.logger.debug('[Recovery] Completed queue recovery check, no dangling executions');
-return waitMs;
-}
-
-const danglingIds = storedIds.filter((id) => !queuedIds.has(id));
-
-if (danglingIds.length === 0) {
-this.logger.debug('[Recovery] Completed queue recovery check, no dangling executions');
-return waitMs;
-}
-
-await this.executionRepository.markAsCrashed(danglingIds);
-
-this.logger.info('[Recovery] Completed queue recovery check, recovered dangling executions', {
-danglingIds,
-});
-
-// if this cycle used up the whole batch size, it is possible for there to be
-// dangling executions outside this check, so speed up next cycle
-
-return storedIds.length >= this.queueRecoverySettings.batchSize ? waitMs / 2 : waitMs;
-}
-
 /**
 * Amend `status`, `stoppedAt`, and (if possible) `data` of an execution using event logs.
 */
@@ -313,18 +205,4 @@ export class ExecutionRecoveryService {
 
 await externalHooks.executeHookFunctions('workflowExecuteAfter', [run]);
 }
-
-private toErrorMsg(error: unknown) {
-return error instanceof Error
-? error.message
-: jsonStringify(error, { replaceCircularRefs: true });
-}
-
-private shouldScheduleQueueRecovery() {
-return (
-config.getEnv('executions.mode') === 'queue' &&
-this.instanceSettings.isLeader &&
-!this.isShuttingDown
-);
-}
 }
@@ -93,23 +93,6 @@ export namespace ExecutionSummaries {
 export type ExecutionSummaryWithScopes = ExecutionSummary & { scopes: Scope[] };
 }
 
-export type QueueRecoverySettings = {
-/**
-* ID of timeout for next scheduled recovery cycle.
-*/
-timeout?: NodeJS.Timeout;
-
-/**
-* Number of in-progress executions to check per cycle.
-*/
-batchSize: number;
-
-/**
-* Time (in milliseconds) to wait before the next cycle.
-*/
-waitMs: number;
-};
-
 export type StopResult = {
 mode: WorkflowExecuteMode;
 startedAt: Date;
@@ -7,6 +7,9 @@ import type { Job, JobData, JobOptions, JobQueue } from '../types';
 import { ApplicationError } from 'n8n-workflow';
 import { mockInstance } from '@test/mocking';
 import { GlobalConfig } from '@n8n/config';
+import { InstanceSettings } from 'n8n-core';
+import type { OrchestrationService } from '@/services/orchestration.service';
+import Container from 'typedi';
 import type { JobProcessor } from '../job-processor';
 
 const queue = mock<JobQueue>({
@@ -34,9 +37,27 @@ describe('ScalingService', () => {
 },
 });
 
+const instanceSettings = Container.get(InstanceSettings);
+const orchestrationService = mock<OrchestrationService>({ isMultiMainSetupEnabled: false });
+const jobProcessor = mock<JobProcessor>();
+let scalingService: ScalingService;
+
 beforeEach(() => {
 jest.clearAllMocks();
 config.set('generic.instanceType', 'main');
+scalingService = new ScalingService(
+mock(),
+mock(),
+jobProcessor,
+globalConfig,
+mock(),
+instanceSettings,
+orchestrationService,
+);
+});
+
+afterEach(() => {
+scalingService.stopQueueRecovery();
 });
 
 describe('setupQueue', () => {
@@ -44,7 +65,6 @@ describe('ScalingService', () => {
 /**
 * Arrange
 */
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
 const { prefix, settings } = globalConfig.queue.bull;
 const Bull = jest.mocked(BullModule.default);
 
@@ -72,7 +92,15 @@ describe('ScalingService', () => {
 * Arrange
 */
 config.set('generic.instanceType', 'worker');
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
+const scalingService = new ScalingService(
+mock(),
+mock(),
+mock(),
+globalConfig,
+mock(),
+instanceSettings,
+orchestrationService,
+);
 await scalingService.setupQueue();
 const concurrency = 5;
 
@@ -91,7 +119,6 @@ describe('ScalingService', () => {
 /**
 * Arrange
 */
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
 await scalingService.setupQueue();
 
 /**
@@ -102,14 +129,13 @@ describe('ScalingService', () => {
 });
 
 describe('stop', () => {
-it('should pause the queue and check for running jobs', async () => {
+it('should pause the queue, check for running jobs, and stop queue recovery', async () => {
 /**
 * Arrange
 */
-const jobProcessor = mock<JobProcessor>();
-const scalingService = new ScalingService(mock(), mock(), jobProcessor, globalConfig);
 await scalingService.setupQueue();
 jobProcessor.getRunningJobIds.mockReturnValue([]);
+const stopQueueRecoverySpy = jest.spyOn(scalingService, 'stopQueueRecovery');
 const getRunningJobsCountSpy = jest.spyOn(scalingService, 'getRunningJobsCount');
 
 /**
@@ -121,6 +147,7 @@ describe('ScalingService', () => {
 * Assert
 */
 expect(queue.pause).toHaveBeenCalledWith(true, true);
+expect(stopQueueRecoverySpy).toHaveBeenCalled();
 expect(getRunningJobsCountSpy).toHaveBeenCalled();
 });
 });
@@ -130,7 +157,6 @@ describe('ScalingService', () => {
 /**
 * Arrange
 */
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
 await scalingService.setupQueue();
 
 /**
@@ -150,7 +176,6 @@ describe('ScalingService', () => {
 /**
 * Arrange
 */
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
 await scalingService.setupQueue();
 queue.add.mockResolvedValue(mock<Job>({ id: '456' }));
 
@@ -173,7 +198,6 @@ describe('ScalingService', () => {
 /**
 * Arrange
 */
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
 await scalingService.setupQueue();
 const jobId = '123';
 queue.getJob.mockResolvedValue(mock<Job>({ id: jobId }));
@@ -196,7 +220,6 @@ describe('ScalingService', () => {
 /**
 * Arrange
 */
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
 await scalingService.setupQueue();
 queue.getJobs.mockResolvedValue([mock<Job>({ id: '123' })]);
 
@@ -217,7 +240,6 @@ describe('ScalingService', () => {
 /**
 * Arrange
 */
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
 await scalingService.setupQueue();
 // @ts-expect-error - Untyped but possible Redis response
 queue.getJobs.mockResolvedValue([mock<Job>(), null]);
@@ -239,7 +261,6 @@ describe('ScalingService', () => {
 /**
 * Arrange
 */
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
 await scalingService.setupQueue();
 const job = mock<Job>({ isActive: jest.fn().mockResolvedValue(true) });
 
@@ -259,7 +280,6 @@ describe('ScalingService', () => {
 /**
 * Arrange
 */
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
 await scalingService.setupQueue();
 const job = mock<Job>({ isActive: jest.fn().mockResolvedValue(false) });
 
@@ -279,7 +299,6 @@ describe('ScalingService', () => {
 /**
 * Arrange
 */
-const scalingService = new ScalingService(mock(), mock(), mock(), globalConfig);
 await scalingService.setupQueue();
 const job = mock<Job>({
 isActive: jest.fn().mockImplementation(() => {
@@ -298,4 +317,42 @@ describe('ScalingService', () => {
 expect(result).toBe(false);
 });
 });
+
+describe('scheduleQueueRecovery', () => {
+it('if leader, should schedule queue recovery', async () => {
+/**
+* Arrange
+*/
+const scheduleSpy = jest.spyOn(scalingService, 'scheduleQueueRecovery');
+instanceSettings.markAsLeader();
+
+/**
+* Act
+*/
+await scalingService.setupQueue();
+
+/**
+* Assert
+*/
+expect(scheduleSpy).toHaveBeenCalled();
+});
+
+it('if follower, should not schedule queue recovery', async () => {
+/**
+* Arrange
+*/
+const scheduleSpy = jest.spyOn(scalingService, 'scheduleQueueRecovery');
+instanceSettings.markAsFollower();
+
+/**
+* Act
+*/
+await scalingService.setupQueue();
+
+/**
+* Assert
+*/
+expect(scheduleSpy).not.toHaveBeenCalled();
+});
+});
 });
@@ -1,16 +1,28 @@
 import Container, { Service } from 'typedi';
-import { ApplicationError, BINARY_ENCODING, sleep } from 'n8n-workflow';
+import { ApplicationError, BINARY_ENCODING, sleep, jsonStringify } from 'n8n-workflow';
 import { ActiveExecutions } from '@/ActiveExecutions';
 import config from '@/config';
 import { Logger } from '@/Logger';
 import { MaxStalledCountError } from '@/errors/max-stalled-count.error';
-import { HIGHEST_SHUTDOWN_PRIORITY } from '@/constants';
+import { HIGHEST_SHUTDOWN_PRIORITY, Time } from '@/constants';
 import { OnShutdown } from '@/decorators/OnShutdown';
 import { JOB_TYPE_NAME, QUEUE_NAME } from './constants';
 import { JobProcessor } from './job-processor';
-import type { JobQueue, Job, JobData, JobOptions, JobMessage, JobStatus, JobId } from './types';
+import type {
+JobQueue,
+Job,
+JobData,
+JobOptions,
+JobMessage,
+JobStatus,
+JobId,
+QueueRecoveryContext,
+} from './types';
 import type { IExecuteResponsePromiseData } from 'n8n-workflow';
 import { GlobalConfig } from '@n8n/config';
+import { ExecutionRepository } from '@/databases/repositories/execution.repository';
+import { InstanceSettings } from 'n8n-core';
+import { OrchestrationService } from '@/services/orchestration.service';
 
 @Service()
 export class ScalingService {
@@ -23,6 +35,9 @@ export class ScalingService {
 private readonly activeExecutions: ActiveExecutions,
 private readonly jobProcessor: JobProcessor,
 private readonly globalConfig: GlobalConfig,
+private readonly executionRepository: ExecutionRepository,
+private readonly instanceSettings: InstanceSettings,
+private readonly orchestrationService: OrchestrationService,
 ) {}
 
 // #region Lifecycle
@@ -43,6 +58,14 @@ export class ScalingService {
 
 this.registerListeners();
 
+if (this.instanceSettings.isLeader) this.scheduleQueueRecovery();
+
+if (this.orchestrationService.isMultiMainSetupEnabled) {
+this.orchestrationService.multiMainSetup
+.on('leader-takeover', () => this.scheduleQueueRecovery())
+.on('leader-stepdown', () => this.stopQueueRecovery());
+}
+
 this.logger.debug('[ScalingService] Queue setup completed');
 }
 
@@ -64,6 +87,10 @@ export class ScalingService {
 
 this.logger.debug('[ScalingService] Queue paused');
 
+this.stopQueueRecovery();
+
+this.logger.debug('[ScalingService] Queue recovery stopped');
+
 let count = 0;
 
 while (this.getRunningJobsCount() !== 0) {
@@ -230,4 +257,86 @@ export class ScalingService {
 
 throw new ApplicationError('This method must be called on a `worker` instance');
 }
+
+// #region Queue recovery
+
+private readonly queueRecoveryContext: QueueRecoveryContext = {
+batchSize: config.getEnv('executions.queueRecovery.batchSize'),
+waitMs: config.getEnv('executions.queueRecovery.interval') * 60 * 1000,
+};
+
+scheduleQueueRecovery(waitMs = this.queueRecoveryContext.waitMs) {
+this.queueRecoveryContext.timeout = setTimeout(async () => {
+try {
+const nextWaitMs = await this.recoverFromQueue();
+this.scheduleQueueRecovery(nextWaitMs);
+} catch (error) {
+this.logger.error('[ScalingService] Failed to recover dangling executions from queue', {
+msg: this.toErrorMsg(error),
+});
+this.logger.error('[ScalingService] Retrying...');
+
+this.scheduleQueueRecovery();
+}
+}, waitMs);
+
+const wait = [this.queueRecoveryContext.waitMs / Time.minutes.toMilliseconds, 'min'].join(' ');
+
+this.logger.debug(`[ScalingService] Scheduled queue recovery check for next ${wait}`);
+}
+
+stopQueueRecovery() {
+clearTimeout(this.queueRecoveryContext.timeout);
+}
+
+/**
+* Mark in-progress executions as `crashed` if stored in DB as `new` or `running`
+* but absent from the queue. Return time until next recovery cycle.
+*/
+private async recoverFromQueue() {
+const { waitMs, batchSize } = this.queueRecoveryContext;
+
+const storedIds = await this.executionRepository.getInProgressExecutionIds(batchSize);
+
+if (storedIds.length === 0) {
+this.logger.debug('[ScalingService] Completed queue recovery check, no dangling executions');
+return waitMs;
+}
+
+const runningJobs = await this.findJobsByStatus(['active', 'waiting']);
+
+const queuedIds = new Set(runningJobs.map((job) => job.data.executionId));
+
+if (queuedIds.size === 0) {
+this.logger.debug('[ScalingService] Completed queue recovery check, no dangling executions');
+return waitMs;
+}
+
+const danglingIds = storedIds.filter((id) => !queuedIds.has(id));
+
+if (danglingIds.length === 0) {
+this.logger.debug('[ScalingService] Completed queue recovery check, no dangling executions');
+return waitMs;
+}
+
+await this.executionRepository.markAsCrashed(danglingIds);
+
+this.logger.info(
+'[ScalingService] Completed queue recovery check, recovered dangling executions',
+{ danglingIds },
+);
+
+// if this cycle used up the whole batch size, it is possible for there to be
+// dangling executions outside this check, so speed up next cycle
+
+return storedIds.length >= this.queueRecoveryContext.batchSize ? waitMs / 2 : waitMs;
+}
+
+private toErrorMsg(error: unknown) {
+return error instanceof Error
+? error.message
+: jsonStringify(error, { replaceCircularRefs: true });
+}
+
+// #endregion
 }
@@ -53,3 +53,14 @@ export type RunningJob = {
 };
 
 export type RunningJobSummary = Omit<RunningJob, 'run'>;
+
+export type QueueRecoveryContext = {
+/** ID of timeout for next scheduled recovery cycle. */
+timeout?: NodeJS.Timeout;
+
+/** Number of in-progress executions to check per cycle. */
+batchSize: number;
+
+/** Time (in milliseconds) to wait until the next cycle. */
+waitMs: number;
+};