refactor(core): Revamp logs for scaling mode (#11244)

This commit is contained in:
Iván Ovejero
2024-10-14 15:15:42 +02:00
committed by GitHub
parent 3d97f02a8d
commit 873851b54e
16 changed files with 230 additions and 97 deletions

View File

@@ -6,6 +6,7 @@ import {
sleep,
jsonStringify,
ErrorReporterProxy,
ensureError,
} from 'n8n-workflow';
import type { IExecuteResponsePromiseData } from 'n8n-workflow';
import { strict } from 'node:assert';
@@ -20,6 +21,7 @@ import { MaxStalledCountError } from '@/errors/max-stalled-count.error';
import { EventService } from '@/events/event.service';
import { Logger } from '@/logging/logger.service';
import { OrchestrationService } from '@/services/orchestration.service';
import { assertNever } from '@/utils';
import { JOB_TYPE_NAME, QUEUE_NAME } from './constants';
import { JobProcessor } from './job-processor';
@@ -31,7 +33,8 @@ import type {
JobStatus,
JobId,
QueueRecoveryContext,
JobReport,
JobMessage,
JobFailedMessage,
} from './scaling.types';
@Service()
@@ -89,34 +92,46 @@ export class ScalingService {
void this.queue.process(JOB_TYPE_NAME, concurrency, async (job: Job) => {
try {
await this.jobProcessor.processJob(job);
} catch (error: unknown) {
// Errors thrown here will be sent to the main instance by bull. Logging
// them out and rethrowing them allows to find out which worker had the
// issue.
this.logger.error('Executing a job errored', {
jobId: job.id,
executionId: job.data.executionId,
error,
});
ErrorReporterProxy.error(error);
throw error;
} catch (error) {
await this.reportJobProcessingError(ensureError(error), job);
}
});
this.logger.debug('Worker setup completed');
}
/**
 * Report a failure that occurred while this worker was processing a job,
 * then rethrow it.
 *
 * Steps, in order:
 * 1. log the error together with the execution ID and job ID,
 * 2. publish a `job-failed` message through `job.progress()`,
 * 3. forward the error to `ErrorReporterProxy`,
 * 4. rethrow the original error.
 *
 * @param error Error raised while processing the job.
 * @param job Job whose processing failed.
 * @throws Always rethrows `error`, so the returned promise never resolves.
 */
private async reportJobProcessingError(error: Error, job: Job) {
	const { executionId } = job.data;
	const jobId = job.id;

	this.logger.error(`Worker errored while running execution ${executionId} (job ${jobId})`, {
		error,
		executionId,
		jobId,
	});

	const msg: JobFailedMessage = {
		kind: 'job-failed',
		executionId,
		workerId: config.getEnv('redis.queueModeId'),
		errorMsg: error.message,
	};

	try {
		await job.progress(msg);
	} catch (progressError) {
		// Publishing the failure is best-effort. If it rejects, the original
		// job error must still be reported and rethrown below instead of being
		// replaced by the publishing error.
		this.logger.error(
			`Failed to publish failure of execution ${executionId} (job ${jobId})`,
			{ error: progressError, executionId, jobId },
		);
	}

	ErrorReporterProxy.error(error, { executionId });

	throw error;
}
@OnShutdown(HIGHEST_SHUTDOWN_PRIORITY)
async stop() {
await this.queue.pause(true, true);
await this.queue.pause(true, true); // no more jobs will be picked up
this.logger.debug('Queue paused');
this.stopQueueRecovery();
this.stopQueueMetrics();
this.logger.debug('Queue recovery and metrics stopped');
let count = 0;
while (this.getRunningJobsCount() !== 0) {
@@ -161,7 +176,10 @@ export class ScalingService {
const job = await this.queue.add(JOB_TYPE_NAME, jobData, jobOptions);
this.logger.info(`Added job ${job.id} (execution ${jobData.executionId})`);
const { executionId } = jobData;
const jobId = job.id;
this.logger.info(`Enqueued execution ${executionId} (job ${jobId})`, { executionId, jobId });
return job;
}
@@ -218,7 +236,7 @@ export class ScalingService {
*/
private registerWorkerListeners() {
this.queue.on('global:progress', (jobId: JobId, msg: unknown) => {
if (!this.isPubSubMessage(msg)) return;
if (!this.isJobMessage(msg)) return;
if (msg.kind === 'abort-job') this.jobProcessor.stopJob(jobId);
});
@@ -258,12 +276,36 @@ export class ScalingService {
throw error;
});
this.queue.on('global:progress', (_jobId: JobId, msg: unknown) => {
if (!this.isPubSubMessage(msg)) return;
this.queue.on('global:progress', (jobId: JobId, msg: unknown) => {
if (!this.isJobMessage(msg)) return;
if (msg.kind === 'respond-to-webhook') {
const decodedResponse = this.decodeWebhookResponse(msg.response);
this.activeExecutions.resolveResponsePromise(msg.executionId, decodedResponse);
// completion and failure are reported via `global:progress` to convey more details
// than natively provided by Bull in `global:completed` and `global:failed` events
switch (msg.kind) {
case 'respond-to-webhook':
const decodedResponse = this.decodeWebhookResponse(msg.response);
this.activeExecutions.resolveResponsePromise(msg.executionId, decodedResponse);
break;
case 'job-finished':
this.logger.info(`Execution ${msg.executionId} (job ${jobId}) finished successfully`, {
workerId: msg.workerId,
executionId: msg.executionId,
jobId,
});
break;
case 'job-failed':
this.logger.error(`Execution ${msg.executionId} (job ${jobId}) failed`, {
workerId: msg.workerId,
errorMsg: msg.errorMsg,
executionId: msg.executionId,
jobId,
});
break;
case 'abort-job':
break; // only for worker
default:
assertNever(msg);
}
});
@@ -273,7 +315,8 @@ export class ScalingService {
}
}
private isPubSubMessage(candidate: unknown): candidate is JobReport {
/**
 * Type guard for messages sent via Bull's internal pubsub setup.
 *
 * Accepts any non-null object that has a `kind` property; the value of
 * `kind` itself is not inspected here.
 */
private isJobMessage(candidate: unknown): candidate is JobMessage {
	if (candidate === null || typeof candidate !== 'object') return false;

	return 'kind' in candidate;
}
@@ -345,6 +388,8 @@ export class ScalingService {
if (this.queueMetricsInterval) {
clearInterval(this.queueMetricsInterval);
this.queueMetricsInterval = undefined;
this.logger.debug('Queue metrics collection stopped');
}
}
@@ -379,6 +424,8 @@ export class ScalingService {
/**
 * Clear the timeout held in `queueRecoveryContext` and log at debug level
 * that queue recovery has stopped.
 */
private stopQueueRecovery() {
	const { timeout } = this.queueRecoveryContext;

	clearTimeout(timeout);

	this.logger.debug('Queue recovery stopped');
}
/**