feat(core): Automatically extract evaluation metrics (no-changelog) (#14051)

2025-12-18 02:21:13 +00:00 · 2025-03-25 15:43:19 +01:00
parent 22e6569f7e
commit 53e11b19ad
28 changed files with 460 additions and 1286 deletions
--- a/packages/cli/src/constants.ts
+++ b/packages/cli/src/constants.ts
@@ -195,3 +195,5 @@ export const WsStatusCodes = {
 } as const;

 export const FREE_AI_CREDITS_CREDENTIAL_NAME = 'n8n free OpenAI API credits';
+
+export const EVALUATION_METRICS_NODE = `${NODE_PACKAGE_PREFIX}base.evaluationMetrics`;
--- a/packages/cli/src/evaluation.ee/metrics.controller.ts
+++ b/packages/cli/src/evaluation.ee/metrics.controller.ts
@@ -1,141 +0,0 @@
-import express from 'express';
-
-import { TestMetricRepository } from '@/databases/repositories/test-metric.repository.ee';
-import { Delete, Get, Patch, Post, RestController } from '@/decorators';
-import { NotFoundError } from '@/errors/response-errors/not-found.error';
-import {
-	testMetricCreateRequestBodySchema,
-	testMetricPatchRequestBodySchema,
-} from '@/evaluation.ee/metric.schema';
-import { getSharedWorkflowIds } from '@/public-api/v1/handlers/workflows/workflows.service';
-import { Telemetry } from '@/telemetry';
-
-import { TestDefinitionService } from './test-definition.service.ee';
-import { TestMetricsRequest } from './test-definitions.types.ee';
-
-@RestController('/evaluation/test-definitions')
-export class TestMetricsController {
-	constructor(
-		private readonly testDefinitionService: TestDefinitionService,
-		private readonly testMetricRepository: TestMetricRepository,
-		private readonly telemetry: Telemetry,
-	) {}
-
-	// This method is used in multiple places in the controller to get the test definition
-	// (or just check that it exists and the user has access to it).
-	private async getTestDefinition(
-		req:
-			| TestMetricsRequest.GetOne
-			| TestMetricsRequest.GetMany
-			| TestMetricsRequest.Patch
-			| TestMetricsRequest.Delete
-			| TestMetricsRequest.Create,
-	) {
-		const { testDefinitionId } = req.params;
-
-		const userAccessibleWorkflowIds = await getSharedWorkflowIds(req.user, ['workflow:read']);
-
-		const testDefinition = await this.testDefinitionService.findOne(
-			testDefinitionId,
-			userAccessibleWorkflowIds,
-		);
-
-		if (!testDefinition) throw new NotFoundError('Test definition not found');
-
-		return testDefinition;
-	}
-
-	@Get('/:testDefinitionId/metrics')
-	async getMany(req: TestMetricsRequest.GetMany) {
-		const { testDefinitionId } = req.params;
-
-		await this.getTestDefinition(req);
-
-		return await this.testMetricRepository.find({
-			where: { testDefinition: { id: testDefinitionId } },
-		});
-	}
-
-	@Get('/:testDefinitionId/metrics/:id')
-	async getOne(req: TestMetricsRequest.GetOne) {
-		const { id: metricId, testDefinitionId } = req.params;
-
-		await this.getTestDefinition(req);
-
-		const metric = await this.testMetricRepository.findOne({
-			where: { id: metricId, testDefinition: { id: testDefinitionId } },
-		});
-
-		if (!metric) throw new NotFoundError('Metric not found');
-
-		return metric;
-	}
-
-	@Post('/:testDefinitionId/metrics')
-	async create(req: TestMetricsRequest.Create, res: express.Response) {
-		const bodyParseResult = testMetricCreateRequestBodySchema.safeParse(req.body);
-		if (!bodyParseResult.success) {
-			res.status(400).json({ errors: bodyParseResult.error.errors });
-			return;
-		}
-
-		const testDefinition = await this.getTestDefinition(req);
-
-		const metric = this.testMetricRepository.create({
-			...req.body,
-			testDefinition,
-		});
-
-		return await this.testMetricRepository.save(metric);
-	}
-
-	@Patch('/:testDefinitionId/metrics/:id')
-	async patch(req: TestMetricsRequest.Patch, res: express.Response) {
-		const { id: metricId, testDefinitionId } = req.params;
-
-		const bodyParseResult = testMetricPatchRequestBodySchema.safeParse(req.body);
-		if (!bodyParseResult.success) {
-			res.status(400).json({ errors: bodyParseResult.error.errors });
-			return;
-		}
-
-		await this.getTestDefinition(req);
-
-		const metric = await this.testMetricRepository.findOne({
-			where: { id: metricId, testDefinition: { id: testDefinitionId } },
-		});
-
-		if (!metric) throw new NotFoundError('Metric not found');
-
-		const updateResult = await this.testMetricRepository.update(metricId, bodyParseResult.data);
-
-		// Send telemetry event if the metric was updated
-		if (updateResult.affected === 1 && metric.name !== bodyParseResult.data.name) {
-			this.telemetry.track('User added metrics to test', {
-				metric_id: metricId,
-				metric_name: bodyParseResult.data.name,
-				test_id: testDefinitionId,
-			});
-		}
-
-		// Respond with the updated metric
-		return await this.testMetricRepository.findOneBy({ id: metricId });
-	}
-
-	@Delete('/:testDefinitionId/metrics/:id')
-	async delete(req: TestMetricsRequest.Delete) {
-		const { id: metricId, testDefinitionId } = req.params;
-
-		await this.getTestDefinition(req);
-
-		const metric = await this.testMetricRepository.findOne({
-			where: { id: metricId, testDefinition: { id: testDefinitionId } },
-		});
-
-		if (!metric) throw new NotFoundError('Metric not found');
-
-		await this.testMetricRepository.delete(metricId);
-
-		return { success: true };
-	}
-}
--- a/packages/cli/src/evaluation.ee/test-definitions.types.ee.ts
+++ b/packages/cli/src/evaluation.ee/test-definitions.types.ee.ts
@@ -47,36 +47,6 @@ export declare namespace TestDefinitionsRequest {
 	>;
 }

-// ----------------------------------
-//             /test-definitions/:testDefinitionId/metrics
-// ----------------------------------
-
-export declare namespace TestMetricsRequest {
-	namespace RouteParams {
-		type TestDefinitionId = {
-			testDefinitionId: string;
-		};
-
-		type TestMetricId = {
-			id: string;
-		};
-	}
-
-	type GetOne = AuthenticatedRequest<RouteParams.TestDefinitionId & RouteParams.TestMetricId>;
-
-	type GetMany = AuthenticatedRequest<RouteParams.TestDefinitionId>;
-
-	type Create = AuthenticatedRequest<RouteParams.TestDefinitionId, {}, { name: string }>;
-
-	type Patch = AuthenticatedRequest<
-		RouteParams.TestDefinitionId & RouteParams.TestMetricId,
-		{},
-		{ name: string }
-	>;
-
-	type Delete = AuthenticatedRequest<RouteParams.TestDefinitionId & RouteParams.TestMetricId>;
-}
-
 // ----------------------------------
 // 					 /test-definitions/:testDefinitionId/runs
 // ----------------------------------
--- a/packages/cli/src/evaluation.ee/test-runner/tests/evaluation-metrics.ee.test.ts
+++ b/packages/cli/src/evaluation.ee/test-runner/tests/evaluation-metrics.ee.test.ts
@@ -24,14 +24,6 @@ describe('EvaluationMetrics', () => {
 		);
 	});

-	test('should throw when missing values', () => {
-		const testMetricNames = new Set(['metric1', 'metric2']);
-		const metrics = new EvaluationMetrics(testMetricNames);
-
-		expect(() => metrics.addResults({ metric1: 1 })).toThrow('METRICS_MISSING');
-		expect(() => metrics.addResults({ metric2: 0.2 })).toThrow('METRICS_MISSING');
-	});
-
 	test('should handle empty metrics', () => {
 		const testMetricNames = new Set(['metric1', 'metric2']);
 		const metrics = new EvaluationMetrics(testMetricNames);
--- a/packages/cli/src/evaluation.ee/test-runner/tests/mock-data/workflow.evaluation-middle.json
+++ b/packages/cli/src/evaluation.ee/test-runner/tests/mock-data/workflow.evaluation-middle.json
@@ -0,0 +1,85 @@
+{
+	"nodes": [
+		{
+			"parameters": {},
+			"id": "6dde1608-135f-441d-8438-40f605e4dae3",
+			"name": "Execute Workflow Trigger",
+			"type": "n8n-nodes-base.executeWorkflowTrigger",
+			"typeVersion": 1,
+			"position": [-180, -40]
+		},
+		{
+			"parameters": {
+				"metrics": {
+					"assignments": [
+						{
+							"id": "f1cab1e4-dabc-4750-a1f7-6669a60213c9",
+							"name": "metric1",
+							"value": 20,
+							"type": "number"
+						},
+						{
+							"id": "a66ed953-5341-47f6-8565-47640d987f5f",
+							"name": "metric2",
+							"value": 30,
+							"type": "number"
+						}
+					]
+				}
+			},
+			"id": "dfc71b70-7b7a-4dde-914f-cc4ffc99b18c",
+			"name": "Success",
+			"type": "n8n-nodes-base.evaluationMetrics",
+			"typeVersion": 1,
+			"position": [620, -40]
+		},
+		{
+			"parameters": {
+				"metrics": {
+					"assignments": [
+						{
+							"id": "1e1153da-77c0-4cb5-804a-3f4bc40833dd",
+							"name": "metric2",
+							"value": 10,
+							"type": "number"
+						}
+					]
+				}
+			},
+			"id": "5ef5be33-37e0-4c95-81c8-3fd677bdca88",
+			"name": "First Metric",
+			"type": "n8n-nodes-base.evaluationMetrics",
+			"typeVersion": 1,
+			"position": [160, -40]
+		}
+	],
+	"connections": {
+		"Execute Workflow Trigger": {
+			"main": [
+				[
+					{
+						"node": "First Metric",
+						"type": "main",
+						"index": 0
+					}
+				]
+			]
+		},
+		"First Metric": {
+			"main": [
+				[
+					{
+						"node": "Success",
+						"type": "main",
+						"index": 0
+					}
+				]
+			]
+		}
+	},
+	"pinData": {},
+	"meta": {
+		"templateCredsSetupCompleted": true,
+		"instanceId": "27cc9b56542ad45b38725555722c50a1c3fee1670bbb67980558314ee08517c4"
+	}
+}
--- a/packages/cli/src/evaluation.ee/test-runner/tests/mock-data/workflow.evaluation.json
+++ b/packages/cli/src/evaluation.ee/test-runner/tests/mock-data/workflow.evaluation.json
@@ -50,7 +50,7 @@
 		},
 		{
 			"parameters": {
-				"assignments": {
+				"metrics": {
 					"assignments": [
 						{
 							"id": "3b65d55a-158f-40c6-9853-a1c44b7ba1e5",
@@ -70,13 +70,13 @@
 			},
 			"id": "0c7a1ee8-0cf0-4d7f-99a3-186bbcd8815a",
 			"name": "Success",
-			"type": "n8n-nodes-base.set",
-			"typeVersion": 3.4,
+			"type": "n8n-nodes-base.evaluationMetrics",
+			"typeVersion": 1,
 			"position": [980, 220]
 		},
 		{
 			"parameters": {
-				"assignments": {
+				"metrics": {
 					"assignments": [
 						{
 							"id": "6cc8b402-4a30-4873-b825-963a1f1b8b82",
@@ -90,8 +90,8 @@
 			},
 			"id": "50d3f84a-d99f-4e04-bdbd-3e8c2668e708",
 			"name": "Fail",
-			"type": "n8n-nodes-base.set",
-			"typeVersion": 3.4,
+			"type": "n8n-nodes-base.evaluationMetrics",
+			"typeVersion": 1,
 			"position": [980, 420]
 		}
 	],
--- a/packages/cli/src/evaluation.ee/test-runner/tests/test-runner.service.ee.test.ts
+++ b/packages/cli/src/evaluation.ee/test-runner/tests/test-runner.service.ee.test.ts
@@ -46,6 +46,12 @@ const wfEvaluationJson = JSON.parse(
 	readFileSync(path.join(__dirname, './mock-data/workflow.evaluation.json'), { encoding: 'utf-8' }),
 );

+const wfEvaluationMiddleJson = JSON.parse(
+	readFileSync(path.join(__dirname, './mock-data/workflow.evaluation-middle.json'), {
+		encoding: 'utf-8',
+	}),
+);
+
 const wfMultipleTriggersJson = JSON.parse(
 	readFileSync(path.join(__dirname, './mock-data/workflow.multiple-triggers.json'), {
 		encoding: 'utf-8',
@@ -131,9 +137,22 @@ function mockEvaluationExecutionData(metrics: Record<string, GenericValue>) {
 	return mock<IRun>({
 		data: {
 			resultData: {
-				lastNodeExecuted: 'lastNode',
+				lastNodeExecuted: 'Success',
 				runData: {
-					lastNode: [
+					Success: [
+						{
+							data: {
+								main: [
+									[
+										{
+											json: metrics,
+										},
+									],
+								],
+							},
+						},
+					],
+					Fail: [
 						{
 							data: {
 								main: [
@@ -155,6 +174,52 @@ function mockEvaluationExecutionData(metrics: Record<string, GenericValue>) {
 	});
 }

+function mockEvaluationMiddleExecutionData(
+	firstMetrics: Record<string, GenericValue>,
+	secondMetrics: Record<string, GenericValue>,
+) {
+	// Clone the metrics to avoid modifying the passed object
+	// For test assertions, these run-data need special handling
+	const runData: Record<string, any> = {
+		'First Metric': [
+			{
+				data: {
+					main: [
+						[
+							{
+								json: firstMetrics,
+							},
+						],
+					],
+				},
+			},
+		],
+		Success: [
+			{
+				data: {
+					main: [
+						[
+							{
+								json: secondMetrics,
+							},
+						],
+					],
+				},
+			},
+		],
+	};
+
+	return mock<IRun>({
+		data: {
+			resultData: {
+				lastNodeExecuted: 'Success',
+				runData,
+				error: undefined,
+			},
+		},
+	});
+}
+
 const errorReporter = mock<ErrorReporter>();
 const logger = mockLogger();
 const telemetry = mock<Telemetry>();
@@ -363,7 +428,6 @@ describe('TestRunnerService', () => {
 		expect(testRunRepository.markAsCompleted).toHaveBeenCalledTimes(1);
 		expect(testRunRepository.markAsCompleted).toHaveBeenCalledWith('test-run-id', {
 			metric1: 0.75,
-			metric2: 50,
 		});

 		expect(testRunRepository.incrementPassed).toHaveBeenCalledTimes(2);
@@ -868,6 +932,218 @@ describe('TestRunnerService', () => {
 		expect(workflowRunner.run).toHaveBeenCalledTimes(1);
 	});

+	test('should run workflow with metrics defined in the middle of the workflow', async () => {
+		const testRunnerService = new TestRunnerService(
+			logger,
+			telemetry,
+			workflowRepository,
+			workflowRunner,
+			executionRepository,
+			activeExecutions,
+			testRunRepository,
+			testCaseExecutionRepository,
+			testMetricRepository,
+			mockNodeTypes,
+			errorReporter,
+		);
+
+		workflowRepository.findById.calledWith('workflow-under-test-id').mockResolvedValueOnce({
+			id: 'workflow-under-test-id',
+			...wfUnderTestJson,
+		});
+
+		workflowRepository.findById.calledWith('evaluation-workflow-id').mockResolvedValueOnce({
+			id: 'evaluation-workflow-id',
+			...wfEvaluationMiddleJson,
+		});
+
+		workflowRunner.run.mockResolvedValueOnce('some-execution-id');
+		workflowRunner.run.mockResolvedValueOnce('some-execution-id-2');
+		workflowRunner.run.mockResolvedValueOnce('some-execution-id-3');
+		workflowRunner.run.mockResolvedValueOnce('some-execution-id-4');
+
+		// Mock executions of workflow under test
+		activeExecutions.getPostExecutePromise
+			.calledWith('some-execution-id')
+			.mockResolvedValue(mockExecutionData());
+
+		activeExecutions.getPostExecutePromise
+			.calledWith('some-execution-id-3')
+			.mockResolvedValue(mockExecutionData());
+
+		// Mock executions of evaluation workflow
+		activeExecutions.getPostExecutePromise
+			.calledWith('some-execution-id-2')
+			.mockResolvedValue(mockEvaluationMiddleExecutionData({ metric2: 1 }, { metric1: 1 }));
+
+		activeExecutions.getPostExecutePromise
+			.calledWith('some-execution-id-4')
+			.mockResolvedValue(mockEvaluationMiddleExecutionData({ metric2: 2 }, { metric1: 0.5 }));
+
+		await testRunnerService.runTest(
+			mock<User>(),
+			mock<TestDefinition>({
+				workflowId: 'workflow-under-test-id',
+				evaluationWorkflowId: 'evaluation-workflow-id',
+				mockedNodes: [{ id: '72256d90-3a67-4e29-b032-47df4e5768af' }],
+			}),
+		);
+
+		expect(workflowRunner.run).toHaveBeenCalledTimes(4);
+
+		// Check workflow under test was executed
+		expect(workflowRunner.run).toHaveBeenCalledWith(
+			expect.objectContaining({
+				executionMode: 'evaluation',
+				pinData: {
+					'When clicking ‘Test workflow’':
+						executionDataJson.resultData.runData['When clicking ‘Test workflow’'][0].data.main[0],
+				},
+				workflowData: expect.objectContaining({
+					id: 'workflow-under-test-id',
+				}),
+			}),
+		);
+
+		// Check evaluation workflow was executed
+		expect(workflowRunner.run).toHaveBeenCalledWith(
+			expect.objectContaining({
+				executionMode: 'integrated',
+				executionData: expect.objectContaining({
+					executionData: expect.objectContaining({
+						nodeExecutionStack: expect.arrayContaining([
+							expect.objectContaining({ data: expect.anything() }),
+						]),
+					}),
+				}),
+				workflowData: expect.objectContaining({
+					id: 'evaluation-workflow-id',
+				}),
+			}),
+		);
+
+		// Check Test Run status was updated correctly
+		expect(testRunRepository.createTestRun).toHaveBeenCalledTimes(1);
+		expect(testRunRepository.markAsRunning).toHaveBeenCalledTimes(1);
+		expect(testRunRepository.markAsRunning).toHaveBeenCalledWith('test-run-id', expect.any(Number));
+		expect(testRunRepository.markAsCompleted).toHaveBeenCalledTimes(1);
+		expect(testRunRepository.markAsCompleted).toHaveBeenCalledWith('test-run-id', {
+			metric1: 0.75,
+			metric2: 1.5,
+		});
+
+		expect(testRunRepository.incrementPassed).toHaveBeenCalledTimes(2);
+		expect(testRunRepository.incrementFailed).not.toHaveBeenCalled();
+	});
+
+	test('should properly override metrics from earlier nodes with later ones', async () => {
+		const testRunnerService = new TestRunnerService(
+			logger,
+			telemetry,
+			workflowRepository,
+			workflowRunner,
+			executionRepository,
+			activeExecutions,
+			testRunRepository,
+			testCaseExecutionRepository,
+			testMetricRepository,
+			mockNodeTypes,
+			errorReporter,
+		);
+
+		workflowRepository.findById.calledWith('workflow-under-test-id').mockResolvedValueOnce({
+			id: 'workflow-under-test-id',
+			...wfUnderTestJson,
+		});
+
+		workflowRepository.findById.calledWith('evaluation-workflow-id').mockResolvedValueOnce({
+			id: 'evaluation-workflow-id',
+			...wfEvaluationMiddleJson,
+		});
+
+		workflowRunner.run.mockResolvedValueOnce('some-execution-id');
+		workflowRunner.run.mockResolvedValueOnce('some-execution-id-2');
+		workflowRunner.run.mockResolvedValueOnce('some-execution-id-3');
+		workflowRunner.run.mockResolvedValueOnce('some-execution-id-4');
+
+		// Mock executions of workflow under test
+		activeExecutions.getPostExecutePromise
+			.calledWith('some-execution-id')
+			.mockResolvedValue(mockExecutionData());
+
+		activeExecutions.getPostExecutePromise
+			.calledWith('some-execution-id-3')
+			.mockResolvedValue(mockExecutionData());
+
+		// Mock executions of evaluation workflow
+		activeExecutions.getPostExecutePromise
+			.calledWith('some-execution-id-2')
+			.mockResolvedValue(
+				mockEvaluationMiddleExecutionData({ metric2: 5 }, { metric1: 1, metric2: 5 }),
+			);
+
+		activeExecutions.getPostExecutePromise
+			.calledWith('some-execution-id-4')
+			.mockResolvedValue(
+				mockEvaluationMiddleExecutionData({ metric2: 10 }, { metric1: 0.5, metric2: 10 }),
+			);
+
+		await testRunnerService.runTest(
+			mock<User>(),
+			mock<TestDefinition>({
+				workflowId: 'workflow-under-test-id',
+				evaluationWorkflowId: 'evaluation-workflow-id',
+				mockedNodes: [{ id: '72256d90-3a67-4e29-b032-47df4e5768af' }],
+			}),
+		);
+
+		expect(workflowRunner.run).toHaveBeenCalledTimes(4);
+
+		// Check workflow under test was executed
+		expect(workflowRunner.run).toHaveBeenCalledWith(
+			expect.objectContaining({
+				executionMode: 'evaluation',
+				pinData: {
+					'When clicking ‘Test workflow’':
+						executionDataJson.resultData.runData['When clicking ‘Test workflow’'][0].data.main[0],
+				},
+				workflowData: expect.objectContaining({
+					id: 'workflow-under-test-id',
+				}),
+			}),
+		);
+
+		// Check evaluation workflow was executed
+		expect(workflowRunner.run).toHaveBeenCalledWith(
+			expect.objectContaining({
+				executionMode: 'integrated',
+				executionData: expect.objectContaining({
+					executionData: expect.objectContaining({
+						nodeExecutionStack: expect.arrayContaining([
+							expect.objectContaining({ data: expect.anything() }),
+						]),
+					}),
+				}),
+				workflowData: expect.objectContaining({
+					id: 'evaluation-workflow-id',
+				}),
+			}),
+		);
+
+		// Check Test Run status was updated correctly
+		expect(testRunRepository.createTestRun).toHaveBeenCalledTimes(1);
+		expect(testRunRepository.markAsRunning).toHaveBeenCalledTimes(1);
+		expect(testRunRepository.markAsRunning).toHaveBeenCalledWith('test-run-id', expect.any(Number));
+		expect(testRunRepository.markAsCompleted).toHaveBeenCalledTimes(1);
+		expect(testRunRepository.markAsCompleted).toHaveBeenCalledWith('test-run-id', {
+			metric1: 0.75,
+			metric2: 7.5,
+		});
+
+		expect(testRunRepository.incrementPassed).toHaveBeenCalledTimes(2);
+		expect(testRunRepository.incrementFailed).not.toHaveBeenCalled();
+	});
+
 	describe('Test Run cancellation', () => {
 		beforeAll(() => {
 			jest.useFakeTimers();
--- a/packages/cli/src/evaluation.ee/test-runner/errors.ee.ts
+++ b/packages/cli/src/evaluation.ee/test-runner/errors.ee.ts
@@ -6,8 +6,6 @@ export type TestCaseExecutionErrorCode =
 	| 'FAILED_TO_EXECUTE_WORKFLOW'
 	| 'EVALUATION_WORKFLOW_DOES_NOT_EXIST'
 	| 'FAILED_TO_EXECUTE_EVALUATION_WORKFLOW'
-	| 'METRICS_MISSING'
-	| 'UNKNOWN_METRICS'
 	| 'INVALID_METRICS'
 	| 'PAYLOAD_LIMIT_EXCEEDED'
 	| 'UNKNOWN_ERROR';
--- a/packages/cli/src/evaluation.ee/test-runner/evaluation-metrics.ee.ts
+++ b/packages/cli/src/evaluation.ee/test-runner/evaluation-metrics.ee.ts
@@ -1,4 +1,3 @@
-import difference from 'lodash/difference';
 import type { IDataObject } from 'n8n-workflow';

 import { TestCaseExecutionError } from '@/evaluation.ee/test-runner/errors.ee';
@@ -43,16 +42,6 @@ export class EvaluationMetrics {
 			}
 		}

-		// Check that result contains all expected metrics
-		if (
-			difference(Array.from(this.metricNames), Object.keys(addResultsInfo.addedMetrics)).length > 0
-		) {
-			throw new TestCaseExecutionError('METRICS_MISSING', {
-				expectedMetrics: Array.from(this.metricNames).sort(),
-				receivedMetrics: Object.keys(addResultsInfo.addedMetrics).sort(),
-			});
-		}
-
 		return addResultsInfo;
 	}

--- a/packages/cli/src/evaluation.ee/test-runner/test-runner.service.ee.ts
+++ b/packages/cli/src/evaluation.ee/test-runner/test-runner.service.ee.ts
@@ -1,8 +1,10 @@
 import { Service } from '@n8n/di';
 import { parse } from 'flatted';
+import difference from 'lodash/difference';
 import { ErrorReporter, Logger } from 'n8n-core';
 import { ExecutionCancelledError, NodeConnectionTypes, Workflow } from 'n8n-workflow';
 import type {
+	AssignmentCollectionValue,
 	IDataObject,
 	IRun,
 	IRunExecutionData,
@@ -13,6 +15,7 @@ import assert from 'node:assert';

 import { ActiveExecutions } from '@/active-executions';
 import config from '@/config';
+import { EVALUATION_METRICS_NODE } from '@/constants';
 import type { ExecutionEntity } from '@/databases/entities/execution-entity';
 import type { MockedNodeItem, TestDefinition } from '@/databases/entities/test-definition.ee';
 import type { TestRun } from '@/databases/entities/test-run.ee';
@@ -225,6 +228,50 @@ export class TestRunnerService {
 		return await executePromise;
 	}

+	/**
+	 * Sync the metrics of the test definition with the evaluation workflow.
+	 */
+	async syncMetrics(
+		testDefinitionId: string,
+		evaluationWorkflow: IWorkflowBase,
+	): Promise<Set<string>> {
+		const usedTestMetricNames = await this.getUsedTestMetricNames(evaluationWorkflow);
+		const existingTestMetrics = await this.testMetricRepository.find({
+			where: {
+				testDefinition: { id: testDefinitionId },
+			},
+		});
+
+		const existingMetricNames = new Set(existingTestMetrics.map((metric) => metric.name));
+		const metricsToAdd = difference(
+			Array.from(usedTestMetricNames),
+			Array.from(existingMetricNames),
+		);
+		const metricsToRemove = difference(
+			Array.from(existingMetricNames),
+			Array.from(usedTestMetricNames),
+		);
+
+		// Add new metrics
+		const metricsToAddEntities = metricsToAdd.map((metricName) =>
+			this.testMetricRepository.create({
+				name: metricName,
+				testDefinition: { id: testDefinitionId },
+			}),
+		);
+		await this.testMetricRepository.save(metricsToAddEntities);
+
+		// Remove no longer used metrics
+		metricsToRemove.forEach(async (metricName) => {
+			const metric = existingTestMetrics.find((m) => m.name === metricName);
+			assert(metric, 'Existing metric not found');
+
+			await this.testMetricRepository.delete(metric.id);
+		});
+
+		return usedTestMetricNames;
+	}
+
 	/**
 	 * Run the evaluation workflow with the expected and actual run data.
 	 */
@@ -265,35 +312,45 @@ export class TestRunnerService {
 		return await executePromise;
 	}

+	/**
+	 * Get the evaluation metrics nodes from a workflow.
+	 */
+	static getEvaluationMetricsNodes(workflow: IWorkflowBase) {
+		return workflow.nodes.filter((node) => node.type === EVALUATION_METRICS_NODE);
+	}
+
 	/**
 	 * Evaluation result is the first item in the output of the last node
 	 * executed in the evaluation workflow. Defaults to an empty object
 	 * in case the node doesn't produce any output items.
 	 */
-	private extractEvaluationResult(execution: IRun): IDataObject {
+	private extractEvaluationResult(execution: IRun, evaluationWorkflow: IWorkflowBase): IDataObject {
 		const lastNodeExecuted = execution.data.resultData.lastNodeExecuted;
 		assert(lastNodeExecuted, 'Could not find the last node executed in evaluation workflow');
+		const metricsNodes = TestRunnerService.getEvaluationMetricsNodes(evaluationWorkflow);
+		const metricsRunData = metricsNodes.flatMap(
+			(node) => execution.data.resultData.runData[node.name],
+		);
+		const metricsData = metricsRunData.reverse().map((data) => data.data?.main?.[0]?.[0]?.json);
+		const metricsResult = metricsData.reduce((acc, curr) => ({ ...acc, ...curr }), {}) ?? {};

-		// Extract the output of the last node executed in the evaluation workflow
-		// We use only the first item of a first main output
-		const lastNodeTaskData = execution.data.resultData.runData[lastNodeExecuted]?.[0];
-		const mainConnectionData = lastNodeTaskData?.data?.main?.[0];
-		return mainConnectionData?.[0]?.json ?? {};
+		return metricsResult;
 	}

 	/**
 	 * Get the metrics to collect from the evaluation workflow execution results.
 	 */
-	private async getTestMetricNames(testDefinitionId: string) {
-		const metrics = await this.testMetricRepository.find({
-			where: {
-				testDefinition: {
-					id: testDefinitionId,
-				},
-			},
+	private async getUsedTestMetricNames(evaluationWorkflow: IWorkflowBase) {
+		const metricsNodes = TestRunnerService.getEvaluationMetricsNodes(evaluationWorkflow);
+		const metrics = metricsNodes.map((node) => {
+			const metricsParameter = node.parameters?.metrics as AssignmentCollectionValue;
+			assert(metricsParameter, 'Metrics parameter not found');
+
+			const metricsNames = metricsParameter.assignments.map((assignment) => assignment.name);
+			return metricsNames;
 		});

-		return new Set(metrics.map((m) => m.name));
+		return new Set(metrics.flat());
 	}

 	/**
@@ -329,7 +386,6 @@ export class TestRunnerService {
 			if (!evaluationWorkflow) {
 				throw new TestRunError('EVALUATION_WORKFLOW_NOT_FOUND');
 			}
-
 			///
 			// 1. Make test cases from previous executions
 			///
@@ -359,8 +415,8 @@ export class TestRunnerService {
 				pastExecutions.map((e) => e.id),
 			);

-			// Get the metrics to collect from the evaluation workflow
-			const testMetricNames = await this.getTestMetricNames(test.id);
+			// Sync the metrics of the test definition with the evaluation workflow
+			const testMetricNames = await this.syncMetrics(test.id, evaluationWorkflow);

 			// 2. Run over all the test cases
 			const pastExecutionIds = pastExecutions.map((e) => e.id);
@@ -465,8 +521,8 @@ export class TestRunnerService {
 					this.logger.debug('Evaluation execution finished', { pastExecutionId });

 					// Extract the output of the last node executed in the evaluation workflow
-					const { addedMetrics, unknownMetrics } = metrics.addResults(
-						this.extractEvaluationResult(evalExecution),
+					const { addedMetrics } = metrics.addResults(
+						this.extractEvaluationResult(evalExecution, evaluationWorkflow),
 					);

 					if (evalExecution.data.resultData.error) {
@@ -483,22 +539,12 @@ export class TestRunnerService {
 						await Db.transaction(async (trx) => {
 							await this.testRunRepository.incrementPassed(testRun.id, trx);

-							// Add warning if the evaluation workflow produced an unknown metric
-							if (unknownMetrics.size > 0) {
-								await this.testCaseExecutionRepository.markAsWarning({
-									testRunId: testRun.id,
-									pastExecutionId,
-									errorCode: 'UNKNOWN_METRICS',
-									errorDetails: { unknownMetrics: Array.from(unknownMetrics) },
-								});
-							} else {
-								await this.testCaseExecutionRepository.markAsCompleted({
-									testRunId: testRun.id,
-									pastExecutionId,
-									metrics: addedMetrics,
-									trx,
-								});
-							}
+							await this.testCaseExecutionRepository.markAsCompleted({
+								testRunId: testRun.id,
+								pastExecutionId,
+								metrics: addedMetrics,
+								trx,
+							});
 						});
 					}
 				} catch (e) {
--- a/packages/cli/src/server.ts
+++ b/packages/cli/src/server.ts
@@ -66,7 +66,6 @@ import '@/executions/executions.controller';
 import '@/external-secrets.ee/external-secrets.controller.ee';
 import '@/license/license.controller';
 import '@/evaluation.ee/test-definitions.controller.ee';
-import '@/evaluation.ee/metrics.controller';
 import '@/evaluation.ee/test-runs.controller.ee';
 import '@/workflows/workflow-history.ee/workflow-history.controller.ee';
 import '@/workflows/workflows.controller';
--- a/packages/cli/test/integration/evaluation/metrics.api.test.ts
+++ b/packages/cli/test/integration/evaluation/metrics.api.test.ts
@@ -1,381 +0,0 @@
-import { Container } from '@n8n/di';
-import type { IWorkflowBase } from 'n8n-workflow';
-
-import type { TestDefinition } from '@/databases/entities/test-definition.ee';
-import type { User } from '@/databases/entities/user';
-import { TestDefinitionRepository } from '@/databases/repositories/test-definition.repository.ee';
-import { TestMetricRepository } from '@/databases/repositories/test-metric.repository.ee';
-import { createUserShell } from '@test-integration/db/users';
-import { createWorkflow } from '@test-integration/db/workflows';
-import * as testDb from '@test-integration/test-db';
-import type { SuperAgentTest } from '@test-integration/types';
-import * as utils from '@test-integration/utils';
-
-let authOwnerAgent: SuperAgentTest;
-let workflowUnderTest: IWorkflowBase;
-let otherWorkflow: IWorkflowBase;
-let testDefinition: TestDefinition;
-let otherTestDefinition: TestDefinition;
-let ownerShell: User;
-
-const testServer = utils.setupTestServer({ endpointGroups: ['evaluation'] });
-
-beforeAll(async () => {
-	ownerShell = await createUserShell('global:owner');
-	authOwnerAgent = testServer.authAgentFor(ownerShell);
-});
-
-beforeEach(async () => {
-	await testDb.truncate(['TestDefinition', 'TestMetric']);
-
-	workflowUnderTest = await createWorkflow({ name: 'workflow-under-test' }, ownerShell);
-
-	testDefinition = Container.get(TestDefinitionRepository).create({
-		name: 'test',
-		workflow: { id: workflowUnderTest.id },
-	});
-	await Container.get(TestDefinitionRepository).save(testDefinition);
-
-	otherWorkflow = await createWorkflow({ name: 'other-workflow' });
-
-	otherTestDefinition = Container.get(TestDefinitionRepository).create({
-		name: 'other-test',
-		workflow: { id: otherWorkflow.id },
-	});
-	await Container.get(TestDefinitionRepository).save(otherTestDefinition);
-});
-
-describe('GET /evaluation/test-definitions/:testDefinitionId/metrics', () => {
-	test('should retrieve empty list of metrics for a test definition', async () => {
-		const resp = await authOwnerAgent.get(
-			`/evaluation/test-definitions/${testDefinition.id}/metrics`,
-		);
-
-		expect(resp.statusCode).toBe(200);
-		expect(resp.body.data.length).toBe(0);
-	});
-
-	test('should retrieve metrics for a test definition', async () => {
-		const newMetric = Container.get(TestMetricRepository).create({
-			testDefinition: { id: testDefinition.id },
-			name: 'metric-1',
-		});
-		await Container.get(TestMetricRepository).save(newMetric);
-
-		const newMetric2 = Container.get(TestMetricRepository).create({
-			testDefinition: { id: testDefinition.id },
-			name: 'metric-2',
-		});
-		await Container.get(TestMetricRepository).save(newMetric2);
-
-		const resp = await authOwnerAgent.get(
-			`/evaluation/test-definitions/${testDefinition.id}/metrics`,
-		);
-
-		expect(resp.statusCode).toBe(200);
-		expect(resp.body.data.length).toBe(2);
-		expect(resp.body.data).toEqual(
-			expect.arrayContaining([
-				expect.objectContaining({
-					id: expect.any(String),
-					name: 'metric-1',
-				}),
-				expect.objectContaining({
-					id: expect.any(String),
-					name: 'metric-2',
-				}),
-			]),
-		);
-	});
-
-	test('should return 404 if test definition does not exist', async () => {
-		const resp = await authOwnerAgent.get('/evaluation/test-definitions/999/metrics');
-
-		expect(resp.statusCode).toBe(404);
-	});
-
-	test('should return 404 if test definition is not accessible to the user', async () => {
-		const resp = await authOwnerAgent.get(
-			`/evaluation/test-definitions/${otherTestDefinition.id}/metrics`,
-		);
-
-		expect(resp.statusCode).toBe(404);
-	});
-});
-
-describe('GET /evaluation/test-definitions/:testDefinitionId/metrics/:id', () => {
-	test('should retrieve a metric for a test definition', async () => {
-		const newMetric = Container.get(TestMetricRepository).create({
-			testDefinition: { id: testDefinition.id },
-			name: 'metric-1',
-		});
-		await Container.get(TestMetricRepository).save(newMetric);
-
-		const resp = await authOwnerAgent.get(
-			`/evaluation/test-definitions/${testDefinition.id}/metrics/${newMetric.id}`,
-		);
-
-		expect(resp.statusCode).toBe(200);
-		expect(resp.body.data).toEqual(
-			expect.objectContaining({
-				id: newMetric.id,
-				name: 'metric-1',
-			}),
-		);
-	});
-
-	test('should return 404 if metric does not exist', async () => {
-		const resp = await authOwnerAgent.get(
-			`/evaluation/test-definitions/${testDefinition.id}/metrics/999`,
-		);
-
-		expect(resp.statusCode).toBe(404);
-	});
-
-	test('should return 404 if metric is not accessible to the user', async () => {
-		const newMetric = Container.get(TestMetricRepository).create({
-			testDefinition: { id: otherTestDefinition.id },
-			name: 'metric-1',
-		});
-		await Container.get(TestMetricRepository).save(newMetric);
-
-		const resp = await authOwnerAgent.get(
-			`/evaluation/test-definitions/${otherTestDefinition.id}/metrics/${newMetric.id}`,
-		);
-
-		expect(resp.statusCode).toBe(404);
-	});
-});
-
-describe('POST /evaluation/test-definitions/:testDefinitionId/metrics', () => {
-	test('should create a metric for a test definition', async () => {
-		const resp = await authOwnerAgent
-			.post(`/evaluation/test-definitions/${testDefinition.id}/metrics`)
-			.send({
-				name: 'metric-1',
-			});
-
-		expect(resp.statusCode).toBe(200);
-		expect(resp.body.data).toEqual(
-			expect.objectContaining({
-				id: expect.any(String),
-				name: 'metric-1',
-			}),
-		);
-
-		const metrics = await Container.get(TestMetricRepository).find({
-			where: { testDefinition: { id: testDefinition.id } },
-		});
-		expect(metrics.length).toBe(1);
-		expect(metrics[0].name).toBe('metric-1');
-	});
-
-	test('should return 400 if name is missing', async () => {
-		const resp = await authOwnerAgent
-			.post(`/evaluation/test-definitions/${testDefinition.id}/metrics`)
-			.send({});
-
-		expect(resp.statusCode).toBe(400);
-		expect(resp.body.errors).toEqual(
-			expect.arrayContaining([
-				expect.objectContaining({
-					code: 'invalid_type',
-					message: 'Required',
-					path: ['name'],
-				}),
-			]),
-		);
-	});
-
-	test('should return 400 if name is not a string', async () => {
-		const resp = await authOwnerAgent
-			.post(`/evaluation/test-definitions/${testDefinition.id}/metrics`)
-			.send({
-				name: 123,
-			});
-
-		expect(resp.statusCode).toBe(400);
-		expect(resp.body.errors).toEqual(
-			expect.arrayContaining([
-				expect.objectContaining({
-					code: 'invalid_type',
-					message: 'Expected string, received number',
-					path: ['name'],
-				}),
-			]),
-		);
-	});
-
-	test('should return 404 if test definition does not exist', async () => {
-		const resp = await authOwnerAgent.post('/evaluation/test-definitions/999/metrics').send({
-			name: 'metric-1',
-		});
-
-		expect(resp.statusCode).toBe(404);
-	});
-
-	test('should return 404 if test definition is not accessible to the user', async () => {
-		const resp = await authOwnerAgent
-			.post(`/evaluation/test-definitions/${otherTestDefinition.id}/metrics`)
-			.send({
-				name: 'metric-1',
-			});
-
-		expect(resp.statusCode).toBe(404);
-	});
-});
-
-describe('PATCH /evaluation/test-definitions/:testDefinitionId/metrics/:id', () => {
-	test('should update a metric for a test definition', async () => {
-		const newMetric = Container.get(TestMetricRepository).create({
-			testDefinition: { id: testDefinition.id },
-			name: 'metric-1',
-		});
-		await Container.get(TestMetricRepository).save(newMetric);
-
-		const resp = await authOwnerAgent
-			.patch(`/evaluation/test-definitions/${testDefinition.id}/metrics/${newMetric.id}`)
-			.send({
-				name: 'metric-2',
-			});
-
-		expect(resp.statusCode).toBe(200);
-		expect(resp.body.data).toEqual(
-			expect.objectContaining({
-				id: newMetric.id,
-				name: 'metric-2',
-			}),
-		);
-
-		const metrics = await Container.get(TestMetricRepository).find({
-			where: { testDefinition: { id: testDefinition.id } },
-		});
-		expect(metrics.length).toBe(1);
-		expect(metrics[0].name).toBe('metric-2');
-	});
-
-	test('should return 400 if name is missing', async () => {
-		const newMetric = Container.get(TestMetricRepository).create({
-			testDefinition: { id: testDefinition.id },
-			name: 'metric-1',
-		});
-		await Container.get(TestMetricRepository).save(newMetric);
-
-		const resp = await authOwnerAgent
-			.patch(`/evaluation/test-definitions/${testDefinition.id}/metrics/${newMetric.id}`)
-			.send({});
-
-		expect(resp.statusCode).toBe(400);
-		expect(resp.body.errors).toEqual(
-			expect.arrayContaining([
-				expect.objectContaining({
-					code: 'invalid_type',
-					message: 'Required',
-					path: ['name'],
-				}),
-			]),
-		);
-	});
-
-	test('should return 400 if name is not a string', async () => {
-		const newMetric = Container.get(TestMetricRepository).create({
-			testDefinition: { id: testDefinition.id },
-			name: 'metric-1',
-		});
-		await Container.get(TestMetricRepository).save(newMetric);
-
-		const resp = await authOwnerAgent
-			.patch(`/evaluation/test-definitions/${testDefinition.id}/metrics/${newMetric.id}`)
-			.send({
-				name: 123,
-			});
-
-		expect(resp.statusCode).toBe(400);
-		expect(resp.body.errors).toEqual(
-			expect.arrayContaining([
-				expect.objectContaining({
-					code: 'invalid_type',
-					message: 'Expected string, received number',
-				}),
-			]),
-		);
-	});
-
-	test('should return 404 if metric does not exist', async () => {
-		const resp = await authOwnerAgent
-			.patch(`/evaluation/test-definitions/${testDefinition.id}/metrics/999`)
-			.send({
-				name: 'metric-1',
-			});
-
-		expect(resp.statusCode).toBe(404);
-	});
-
-	test('should return 404 if test definition does not exist', async () => {
-		const resp = await authOwnerAgent.patch('/evaluation/test-definitions/999/metrics/999').send({
-			name: 'metric-1',
-		});
-
-		expect(resp.statusCode).toBe(404);
-	});
-
-	test('should return 404 if metric is not accessible to the user', async () => {
-		const newMetric = Container.get(TestMetricRepository).create({
-			testDefinition: { id: otherTestDefinition.id },
-			name: 'metric-1',
-		});
-		await Container.get(TestMetricRepository).save(newMetric);
-
-		const resp = await authOwnerAgent
-			.patch(`/evaluation/test-definitions/${otherTestDefinition.id}/metrics/${newMetric.id}`)
-			.send({
-				name: 'metric-2',
-			});
-
-		expect(resp.statusCode).toBe(404);
-	});
-});
-
-describe('DELETE /evaluation/test-definitions/:testDefinitionId/metrics/:id', () => {
-	test('should delete a metric for a test definition', async () => {
-		const newMetric = Container.get(TestMetricRepository).create({
-			testDefinition: { id: testDefinition.id },
-			name: 'metric-1',
-		});
-		await Container.get(TestMetricRepository).save(newMetric);
-
-		const resp = await authOwnerAgent.delete(
-			`/evaluation/test-definitions/${testDefinition.id}/metrics/${newMetric.id}`,
-		);
-
-		expect(resp.statusCode).toBe(200);
-		expect(resp.body.data).toEqual({ success: true });
-
-		const metrics = await Container.get(TestMetricRepository).find({
-			where: { testDefinition: { id: testDefinition.id } },
-		});
-		expect(metrics.length).toBe(0);
-	});
-
-	test('should return 404 if metric does not exist', async () => {
-		const resp = await authOwnerAgent.delete(
-			`/evaluation/test-definitions/${testDefinition.id}/metrics/999`,
-		);
-
-		expect(resp.statusCode).toBe(404);
-	});
-
-	test('should return 404 if metric is not accessible to the user', async () => {
-		const newMetric = Container.get(TestMetricRepository).create({
-			testDefinition: { id: otherTestDefinition.id },
-			name: 'metric-1',
-		});
-		await Container.get(TestMetricRepository).save(newMetric);
-
-		const resp = await authOwnerAgent.delete(
-			`/evaluation/test-definitions/${otherTestDefinition.id}/metrics/${newMetric.id}`,
-		);
-
-		expect(resp.statusCode).toBe(404);
-	});
-});
--- a/packages/cli/test/integration/shared/utils/test-server.ts
+++ b/packages/cli/test/integration/shared/utils/test-server.ts
@@ -281,7 +281,6 @@ export const setupTestServer = ({
 						break;

 					case 'evaluation':
-						await import('@/evaluation.ee/metrics.controller');
 						await import('@/evaluation.ee/test-definitions.controller.ee');
 						await import('@/evaluation.ee/test-runs.controller.ee');
 						break;