diff --git a/packages/cli/src/evaluation.ee/test-runner/__tests__/utils.ee.test.ts b/packages/cli/src/evaluation.ee/test-runner/__tests__/utils.ee.test.ts
new file mode 100644
index 0000000000..3ce62313fb
--- /dev/null
+++ b/packages/cli/src/evaluation.ee/test-runner/__tests__/utils.ee.test.ts
@@ -0,0 +1,290 @@
+import { mock } from 'jest-mock-extended';
+import { NodeConnectionTypes } from 'n8n-workflow';
+import type { IRunData } from 'n8n-workflow';
+
+import { extractTokenUsage } from '../utils.ee';
+
+type TokenUsage = {
+	completionTokens: number;
+	promptTokens: number;
+	totalTokens: number;
+};
+
+type JsonData = {
+	tokenUsage?: TokenUsage | null;
+	tokenUsageEstimate?: TokenUsage | null;
+	[key: string]: unknown;
+};
+
+function createRunDataMock(nodeConfigs: Record<string, JsonData[]>) {
+	const runData: Record<string, unknown> = {};
+
+	for (const [nodeName, jsonValues] of Object.entries(nodeConfigs)) {
+		if (
+			jsonValues.some(
+				(json) => json.tokenUsage !== undefined || json.tokenUsageEstimate !== undefined,
+			)
+		) {
+			// AI node with language model data
+			runData[nodeName] = jsonValues.map((json) => ({
+				data: {
+					[NodeConnectionTypes.AiLanguageModel]: [[{ json }]],
+				},
+			}));
+		} else {
+			// Non-AI node with main data
+			runData[nodeName] = jsonValues.map((json) => ({
+				data: {
+					main: [[{ json }]],
+				},
+			}));
+		}
+	}
+
+	return mock<IRunData>(runData as any);
+}
+
+describe('extractTokenUsage', () => {
+	describe('Basic Token Extraction', () => {
+		it('extracts token usage from single AI node', () => {
+			const runData = createRunDataMock({
+				'OpenAI Chat Model': [
+					{
+						tokenUsage: {
+							completionTokens: 16,
+							promptTokens: 69,
+							totalTokens: 85,
+						},
+					},
+				],
+			});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 16,
+				promptTokens: 69,
+				totalTokens: 85,
+			});
+		});
+
+		it('extracts token usage from multiple AI node executions', () => {
+			const runData = createRunDataMock({
+				'OpenAI Chat Model': [
+					{
+						tokenUsage: {
+							completionTokens: 16,
+							promptTokens: 69,
+							totalTokens: 85,
+						},
+					},
+					{
+						tokenUsage: {
+							completionTokens: 46,
+							promptTokens: 138,
+							totalTokens: 184,
+						},
+					},
+				],
+			});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 62,
+				promptTokens: 207,
+				totalTokens: 269,
+			});
+		});
+
+		it('extracts token usage from mixed AI nodes', () => {
+			const runData = createRunDataMock({
+				'OpenAI Chat Model': [
+					{
+						tokenUsage: {
+							completionTokens: 16,
+							promptTokens: 69,
+							totalTokens: 85,
+						},
+					},
+				],
+				'Claude Model': [
+					{
+						tokenUsage: {
+							completionTokens: 25,
+							promptTokens: 50,
+							totalTokens: 75,
+						},
+					},
+				],
+			});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 41,
+				promptTokens: 119,
+				totalTokens: 160,
+			});
+		});
+	});
+
+	describe('Token Usage Sources', () => {
+		it('extracts from tokenUsageEstimate property', () => {
+			const runData = createRunDataMock({
+				'OpenAI Chat Model': [
+					{
+						tokenUsage: undefined,
+						tokenUsageEstimate: {
+							completionTokens: 20,
+							promptTokens: 80,
+							totalTokens: 100,
+						},
+					},
+				],
+			});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 20,
+				promptTokens: 80,
+				totalTokens: 100,
+			});
+		});
+
+		it('prioritizes tokenUsage over tokenUsageEstimate', () => {
+			const runData = createRunDataMock({
+				'OpenAI Chat Model': [
+					{
+						tokenUsage: {
+							completionTokens: 16,
+							promptTokens: 69,
+							totalTokens: 85,
+						},
+						tokenUsageEstimate: {
+							completionTokens: 20,
+							promptTokens: 80,
+							totalTokens: 100,
+						},
+					},
+				],
+			});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 16,
+				promptTokens: 69,
+				totalTokens: 85,
+			});
+		});
+	});
+
+	describe('Null/Undefined Token Data', () => {
+		it('handles missing token usage data', () => {
+			const runData = createRunDataMock({
+				'OpenAI Chat Model': [{}],
+			});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 0,
+				promptTokens: 0,
+				totalTokens: 0,
+			});
+		});
+
+		it('handles null token usage data', () => {
+			const runData = createRunDataMock({
+				'OpenAI Chat Model': [
+					{
+						tokenUsage: null,
+					},
+				],
+			});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 0,
+				promptTokens: 0,
+				totalTokens: 0,
+			});
+		});
+	});
+
+	describe('Edge cases', () => {
+		it('handles empty AI node data', () => {
+			const runData = mock<IRunData>({
+				'OpenAI Chat Model': [
+					{
+						data: {
+							[NodeConnectionTypes.AiLanguageModel]: [],
+						},
+					},
+				],
+			});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 0,
+				promptTokens: 0,
+				totalTokens: 0,
+			});
+		});
+
+		it('handles missing AI node execution data', () => {
+			const runData = mock<IRunData>({
+				'OpenAI Chat Model': [
+					{
+						data: {
+							[NodeConnectionTypes.AiLanguageModel]: [[]],
+						},
+					},
+				],
+			});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 0,
+				promptTokens: 0,
+				totalTokens: 0,
+			});
+		});
+
+		it('handles empty execution data', () => {
+			const runData = createRunDataMock({});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 0,
+				promptTokens: 0,
+				totalTokens: 0,
+			});
+		});
+
+		it('handles execution with no AI nodes', () => {
+			const runData = createRunDataMock({
+				'When clicking Execute workflow': [
+					{
+						text: 'Say HEY',
+						code: 1,
+					},
+				],
+			});
+
+			const result = extractTokenUsage(runData);
+
+			expect(result.total).toEqual({
+				completionTokens: 0,
+				promptTokens: 0,
+				totalTokens: 0,
+			});
+		});
+	});
+});
diff --git a/packages/cli/src/evaluation.ee/test-runner/test-runner.service.ee.ts b/packages/cli/src/evaluation.ee/test-runner/test-runner.service.ee.ts
index da91c5ba49..65fbaefc6d 100644
--- a/packages/cli/src/evaluation.ee/test-runner/test-runner.service.ee.ts
+++ b/packages/cli/src/evaluation.ee/test-runner/test-runner.service.ee.ts
@@ -23,7 +23,10 @@ import assert from 'node:assert';

 import { ActiveExecutions } from '@/active-executions';
 import config from '@/config';
 import { TestCaseExecutionError, TestRunError } from '@/evaluation.ee/test-runner/errors.ee';
-import { checkNodeParameterNotEmpty } from '@/evaluation.ee/test-runner/utils.ee';
+import {
+	checkNodeParameterNotEmpty,
+	extractTokenUsage,
+} from '@/evaluation.ee/test-runner/utils.ee';
 import { Telemetry } from '@/telemetry';
 import { WorkflowRunner } from '@/workflow-runner';
@@ -367,7 +370,7 @@ export class TestRunnerService {
 	/**
 	 * Evaluation result is collected from all Evaluation Metrics nodes
 	 */
-	private extractEvaluationResult(execution: IRun, workflow: IWorkflowBase): IDataObject {
+	private extractUserDefinedMetrics(execution: IRun, workflow: IWorkflowBase): IDataObject {
 		const metricsNodes = TestRunnerService.getEvaluationMetricsNodes(workflow);

 		// If a metrics node did not execute, ignore it.
@@ -381,6 +384,23 @@ export class TestRunnerService {
 		return metricsResult;
 	}

+	/**
+	 * Extracts predefined metrics from the execution data.
+	 * Currently, it extracts token usage and execution time.
+	 */
+	private extractPredefinedMetrics(execution: IRun) {
+		const metricValues: Record<string, number> = {};
+
+		const tokenUsageMetrics = extractTokenUsage(execution.data.resultData.runData);
+		Object.assign(metricValues, tokenUsageMetrics.total);
+
+		if (execution.startedAt && execution.stoppedAt) {
+			metricValues.executionTime = execution.stoppedAt.getTime() - execution.startedAt.getTime();
+		}
+
+		return metricValues;
+	}
+
 	/**
 	 * Creates a new test run for the given workflow
 	 */
@@ -511,11 +531,18 @@
 				}

 				const completedAt = new Date();
-				const { addedMetrics } = metrics.addResults(
-					this.extractEvaluationResult(testCaseExecution, workflow),
+				// Collect common metrics
+				const { addedMetrics: addedPredefinedMetrics } = metrics.addResults(
+					this.extractPredefinedMetrics(testCaseExecution),
+				);
+				this.logger.debug('Test case common metrics extracted', addedPredefinedMetrics);
+
+				// Collect user-defined metrics
+				const { addedMetrics: addedUserDefinedMetrics } = metrics.addResults(
+					this.extractUserDefinedMetrics(testCaseExecution, workflow),
 				);

-				if (Object.keys(addedMetrics).length === 0) {
+				if (Object.keys(addedUserDefinedMetrics).length === 0) {
 					await this.testCaseExecutionRepository.createTestCaseExecution({
 						executionId: testCaseExecutionId,
 						testRun: {
@@ -528,7 +555,16 @@
 					});
 					telemetryMeta.errored_test_case_count++;
 				} else {
-					this.logger.debug('Test case metrics extracted', addedMetrics);
+					const combinedMetrics = {
+						...addedUserDefinedMetrics,
+						...addedPredefinedMetrics,
+					};
+
+					this.logger.debug(
+						'Test case metrics extracted (user-defined)',
+						addedUserDefinedMetrics,
+					);
+
 					// Create a new test case execution in DB
 					await this.testCaseExecutionRepository.createTestCaseExecution({
 						executionId: testCaseExecutionId,
@@ -538,7 +574,7 @@
 						runAt,
 						completedAt,
 						status: 'success',
-						metrics: addedMetrics,
+						metrics: combinedMetrics,
 					});
 				}
 			} catch (e) {
diff --git a/packages/cli/src/evaluation.ee/test-runner/utils.ee.ts b/packages/cli/src/evaluation.ee/test-runner/utils.ee.ts
index 36f30762a4..8f94f47ba9 100644
--- a/packages/cli/src/evaluation.ee/test-runner/utils.ee.ts
+++ b/packages/cli/src/evaluation.ee/test-runner/utils.ee.ts
@@ -1,4 +1,18 @@
-import type { NodeParameterValueType, INodeParameterResourceLocator } from 'n8n-workflow';
+import type {
+	NodeParameterValueType,
+	INodeParameterResourceLocator,
+	IRunData,
+	INodeExecutionData,
+} from 'n8n-workflow';
+import { NodeConnectionTypes } from 'n8n-workflow';
+
+type TokenUsageValues = {
+	completionTokens: number;
+	promptTokens: number;
+	totalTokens: number;
+};
+
+type TokenUsageInfo = Record<`${string}__${number}` | 'total', TokenUsageValues>;

 function isRlcValue(value: NodeParameterValueType): value is INodeParameterResourceLocator {
 	return Boolean(
@@ -17,3 +31,55 @@

 	return true;
 }
+
+export function extractTokenUsage(executionRunData: IRunData) {
+	const result: TokenUsageInfo = {
+		total: {
+			completionTokens: 0,
+			promptTokens: 0,
+			totalTokens: 0,
+		},
+	};
+
+	const extractFromNode = (nodeName: string, nodeData: INodeExecutionData, index: number) => {
+		function isValidTokenInfo(data: unknown): data is TokenUsageValues {
+			return (
+				typeof data === 'object' &&
+				data !== null &&
+				'completionTokens' in data &&
+				'promptTokens' in data &&
+				'totalTokens' in data &&
+				typeof data.completionTokens === 'number' &&
+				typeof data.promptTokens === 'number' &&
+				typeof data.totalTokens === 'number'
+			);
+		}
+
+		const tokenInfo = nodeData.json?.tokenUsage ?? nodeData.json?.tokenUsageEstimate;
+
+		if (tokenInfo && isValidTokenInfo(tokenInfo)) {
+			result[`${nodeName}__${index}`] = {
+				completionTokens: tokenInfo.completionTokens,
+				promptTokens: tokenInfo.promptTokens,
+				totalTokens: tokenInfo.totalTokens,
+			};
+
+			result.total.completionTokens += tokenInfo.completionTokens;
+			result.total.promptTokens += tokenInfo.promptTokens;
+			result.total.totalTokens += tokenInfo.totalTokens;
+		}
+	};
+
+	for (const [nodeName, nodeData] of Object.entries(executionRunData)) {
+		if (nodeData[0]?.data?.[NodeConnectionTypes.AiLanguageModel]) {
+			for (const [index, node] of nodeData.entries()) {
+				const modelNodeExecutionData = node.data?.[NodeConnectionTypes.AiLanguageModel]?.[0]?.[0];
+				if (modelNodeExecutionData) {
+					extractFromNode(nodeName, modelNodeExecutionData, index);
+				}
+			}
+		}
+	}
+
+	return result;
+}
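
For reviewers, here is a minimal illustrative sketch (not part of the patch) of what the new extractTokenUsage helper returns, assuming the n8n-workflow types and the utils.ee module introduced in the diff above: besides the aggregated total, the result is keyed per model node and run index as `${nodeName}__${index}`.

// Illustrative usage only; not part of the diff above. Assumes the imports resolve as in the patch.
import { NodeConnectionTypes } from 'n8n-workflow';
import type { IRunData } from 'n8n-workflow';

import { extractTokenUsage } from './utils.ee';

const runData = {
	'OpenAI Chat Model': [
		{
			data: {
				[NodeConnectionTypes.AiLanguageModel]: [
					[{ json: { tokenUsage: { completionTokens: 16, promptTokens: 69, totalTokens: 85 } } }],
				],
			},
		},
	],
} as unknown as IRunData; // cast: only the fields read by extractTokenUsage are provided here

const usage = extractTokenUsage(runData);
// usage['OpenAI Chat Model__0'] -> { completionTokens: 16, promptTokens: 69, totalTokens: 85 }
// usage.total                   -> { completionTokens: 16, promptTokens: 69, totalTokens: 85 }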