feat(core): Add number of tokens and latency automatically as metrics in evaluation (#16243)

Eugene
2025-06-13 09:24:12 +02:00
committed by GitHub
parent bcf1a7108b
commit eff0571f42
3 changed files with 400 additions and 8 deletions

View File — new test file for extractTokenUsage (imports from '../utils.ee')

@@ -0,0 +1,290 @@
import { mock } from 'jest-mock-extended';
import { NodeConnectionTypes } from 'n8n-workflow';
import type { IRunData } from 'n8n-workflow';
import { extractTokenUsage } from '../utils.ee';
type TokenUsage = {
completionTokens: number;
promptTokens: number;
totalTokens: number;
};
type JsonData = {
tokenUsage?: TokenUsage | null;
tokenUsageEstimate?: TokenUsage | null;
[key: string]: unknown;
};
function createRunDataMock(nodeConfigs: Record<string, JsonData[]>) {
const runData: Record<string, unknown[]> = {};
for (const [nodeName, jsonValues] of Object.entries(nodeConfigs)) {
if (
jsonValues.some(
(json) => json.tokenUsage !== undefined || json.tokenUsageEstimate !== undefined,
)
) {
// AI node with language model data
runData[nodeName] = jsonValues.map((json) => ({
data: {
[NodeConnectionTypes.AiLanguageModel]: [[{ json }]],
},
}));
} else {
// Non-AI node with main data
runData[nodeName] = jsonValues.map((json) => ({
data: {
main: [[{ json }]],
},
}));
}
}
return mock<IRunData>(runData as any);
}
describe('extractTokenUsage', () => {
describe('Basic Token Extraction', () => {
it('extracts token usage from single AI node', () => {
const runData = createRunDataMock({
'OpenAI Chat Model': [
{
tokenUsage: {
completionTokens: 16,
promptTokens: 69,
totalTokens: 85,
},
},
],
});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 16,
promptTokens: 69,
totalTokens: 85,
});
});
it('extracts token usage from multiple AI node executions', () => {
const runData = createRunDataMock({
'OpenAI Chat Model': [
{
tokenUsage: {
completionTokens: 16,
promptTokens: 69,
totalTokens: 85,
},
},
{
tokenUsage: {
completionTokens: 46,
promptTokens: 138,
totalTokens: 184,
},
},
],
});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 62,
promptTokens: 207,
totalTokens: 269,
});
});
it('extracts token usage from mixed AI nodes', () => {
const runData = createRunDataMock({
'OpenAI Chat Model': [
{
tokenUsage: {
completionTokens: 16,
promptTokens: 69,
totalTokens: 85,
},
},
],
'Claude Model': [
{
tokenUsage: {
completionTokens: 25,
promptTokens: 50,
totalTokens: 75,
},
},
],
});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 41,
promptTokens: 119,
totalTokens: 160,
});
});
});
describe('Token Usage Sources', () => {
it('extracts from tokenUsageEstimate property', () => {
const runData = createRunDataMock({
'OpenAI Chat Model': [
{
tokenUsage: undefined,
tokenUsageEstimate: {
completionTokens: 20,
promptTokens: 80,
totalTokens: 100,
},
},
],
});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 20,
promptTokens: 80,
totalTokens: 100,
});
});
it('prioritizes tokenUsage over tokenUsageEstimate', () => {
const runData = createRunDataMock({
'OpenAI Chat Model': [
{
tokenUsage: {
completionTokens: 16,
promptTokens: 69,
totalTokens: 85,
},
tokenUsageEstimate: {
completionTokens: 20,
promptTokens: 80,
totalTokens: 100,
},
},
],
});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 16,
promptTokens: 69,
totalTokens: 85,
});
});
});
describe('Null/Undefined Token Data', () => {
it('handles missing token usage data', () => {
const runData = createRunDataMock({
'OpenAI Chat Model': [{}],
});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 0,
promptTokens: 0,
totalTokens: 0,
});
});
it('handles null token usage data', () => {
const runData = createRunDataMock({
'OpenAI Chat Model': [
{
tokenUsage: null,
},
],
});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 0,
promptTokens: 0,
totalTokens: 0,
});
});
});
describe('Edge cases', () => {
it('handles empty AI node data', () => {
const runData = mock<IRunData>({
'OpenAI Chat Model': [
{
data: {
[NodeConnectionTypes.AiLanguageModel]: [],
},
},
],
});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 0,
promptTokens: 0,
totalTokens: 0,
});
});
it('handles missing AI node execution data', () => {
const runData = mock<IRunData>({
'OpenAI Chat Model': [
{
data: {
[NodeConnectionTypes.AiLanguageModel]: [[]],
},
},
],
});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 0,
promptTokens: 0,
totalTokens: 0,
});
});
it('handles empty execution data', () => {
const runData = createRunDataMock({});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 0,
promptTokens: 0,
totalTokens: 0,
});
});
it('handles execution with no AI nodes', () => {
const runData = createRunDataMock({
'When clicking Execute workflow': [
{
text: 'Say HEY',
code: 1,
},
],
});
const result = extractTokenUsage(runData);
expect(result.total).toEqual({
completionTokens: 0,
promptTokens: 0,
totalTokens: 0,
});
});
});
});
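The cases above assert only on result.total; per the implementation in utils.ee (last file in this commit), extractTokenUsage also records one entry per model-node run under a `${nodeName}__${index}` key. A hypothetical extra check, not part of this commit, would look like:

// Illustrative only: per-node-run token usage is exposed alongside the total.
it('reports per-node-run token usage under `${nodeName}__${index}` keys', () => {
  const runData = createRunDataMock({
    'OpenAI Chat Model': [
      { tokenUsage: { completionTokens: 16, promptTokens: 69, totalTokens: 85 } },
    ],
  });
  const result = extractTokenUsage(runData);
  // The first (index 0) run of the 'OpenAI Chat Model' node gets its own entry.
  expect(result['OpenAI Chat Model__0']).toEqual({
    completionTokens: 16,
    promptTokens: 69,
    totalTokens: 85,
  });
});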

View File — TestRunnerService (evaluation test runner)

@@ -23,7 +23,10 @@ import assert from 'node:assert';
import { ActiveExecutions } from '@/active-executions';
import config from '@/config';
import { TestCaseExecutionError, TestRunError } from '@/evaluation.ee/test-runner/errors.ee';
-import { checkNodeParameterNotEmpty } from '@/evaluation.ee/test-runner/utils.ee';
import {
checkNodeParameterNotEmpty,
extractTokenUsage,
} from '@/evaluation.ee/test-runner/utils.ee';
import { Telemetry } from '@/telemetry';
import { WorkflowRunner } from '@/workflow-runner';
@@ -367,7 +370,7 @@ export class TestRunnerService {
/**
* Evaluation result is collected from all Evaluation Metrics nodes
*/
-private extractEvaluationResult(execution: IRun, workflow: IWorkflowBase): IDataObject {
private extractUserDefinedMetrics(execution: IRun, workflow: IWorkflowBase): IDataObject {
const metricsNodes = TestRunnerService.getEvaluationMetricsNodes(workflow);
// If a metrics node did not execute, ignore it.
@@ -381,6 +384,23 @@ export class TestRunnerService {
return metricsResult;
}
/**
* Extracts predefined metrics from the execution data.
* Currently, it extracts token usage and execution time.
*/
private extractPredefinedMetrics(execution: IRun) {
const metricValues: Record<string, number> = {};
const tokenUsageMetrics = extractTokenUsage(execution.data.resultData.runData);
Object.assign(metricValues, tokenUsageMetrics.total);
if (execution.startedAt && execution.stoppedAt) {
metricValues.executionTime = execution.stoppedAt.getTime() - execution.startedAt.getTime();
}
return metricValues;
}
/**
* Creates a new test run for the given workflow
*/
@@ -511,11 +531,18 @@ export class TestRunnerService {
}
const completedAt = new Date();
-const { addedMetrics } = metrics.addResults(
-this.extractEvaluationResult(testCaseExecution, workflow),
// Collect predefined metrics (token usage, execution time)
const { addedMetrics: addedPredefinedMetrics } = metrics.addResults(
this.extractPredefinedMetrics(testCaseExecution),
);
this.logger.debug('Test case common metrics extracted', addedPredefinedMetrics);
// Collect user-defined metrics
const { addedMetrics: addedUserDefinedMetrics } = metrics.addResults(
this.extractUserDefinedMetrics(testCaseExecution, workflow),
);
-if (Object.keys(addedMetrics).length === 0) {
if (Object.keys(addedUserDefinedMetrics).length === 0) {
await this.testCaseExecutionRepository.createTestCaseExecution({
executionId: testCaseExecutionId,
testRun: {
@@ -528,7 +555,16 @@ export class TestRunnerService {
});
telemetryMeta.errored_test_case_count++;
} else {
-this.logger.debug('Test case metrics extracted', addedMetrics);
const combinedMetrics = {
...addedUserDefinedMetrics,
...addedPredefinedMetrics,
};
this.logger.debug(
'Test case metrics extracted (user-defined)',
addedUserDefinedMetrics,
);
// Create a new test case execution in DB
await this.testCaseExecutionRepository.createTestCaseExecution({
executionId: testCaseExecutionId,
@@ -538,7 +574,7 @@ export class TestRunnerService {
runAt,
completedAt,
status: 'success',
-metrics: addedMetrics,
metrics: combinedMetrics,
});
}
} catch (e) {
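Taken together, and assuming metrics.addResults passes the values through unchanged, the metrics recorded for a successful test case merge the user-defined results with the predefined ones. A rough sketch with illustrative numbers (factualness stands in for an arbitrary user-defined metric):

// Illustrative shape of combinedMetrics for one successful test case.
const combinedMetrics: Record<string, number> = {
  factualness: 0.92, // hypothetical user-defined metric from an Evaluation Metrics node
  completionTokens: 62, // summed across all model-node runs (extractTokenUsage total)
  promptTokens: 207,
  totalTokens: 269,
  executionTime: 1534, // stoppedAt - startedAt, in milliseconds
};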

View File — evaluation.ee/test-runner/utils.ee

@@ -1,4 +1,18 @@
-import type { NodeParameterValueType, INodeParameterResourceLocator } from 'n8n-workflow';
import type {
NodeParameterValueType,
INodeParameterResourceLocator,
IRunData,
INodeExecutionData,
} from 'n8n-workflow';
import { NodeConnectionTypes } from 'n8n-workflow';
type TokenUsageValues = {
completionTokens: number;
promptTokens: number;
totalTokens: number;
};
type TokenUsageInfo = Record<`${string}__${number}` | 'total', TokenUsageValues>;
function isRlcValue(value: NodeParameterValueType): value is INodeParameterResourceLocator {
return Boolean(
@@ -17,3 +31,55 @@ export function checkNodeParameterNotEmpty(value: NodeParameterValueType) {
return true;
}
export function extractTokenUsage(executionRunData: IRunData) {
const result: TokenUsageInfo = {
total: {
completionTokens: 0,
promptTokens: 0,
totalTokens: 0,
},
};
const extractFromNode = (nodeName: string, nodeData: INodeExecutionData, index: number) => {
function isValidTokenInfo(data: unknown): data is TokenUsageValues {
return (
typeof data === 'object' &&
data !== null &&
'completionTokens' in data &&
'promptTokens' in data &&
'totalTokens' in data &&
typeof data.completionTokens === 'number' &&
typeof data.promptTokens === 'number' &&
typeof data.totalTokens === 'number'
);
}
const tokenInfo = nodeData.json?.tokenUsage ?? nodeData.json?.tokenUsageEstimate;
if (tokenInfo && isValidTokenInfo(tokenInfo)) {
result[`${nodeName}__${index}`] = {
completionTokens: tokenInfo.completionTokens,
promptTokens: tokenInfo.promptTokens,
totalTokens: tokenInfo.totalTokens,
};
result.total.completionTokens += tokenInfo.completionTokens;
result.total.promptTokens += tokenInfo.promptTokens;
result.total.totalTokens += tokenInfo.totalTokens;
}
};
for (const [nodeName, nodeData] of Object.entries(executionRunData)) {
if (nodeData[0]?.data?.[NodeConnectionTypes.AiLanguageModel]) {
for (const [index, node] of nodeData.entries()) {
const modelNodeExecutionData = node.data?.[NodeConnectionTypes.AiLanguageModel]?.[0]?.[0];
if (modelNodeExecutionData) {
extractFromNode(nodeName, modelNodeExecutionData, index);
}
}
}
}
return result;
}
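As a standalone usage sketch (hand-built run data, cast loosely for illustration), extracting token usage from a single model-node run works like this:

import type { IRunData } from 'n8n-workflow';
import { NodeConnectionTypes } from 'n8n-workflow';
import { extractTokenUsage } from '@/evaluation.ee/test-runner/utils.ee';

// One AI model node with a single run reporting token usage.
const runData = {
  'OpenAI Chat Model': [
    {
      data: {
        [NodeConnectionTypes.AiLanguageModel]: [
          [{ json: { tokenUsage: { completionTokens: 16, promptTokens: 69, totalTokens: 85 } } }],
        ],
      },
    },
  ],
} as unknown as IRunData;

const usage = extractTokenUsage(runData);
// usage.total                   -> { completionTokens: 16, promptTokens: 69, totalTokens: 85 }
// usage['OpenAI Chat Model__0'] -> the same values, attributed to run index 0 of that node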