n8n-enterprise-unlocked/packages/@n8n/ai-workflow-builder.ee/evaluations/langsmith/runner.ts

import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { LangChainTracer } from '@langchain/core/tracers/tracer_langchain';
import { evaluate } from 'langsmith/evaluation';
import type { INodeTypeDescription } from 'n8n-workflow';
import pc from 'picocolors';

import { createLangsmithEvaluator } from './evaluator';
import { PLAN_APPROVAL_MESSAGE } from '../../src/constants';
import type { WorkflowState } from '../../src/workflow-state';
import { setupTestEnvironment, createAgent } from '../core/environment';
import {
	generateRunId,
	safeExtractUsage,
	isWorkflowStateValues,
	extractMessageContent,
} from '../types/langsmith';
import { consumeGenerator, formatHeader, getChatPayload } from '../utils/evaluation-helpers';

/**
 * Creates a workflow generation function for Langsmith evaluation.
 * @param parsedNodeTypes - Parsed node type descriptions available to the builder agent
 * @param llm - Language model that drives plan and workflow generation
 * @param tracer - Optional LangChain tracer for capturing run traces
 * @returns Async function that generates a workflow from a dataset example's inputs
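 *
 * @example
 * // A minimal usage sketch (assumes HumanMessage from '@langchain/core/messages';
 * // the partial state cast is for illustration only):
 * const { parsedNodeTypes, llm, tracer } = await setupTestEnvironment();
 * const generate = createWorkflowGenerator(parsedNodeTypes, llm, tracer);
 * const { workflow, usage } = await generate({
 *   messages: [new HumanMessage('Create a workflow that posts new RSS items to Slack')],
 * } as typeof WorkflowState.State);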
*/
function createWorkflowGenerator(
	parsedNodeTypes: INodeTypeDescription[],
	llm: BaseChatModel,
	tracer?: LangChainTracer,
) {
	return async (inputs: typeof WorkflowState.State) => {
		// Generate a unique ID for this evaluation run
		const runId = generateRunId();

		// Validate inputs
		if (!inputs.messages || !Array.isArray(inputs.messages) || inputs.messages.length === 0) {
			throw new Error('No messages provided in inputs');
		}
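
		// Shape of a dataset example this generator expects (an assumption based on
		// the validation above and on extractMessageContent):
		//   inputs.messages = [{ content: 'Create a workflow that ...' }]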

		// Extract first message content safely
		const firstMessage = inputs.messages[0];
		const messageContent = extractMessageContent(firstMessage);

		// Create agent for this run
		const agent = createAgent(parsedNodeTypes, llm, tracer);

		// First, ask the agent to generate a plan
		await consumeGenerator(
			agent.chat(getChatPayload(messageContent, runId), 'langsmith-eval-user'),
		);

		// Then approve the plan so the agent proceeds to build the workflow
		await consumeGenerator(
			agent.chat(getChatPayload(PLAN_APPROVAL_MESSAGE, runId), 'langsmith-eval-user'),
		);

		// Get generated workflow with validation
		const state = await agent.getState(runId, 'langsmith-eval-user');

		// Validate state
		if (!state.values) {
			throw new Error('No values in agent state');
		}
		if (!isWorkflowStateValues(state.values)) {
			throw new Error('Invalid workflow state: workflow or messages missing');
		}

		const generatedWorkflow = state.values.workflowJSON;
		const messages = state.values.messages;

		// Extract usage metadata safely
		const usage = safeExtractUsage(messages);

		return {
			workflow: generatedWorkflow,
			prompt: messageContent,
			usage,
		};
	};
}

/**
 * Runs the AI Workflow Builder evaluation against a Langsmith dataset and records
 * the results as a Langsmith experiment. Exits the process on configuration errors.
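 *
 * @example
 * // A minimal sketch; requires LANGSMITH_API_KEY, and optionally
 * // LANGSMITH_DATASET_NAME (defaults to 'workflow-builder-canvas-prompts'):
 * process.env.LANGSMITH_DATASET_NAME = 'workflow-builder-canvas-prompts';
 * await runLangsmithEvaluation();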
*/
export async function runLangsmithEvaluation(): Promise<void> {
	console.log(formatHeader('AI Workflow Builder Langsmith Evaluation', 70));
	console.log();

	// Check for Langsmith API key
	if (!process.env.LANGSMITH_API_KEY) {
		console.error(pc.red('✗ LANGSMITH_API_KEY environment variable not set'));
		process.exit(1);
	}

	try {
		// Setup test environment
		const { parsedNodeTypes, llm, tracer, lsClient } = await setupTestEnvironment();
		if (!lsClient) {
			throw new Error('Langsmith client not initialized');
		}

		// Get dataset name from env or use default
		const datasetName = process.env.LANGSMITH_DATASET_NAME ?? 'workflow-builder-canvas-prompts';
		console.log(pc.blue(`➔ Using dataset: ${datasetName}`));

		// Verify the dataset exists; if not, list the available datasets and exit
		try {
			await lsClient.readDataset({ datasetName });
		} catch {
			console.error(pc.red(`✗ Dataset "${datasetName}" not found`));
			console.log('\nAvailable datasets:');
			for await (const dataset of lsClient.listDatasets()) {
				console.log(pc.dim(`  - ${dataset.name} (${dataset.id})`));
			}
			console.log(
				'\nTo use a different dataset, set the LANGSMITH_DATASET_NAME environment variable',
			);
			process.exit(1);
		}
		console.log();

		const startTime = Date.now();

		// Create workflow generation function
		const generateWorkflow = createWorkflowGenerator(parsedNodeTypes, llm, tracer);

		// Create LLM-based evaluator
		const evaluator = createLangsmithEvaluator(llm);

		// Run Langsmith evaluation
		const results = await evaluate(generateWorkflow, {
			data: datasetName,
			evaluators: [evaluator],
			maxConcurrency: 7,
			experimentPrefix: 'workflow-builder-evaluation',
			metadata: {
				evaluationType: 'llm-based',
				modelName: process.env.LLM_MODEL ?? 'default',
			},
		});

		const totalTime = Date.now() - startTime;
		console.log(pc.green(`✓ Evaluation completed in ${(totalTime / 1000).toFixed(1)}s`));

		// Langsmith derives the final experiment name from the configured prefix,
		// so report the name it actually assigned rather than reconstructing it
		console.log('\nView detailed results in the Langsmith dashboard');
		console.log(`Experiment name: ${results.experimentName}`);

		// Log run summary
		console.log(pc.dim('Evaluation run completed successfully'));
		console.log(pc.dim(`Dataset: ${datasetName}`));
	} catch (error) {
		console.error(pc.red('✗ Langsmith evaluation failed:'), error);
		process.exit(1);
	}
}
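
// A minimal entry-point sketch (hypothetical; how this runner is actually invoked
// depends on the package's evaluation scripts):
//
//   void runLangsmithEvaluation();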