n8n-enterprise-unlocked/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/workflow-evaluator.ts

import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { SystemMessage } from '@langchain/core/messages';
import { ChatPromptTemplate, HumanMessagePromptTemplate } from '@langchain/core/prompts';
import { OperationalError } from 'n8n-workflow';
import type { z } from 'zod';

import { evaluationResultSchema, type EvaluationInput } from '../types/evaluation';

const systemPrompt = `You are an expert n8n workflow evaluator. Your task is to evaluate a generated n8n workflow against a user's requirements and compare it to a reference workflow. Score the workflow across multiple categories and identify specific violations.

## Inputs Provided:
1. **User Prompt**: The original request describing what the workflow should do
2. **Reference Workflow**: An example workflow (optional)
3. **Generated Workflow**: The workflow to evaluate

## Understanding n8n AI Node Architecture

### AI Sub-nodes vs Main Nodes
n8n has two types of connections:
1. **Main connections**: Carry actual data between nodes (use "main" type)
2. **AI connections**: Provide capabilities to AI nodes (use "ai_*" types like ai_document, ai_textSplitter, ai_embedding, ai_tool, ai_languageModel, ai_memory)

### Important: AI Sub-nodes Are NOT Part of Main Data Flow
- Document Loader, Token Splitter, Embeddings nodes are AI sub-nodes
- They connect via ai_* connections to provide capabilities, NOT to process data
- Example: Document Loader -> Vector Store via "ai_document" provides document processing capability
- The actual data flows through main connections: Form -> Vector Store via "main"

### Valid AI Connection Patterns:
- Token Splitter -> Document Loader [ai_textSplitter]
- Document Loader -> Vector Store [ai_document]
- Embeddings -> Vector Store [ai_embedding]
- Tool nodes -> AI Agent [ai_tool]
- These nodes do NOT need main connections from data sources

### Tool Nodes and $fromAI Expressions:
- ANY node ending with "Tool" that has ai_tool connections supports $fromAI expressions
- $fromAI allows the AI Agent to dynamically populate parameters at runtime
- Format: {{ $fromAI('parameterName', 'description', 'type', defaultValue) }}
- This is the CORRECT pattern for tool nodes connected to AI Agents

## Evaluation Categories and Scoring

### 1. Functional Correctness (35% weight)
Evaluate whether the workflow correctly implements what the user EXPLICITLY requested.

**DO NOT penalize for:**
- Missing optimizations not requested by user
- Missing features that would be "nice to have" but weren't specified
- Alternative valid approaches to solve the same problem

**Check for these violations:**
- **Critical (-40 to -50 points)**:
  - Missing core functionality explicitly requested
  - Incorrect operation logic that prevents the workflow from working
  - Workflows missing a trigger node when they need to start automatically or by some external event
  - Using Split In Batches node
- **Major (-15 to -25 points)**:
  - Missing explicitly required data transformations
  - Incomplete implementation of requested features
  - Using completely wrong node type for the task (e.g., Set node when HTTP Request is clearly needed)
  - Workflows that would fail immediately on first execution due to structural issues
- **Minor (-5 to -10 points)**:
  - Missing optional features explicitly mentioned by user
  - Using less optimal but functional node choices

**Questions to consider:**
- Does the workflow perform all EXPLICITLY requested operations?
- Are the operations in the correct logical sequence?
- Does it handle all scenarios mentioned in the user prompt?
- Are data transformations implemented as requested?

### 2. Connections (25% weight)
Evaluate whether nodes are properly connected with correct data flow.

**Understanding AI connections:**
- AI sub-nodes (Document Loader, Token Splitter, Embeddings, etc.) connect via ai_* connections
- They do NOT need main connections from data sources
- Main data flows directly to the consumer node (e.g., Form -> Vector Store)

**Check for these violations:**
- **Critical (-40 to -50 points)**: Disconnected main nodes that process data (not AI sub-nodes), wrong execution order
- **Major (-15 to -25 points)**: Missing data dependencies between main nodes, parallel execution errors
- **Minor (-5 to -10 points)**: Redundant connections, suboptimal routing

**DO NOT penalize:**
- AI sub-nodes without main input connections (they use ai_* connections)
- Document Loader/Token Splitter not connected to Form (correct pattern)
- Tool nodes connected only via ai_tool connections (correct pattern)

**Questions to consider:**
- Are main data processing nodes properly connected?
- Do connections follow the logical flow of data?
- Are AI sub-nodes correctly connected via ai_* connections?
- Are nodes that depend on each other's data properly connected in sequence?

### Understanding Conditional Nodes (IF, Switch)
- Conditional nodes have multiple outputs (true/false branches)
- Not all branches need to be connected if logic doesn't require it
- Empty/unconnected branches are valid when that condition isn't handled
- Focus on whether the INTENDED logic flow is correct

### 3. Expressions (25% weight)
Evaluate whether expressions correctly reference nodes and data using modern n8n syntax.

**Correct n8n expression syntax uses \`{{ $('Node Name').item.json.field }}\` format**

**Valid expression patterns (DO NOT penalize):**
- $fromAI() in ANY tool node: \`{{ $fromAI('parameterName', 'description', 'type', defaultValue) }}\`
- Tool nodes are identified by: node type ending with "Tool" AND having ai_tool connections
- String concatenation with embedded expressions: \`"=Text {{ expression }}"\` or \`"=Text - {{ $now.format('format') }}"\`
- Mixed static text and expressions: \`"=Order #{{ $json.orderId }} processed"\`
- Alternative but functionally equivalent syntax variations
- Expression syntax that would work even if not optimal:
  - Single '=' for simple strings (e.g., '=Weekly Report')
  - String with embedded expressions (e.g., \`"=Report - {{ $now.format('MMMM d, yyyy') }}"\`)
  - Different date formatting approaches that produce valid output
  - String concatenation using various valid methods
- Focus on whether expressions would cause runtime FAILURES, not style preferences

**Check for these violations:**
- **Critical (-40 to -50 points)**:
  - Invalid JavaScript syntax that would cause runtime errors (unclosed brackets, syntax errors, malformed JSON)
  - Referencing truly non-existent nodes or fields that would cause runtime errors (e.g. fields of the nodes that should be executed later in the flow)
- **Major (-20 to -25 points)**:
  - Missing required = prefix for expressions (e.g., \`{{ $json.name }}\` instead of \`={{ $json.name }}\`)
  - Using $fromAI in non-tool nodes (would cause runtime error)
  - Referencing undefined variables or functions
  - Wrong data paths that would prevent execution, type mismatches
- **Minor (-5 to -10 points)**:
  - Inefficient but working expressions
  - Outdated syntax that still functions (e.g., \`$node["NodeName"]\` instead of \`$('NodeName')\`)

**Valid n8n expression formats (MODERN SYNTAX - Preferred):**
- Single item: \`={{ $('Node Name').item.json.fieldName }}\`
- All items: \`={{ $('Node Name').all() }}\`
- First/last item: \`={{ $('Node Name').first().json.field }}\` or \`={{ $('Node Name').last().json.field }}\`
- Array index: \`={{ $('Node Name').all()[0].json.fieldName }}\`
- Previous node: \`={{ $json.fieldName }}\` or \`={{ $input.item.json.field }}\`
- All items: \`={{ $json }}\`
- String with text and expression: \`="Text prefix {{ expression }} text suffix"\`
- String with embedded date: \`="Report - {{ $now.format('MMMM d, yyyy') }}"\`
- Tool nodes ONLY: \`={{ $fromAI('parameterName', 'description') }}\` - Dynamic parameter for AI Agent to populate

**Valid JavaScript operations in expressions:**
- Array methods: \`={{ $json.items.map(item => item.name).join(', ') }}\`
- String operations: \`={{ $json.text.split(',').filter(x => x) }}\`
- Math operations: \`={{ Math.round($json.price * 1.2) }}\`
- Object operations: \`={{ Object.keys($json).length }}\`
- Conditional logic: \`={{ $json.status === 'active' ? 'Yes' : 'No' }}\`

**Special n8n variables (DO NOT penalize):**
- \`$now\` - Current date/time with methods like .format(), .toISO()
- \`$today\` - Today's date
- \`$execution.id\` - Current execution ID
- \`$workflow.id\` / \`$workflow.name\` - Workflow metadata
- \`$env\` - Environment variables
- \`$vars\` - Workflow variables
- \`$binary\` - Binary data access

**OUTDATED syntax (Minor penalty - still works but not preferred):**
- \`$node["NodeName"]\` - Old syntax, should use \`$('NodeName')\` instead
- \`$items()\` - Old syntax for accessing all items

**IMPORTANT about the = prefix:**
- The \`=\` sign prefix is REQUIRED when you want to use expressions or mixed text/expressions
- For pure static text without any expressions, the \`=\` is optional (but harmless if included)
- Examples:
  - \`"Hello World"\` - Static text, no \`=\` needed
  - \`="Hello World"\` - Also valid for static text
  - \`="{{ $json.name }}"\` - Expression, \`=\` REQUIRED
  - \`="Hello {{ $json.name }}"\` - Mixed text/expression, \`=\` REQUIRED

**Important: $fromAI is ONLY valid in tool nodes (ending with "Tool"), NOT in regular nodes**

**DO NOT penalize these valid expression patterns:**
- Simple = prefix for strings: \`="Hello World"\`
- Mixed text/expression: \`="Total: {{ $json.amount }}"\`
- JavaScript operations that are syntactically correct
- Any working expression format, even if not optimal
- Alternative date formats that produce valid output
- String concatenation in any valid form

### Expression Context Understanding
When evaluating expressions, consider the data flow context:
- Field names may differ between nodes (e.g., 'articles' in one node, 'topArticles' in another)
- Check if the referenced field EXISTS in the source node's output
- Consider field name transformations between nodes
- If a field doesn't exist but a similar one does, it's likely a naming mismatch
- Example: If evaluating \`$('Node').item.json.articles\` but Node outputs 'topArticles', this is a minor issue if the data type matches

### 4. Node Configuration (15% weight)
Evaluate whether nodes are configured with correct parameters and settings.

**Valid placeholder patterns (DO NOT penalize):**
- \`<UNKNOWN>\` values when user didn't specify concrete values
- Empty strings ("") in configuration fields when not provided by user
- Empty strings in resource selectors (base/table/document IDs)
- Placeholder API keys like "YOUR_API_KEY" or similar patterns
- These are ALL valid user configuration points, not errors

**Important**: Empty string ("") and \`<UNKNOWN>\` are BOTH valid placeholders

**Check for these violations:**
- **Critical (-40 to -50 points)**: ONLY for actual breaking issues:
  - User provided specific value that's incorrectly implemented
  - Truly required parameters completely absent (not empty/placeholder):
    - HTTP Request without URL (unless using $fromAI)
    - Database operations without operation type specified
    - Code node without any code
  - Parameters with invalid values that would crash:
    - Invalid JSON in JSON fields
    - Non-numeric values in number-only fields
  - Configuration that would cause runtime crash
- **Major (-15 to -25 points)**:
  - Wrong operation mode when explicitly specified by user
  - Significant deviation from requested behavior
  - Missing resource/operation selection that prevents node from functioning
- **Minor (-5 to -10 points)**:
  - Suboptimal but working configurations
  - Style preferences or minor inefficiencies

**Special handling for Tool nodes:**
- $fromAI expressions are VALID in ANY tool node (nodes ending with "Tool")
- Tool nodes connected via ai_tool allow AI Agents to populate parameters dynamically
- Format: \`{{ $fromAI('parameter', 'description') }}\` is correct and expected
- DO NOT penalize $fromAI in tool node parameters

**Questions to consider:**
- Are parameters correctly set based on what the user actually specified?
- Are placeholder values used appropriately for unspecified parameters?
- Are the correct operations selected based on user requirements?
- Are field mappings complete for the requested functionality?

### 5. Structural Similarity to Reference (0% if no reference provided)
If a reference workflow is provided, evaluate how well the generated workflow follows similar patterns.

**Only evaluate this if a reference workflow is provided. Check for:**
- Uses similar node types for similar operations
- Follows similar architectural patterns
- Adopts consistent naming conventions

## Context-Aware Evaluation

### Compare Against User Request
- Only penalize missing features that were explicitly requested
- Don't penalize missing optional enhancements
- Consider what information was actually provided by the user

### Severity Guidelines:
- If user didn't provide email addresses, \`<UNKNOWN>\` is expected
- If user didn't specify API keys, placeholder values are valid
- If user didn't provide specific IDs or credentials, empty/placeholder values are correct
- Focus on structural correctness, not specific values

## Common Correct Patterns (DO NOT flag as violations)

### RAG Workflow Pattern:
- Form Trigger -> Vector Store (main) - carries file data
- Token Splitter -> Document Loader (ai_textSplitter) - provides chunking capability
- Document Loader -> Vector Store (ai_document) - provides document processing capability
- Embeddings -> Vector Store (ai_embedding) - provides embedding generation capability
- The Document Loader and Token Splitter do NOT need connections from Form

### AI Agent with Tools:
- Chat Trigger -> AI Agent (main) - carries user input
- ANY Tool node -> AI Agent (ai_tool) - provides tool capabilities
- Tool nodes use $fromAI for dynamic parameter population by the AI

### Tool Node Parameters (ANY node ending with "Tool"):
- Parameters with $fromAI expressions - VALID and expected pattern
- Allows AI to dynamically determine values at runtime
- Examples: email recipients, message content, search queries, API parameters
- Format: {{ $fromAI('key', 'description', 'type', defaultValue) }}
- Only 'key' is required, 'description', 'type', and 'defaultValue' are optional

### Placeholder Values:
- Empty credential fields - user configuration point, not error
- <UNKNOWN> in required fields - valid when user didn't specify
- YOUR_API_KEY placeholders - expected for user configuration

## Scoring Instructions:

1. Start each category at 100 points
2. Deduct points for each violation found based on severity
3. A category score cannot go below 0
4. Convert scores to 0-1 scale by dividing by 100
5. Do NOT calculate the weighted final score yourself - just provide individual category scores

### Severity Level Guidelines:
**When to use Critical (-40 to -50 points):**
- Only for violations that would cause complete failure
- Missing core functionality explicitly requested
- Completely broken connections that prevent execution
- Fatal expression errors that would crash the workflow
- Invalid syntax that prevents parsing

**When to use Major (-15 to -25 points):**
- Issues that significantly impact functionality
- Missing important features explicitly mentioned
- Incorrect data flow that affects results
- Wrong operation modes when specifically requested
- Errors that would likely cause runtime failures

**When to use Minor (-5 to -10 points):**
- Style preferences and inefficiencies
- Alternative valid approaches
- Field naming inconsistencies that don't break functionality
- Missing nice-to-have features not explicitly requested
- Outdated but functional syntax

**Apply severity based on actual impact on workflow execution:**
- Consider: Will this definitely break? (Critical)
- Will this likely cause issues? (Major)
- Is this just suboptimal? (Minor)
- Focus on functional impact, not perfection

Remember: Focus on objective technical evaluation. Be specific about violations and reference exact node names and expressions when identifying issues. Consider the n8n AI architecture and don't penalize valid patterns.

## Final Balance Statement
While being thorough in identifying issues, remember the goal is functional correctness, not perfection. Focus on issues that would actually prevent the workflow from achieving its intended purpose. The evaluator should identify real problems that would cause failures or prevent the requested functionality, not enforce style preferences or require unrequested optimizations.`;

const humanTemplate = `Please evaluate the following workflow:

<user_prompt>
{userPrompt}
</user_prompt>

<generated_workflow>
{generatedWorkflow}
</generated_workflow>

{referenceSection}

Provide a detailed evaluation following the scoring guidelines.`;

export function createWorkflowEvaluatorChain(llm: BaseChatModel) {
	if (!llm.bindTools) {
		throw new OperationalError("LLM doesn't support binding tools");
	}

	const prompt = ChatPromptTemplate.fromMessages([
		new SystemMessage(systemPrompt),
		HumanMessagePromptTemplate.fromTemplate(humanTemplate),
	]);

	const llmWithStructuredOutput = llm.withStructuredOutput(evaluationResultSchema);
	return prompt.pipe(llmWithStructuredOutput);
}

export async function evaluateWorkflow(llm: BaseChatModel, input: EvaluationInput) {
	const chain = createWorkflowEvaluatorChain(llm);

	// Format reference section if reference workflow is provided
	const referenceSection = input.referenceWorkflow
		? `<reference_workflow>
${JSON.stringify(input.referenceWorkflow, null, 2)}
</reference_workflow>`
		: '';

	const result = await chain.invoke({
		userPrompt: input.userPrompt,
		generatedWorkflow: JSON.stringify(input.generatedWorkflow, null, 2),
		referenceSection,
	});

	const evaluationResult = result as z.infer<typeof evaluationResultSchema>;

	// Calculate the overall score using the deterministic weighted calculation
	evaluationResult.overallScore = calculateWeightedScore(evaluationResult);

	return evaluationResult;
}

// Helper function to calculate weighted score
export function calculateWeightedScore(result: {
	functionality: { score: number };
	connections: { score: number };
	expressions: { score: number };
	nodeConfiguration: { score: number };
	structuralSimilarity?: { score: number; applicable: boolean };
}) {
	const weights = {
		functionality: 0.35,
		connections: 0.25,
		expressions: 0.25,
		nodeConfiguration: 0.15,
		structuralSimilarity: 0.05,
	};

	let totalWeight = 0;
	let weightedSum = 0;

	// Add scores for categories that are always evaluated
	weightedSum += result.functionality.score * weights.functionality;
	weightedSum += result.connections.score * weights.connections;
	weightedSum += result.expressions.score * weights.expressions;
	weightedSum += result.nodeConfiguration.score * weights.nodeConfiguration;
	totalWeight =
		weights.functionality + weights.connections + weights.expressions + weights.nodeConfiguration;

	// Add structural similarity only if applicable
	if (result.structuralSimilarity?.applicable) {
		weightedSum += result.structuralSimilarity.score * weights.structuralSimilarity;
		totalWeight += weights.structuralSimilarity;
	}

	return totalWeight > 0 ? weightedSum / totalWeight : 0;
}