diff --git a/packages/@n8n/ai-workflow-builder.ee/evaluations/README.md b/packages/@n8n/ai-workflow-builder.ee/evaluations/README.md
index 2109f8b698..45883878f4 100644
--- a/packages/@n8n/ai-workflow-builder.ee/evaluations/README.md
+++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/README.md
@@ -123,6 +123,10 @@ Violations are categorized by severity:
 # Run with default settings
 pnpm eval
 
+# Run a specific test case
+pnpm eval --test-case google-sheets-processing
+pnpm eval --test-case extract-from-file
+
 # With additional generated test cases
 GENERATE_TEST_CASES=true pnpm eval
 
diff --git a/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/test-case-generator.ts b/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/test-case-generator.ts
index 7043ba5b9a..e05dd53e51 100644
--- a/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/test-case-generator.ts
+++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/test-case-generator.ts
@@ -142,4 +142,16 @@ export const basicTestCases: TestCase[] = [
 		prompt:
 			'Create a multi-agent AI workflow where different AI agents collaborate to research a topic, fact-check information, and compile comprehensive reports.',
 	},
+	{
+		id: 'google-sheets-processing',
+		name: 'Process large Google Sheets data',
+		prompt:
+			'Create a workflow that reads all rows from a Google Sheets document with thousands of customer records. For each row, call an external API to get additional customer data, process the response, and update the row with the enriched information. Handle rate limiting and errors gracefully.',
+	},
+	{
+		id: 'extract-from-file',
+		name: 'Extract data from uploaded files',
+		prompt:
+			'Build a workflow that accepts file uploads through an n8n form. When users upload PDF documents, CSV files, or Excel spreadsheets, automatically extract the text content and data from these files. Transform the extracted data into a structured format and save it to a database or send it via email as a summary.',
+	},
 ];
diff --git a/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/workflow-evaluator.ts b/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/workflow-evaluator.ts
index 07a3af72e0..3f1cb88668 100644
--- a/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/workflow-evaluator.ts
+++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/chains/workflow-evaluator.ts
@@ -54,6 +54,7 @@ Evaluate whether the workflow correctly implements what the user EXPLICITLY requ
   - Missing core functionality explicitly requested
   - Incorrect operation logic that prevents the workflow from working
   - Workflows missing a trigger node when they need to start automatically or by some external event
+  - Using Split In Batches node
 - **Major (-15 to -25 points)**:
   - Missing explicitly required data transformations
   - Incomplete implementation of requested features
diff --git a/packages/@n8n/ai-workflow-builder.ee/evaluations/cli/runner.ts b/packages/@n8n/ai-workflow-builder.ee/evaluations/cli/runner.ts
index 5a2cb94d60..2b01d7e4e5 100644
--- a/packages/@n8n/ai-workflow-builder.ee/evaluations/cli/runner.ts
+++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/cli/runner.ts
@@ -24,7 +24,7 @@ import { generateMarkdownReport } from '../utils/evaluation-reporter.js';
  * Main CLI evaluation runner that executes all test cases in parallel
  * Supports concurrency control via EVALUATION_CONCURRENCY environment variable
  */
-export async function runCliEvaluation(): Promise<void> {
+export async function runCliEvaluation(testCaseFilter?: string): Promise<void> {
 	console.log(formatHeader('AI Workflow Builder Full Evaluation', 70));
 	console.log();
 	try {
@@ -34,11 +34,24 @@ export async function runCliEvaluation(): Promise<void> {
 		// Determine test cases to run
 		let testCases: TestCase[] = basicTestCases;
 
-		// Optionally generate additional test cases
-		if (shouldGenerateTestCases()) {
-			console.log(pc.blue('➔ Generating additional test cases...'));
-			const generatedCases = await generateTestCases(llm, howManyTestCasesToGenerate());
-			testCases = [...testCases, ...generatedCases];
+		// Filter to single test case if specified
+		if (testCaseFilter) {
+			const filteredCase = testCases.find((tc) => tc.id === testCaseFilter);
+			if (filteredCase) {
+				testCases = [filteredCase];
+				console.log(pc.blue(`➔ Running single test case: ${filteredCase.name}`));
+			} else {
+				console.log(pc.red(`❌ Test case '${testCaseFilter}' not found`));
+				console.log(pc.dim(`Available test cases: ${testCases.map((tc) => tc.id).join(', ')}`));
+				return;
+			}
+		} else {
+			// Optionally generate additional test cases
+			if (shouldGenerateTestCases()) {
+				console.log(pc.blue('➔ Generating additional test cases...'));
+				const generatedCases = await generateTestCases(llm, howManyTestCasesToGenerate());
+				testCases = [...testCases, ...generatedCases];
+			}
 		}
 
 		// Get concurrency from environment
diff --git a/packages/@n8n/ai-workflow-builder.ee/evaluations/index.ts b/packages/@n8n/ai-workflow-builder.ee/evaluations/index.ts
index e198dae8f0..1408eb23fe 100644
--- a/packages/@n8n/ai-workflow-builder.ee/evaluations/index.ts
+++ b/packages/@n8n/ai-workflow-builder.ee/evaluations/index.ts
@@ -14,10 +14,15 @@ export { setupTestEnvironment, createAgent } from './core/environment.js';
 
 async function main(): Promise<void> {
 	const useLangsmith = process.env.USE_LANGSMITH_EVAL === 'true';
 
+	// Parse command line arguments for single test case
+	const testCaseId = process.argv.includes('--test-case')
+		? process.argv[process.argv.indexOf('--test-case') + 1]
+		: undefined;
+
 	if (useLangsmith) {
 		await runLangsmithEvaluation();
 	} else {
-		await runCliEvaluation();
+		await runCliEvaluation(testCaseId);
 	}
 }
diff --git a/packages/@n8n/ai-workflow-builder.ee/src/tools/prompts/main-agent.prompt.ts b/packages/@n8n/ai-workflow-builder.ee/src/tools/prompts/main-agent.prompt.ts
index 649e18168b..58c02fb6d7 100644
--- a/packages/@n8n/ai-workflow-builder.ee/src/tools/prompts/main-agent.prompt.ts
+++ b/packages/@n8n/ai-workflow-builder.ee/src/tools/prompts/main-agent.prompt.ts
@@ -238,6 +238,8 @@
 For AI-generated structured data, prefer Structured Output Parser nodes over Code nodes.
 Why: Purpose-built parsers are more reliable and handle edge cases better than custom code.
 
+For binary file data, use Extract From File node to extract content from files before processing.
+
 Use Code nodes only for:
 - Simple string manipulations
 - Already structured data (JSON, CSV)
@@ -321,9 +323,10 @@
 Anticipate workflow needs and suggest enhancements:
 - Set nodes for data transformation between incompatible formats
 - Schedule Triggers for recurring tasks
 - Error handling for external service calls
-- Split In Batches for large dataset processing
 Why: Proactive suggestions create more robust, production-ready workflows
+
+NEVER use Split In Batches nodes.