From cdcc56461b77554964622f490ae901f170886595 Mon Sep 17 00:00:00 2001 From: centdix <40307056+centdix@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:05:46 +0200 Subject: [PATCH] feat: add black-box ai eval benchmarks (#8618) --- .gitignore | 2 + ai_evals/.gitignore | 2 + ai_evals/AGENTS.md | 172 +++ ai_evals/CLAUDE.md | 1 + ai_evals/README.md | 176 +++ ai_evals/adapters/cli/runtime.test.ts | 72 ++ ai_evals/adapters/cli/runtime.ts | 199 ++++ ai_evals/adapters/frontend/benchmarkRunner.ts | 87 ++ .../frontend/core/app/appEvalRunner.ts | 92 ++ .../frontend/core}/app/appFixtureLoader.ts | 6 +- .../adapters/frontend/core/app/fileHelpers.ts | 255 +++++ .../frontend/core/flow/fileHelpers.ts | 161 +++ .../frontend/core/flow/flowEvalRunner.ts | 103 ++ .../frontend/core/script/fileHelpers.ts | 73 ++ .../adapters/frontend/core/script/preview.ts | 96 ++ .../frontend/core/script/scriptEvalRunner.ts | 109 ++ .../frontend/core}/shared/baseEvalRunner.ts | 100 +- .../adapters/frontend/core/shared/index.ts | 3 + .../core/shared/providerConfig.test.ts | 41 + .../frontend/core/shared/providerConfig.ts | 71 ++ .../adapters/frontend/core/shared/types.ts | 32 + ai_evals/adapters/frontend/mockBackend.ts | 270 +++++ ai_evals/adapters/frontend/progress.ts | 133 +++ ai_evals/adapters/frontend/runtime.ts | 216 ++++ ai_evals/adapters/frontend/vitest.config.ts | 28 + .../adapters/frontend/vitestAdapter.test.ts | 165 +++ ai_evals/bun.lock | 313 ++++++ ai_evals/cases/app.yaml | 93 ++ ai_evals/cases/cli.yaml | 66 ++ ai_evals/cases/flow.yaml | 246 ++++ ai_evals/cases/script.yaml | 11 + ai_evals/cli/index.ts | 295 +++++ ai_evals/core/cases.ts | 71 ++ ai_evals/core/files.ts | 67 ++ ai_evals/core/judge.ts | 149 +++ ai_evals/core/models.test.ts | 29 + ai_evals/core/models.ts | 185 +++ ai_evals/core/results.ts | 296 +++++ ai_evals/core/runSuite.ts | 264 +++++ ai_evals/core/types.ts | 198 ++++ ai_evals/core/validators.test.ts | 36 + ai_evals/core/validators.ts | 997 ++++++++++++++++ 
.../f/evals/hello__flow/flow.yaml | 0 .../f/evals/hello__flow/hello.ts | 0 .../f/evals/hello__flow/flow.yaml | 0 .../f/evals/hello__flow/hello.ts | 2 + .../f/evals/hello.ts | 0 .../bun-hello-script/f/evals/hello.ts | 3 + .../f/evals/reuse_greeting__flow/flow.yaml | 2 + .../f/lib/format_greeting.ts | 3 + .../f/evals/add_numbers.py | 2 + .../f/evals/hello__flow/flow.yaml | 20 + .../f/evals/hello__flow/hello.ts | 3 + .../f/evals/hello.ts | 3 + .../f/lib/format_greeting.ts | 3 + .../file_manager/backend/createFolder/main.ts | 0 .../backend/createFolder/meta.json | 0 .../file_manager/backend/deleteItem/main.ts | 0 .../file_manager/backend/deleteItem/meta.json | 0 .../file_manager/backend/listFiles/main.ts | 0 .../file_manager/backend/listFiles/meta.json | 0 .../file_manager/backend/listFolders/main.ts | 0 .../backend/listFolders/meta.json | 0 .../file_manager/backend/moveItem/main.ts | 0 .../file_manager/backend/moveItem/meta.json | 0 .../file_manager/backend/renameItem/main.ts | 0 .../file_manager/backend/renameItem/meta.json | 0 .../frontend/components/Breadcrumb.tsx | 0 .../frontend/components/FileItem.tsx | 0 .../frontend/components/FileList.tsx | 0 .../frontend/components/FolderTree.tsx | 0 .../frontend/components/Toolbar.tsx | 0 .../initial/file_manager/frontend/index.tsx | 0 .../shopping_cart/backend/addToCart/main.ts | 0 .../shopping_cart/backend/addToCart/meta.json | 0 .../backend/calculateTotal/main.ts | 0 .../backend/calculateTotal/meta.json | 0 .../shopping_cart/backend/getProducts/main.ts | 0 .../backend/getProducts/meta.json | 0 .../backend/removeFromCart/main.ts | 0 .../backend/removeFromCart/meta.json | 0 .../frontend/components/Cart.tsx | 0 .../frontend/components/ProductCard.tsx | 0 .../frontend/components/ProductList.tsx | 0 .../initial/shopping_cart/frontend/index.tsx | 0 .../backend/decrementCounter/main.ts | 0 .../backend/decrementCounter/meta.json | 0 .../backend/incrementCounter/main.ts | 0 .../backend/incrementCounter/meta.json | 0 
.../test1_counter_app/frontend/index.tsx | 0 .../flow/expected/test0_sum_two_numbers.json | 31 + .../frontend}/flow/expected/test1.json | 0 .../expected/test10_while_loop_counter.json | 30 + .../expected/test11_preprocessor_failure.json | 36 + .../flow/expected/test12_approval_step.json | 44 + .../expected/test1_reuse_existing_script.json | 39 + .../frontend}/flow/expected/test2.json | 0 .../expected/test2_call_existing_subflow.json | 39 + .../frontend}/flow/expected/test3.json | 0 .../expected/test3_branchone_routing.json | 24 + .../frontend}/flow/expected/test4.json | 0 .../flow/expected/test5_modify_simple.json | 0 .../flow/expected/test6_modify_medium.json | 0 .../flow/expected/test7_modify_complex.json | 0 .../test1_reuse_existing_script_initial.json | 29 + .../test2_call_existing_subflow_initial.json | 49 + .../frontend}/flow/initial/test5_initial.json | 0 .../frontend}/flow/initial/test6_initial.json | 0 .../frontend}/flow/initial/test7_initial.json | 0 .../script/expected/test1_greet_user.json | 8 + .../script/initial/test1_empty_bun.json | 8 + ai_evals/history/app.jsonl | 3 + ai_evals/history/cli.jsonl | 2 + ai_evals/history/flow.jsonl | 3 + ai_evals/history/script.jsonl | 3 + ai_evals/modes/app.ts | 79 ++ ai_evals/modes/cli.ts | 162 +++ ai_evals/modes/flow.ts | 104 ++ ai_evals/modes/frontendCommon.test.ts | 28 + ai_evals/modes/frontendCommon.ts | 23 + ai_evals/modes/script.ts | 61 + ai_evals/package.json | 19 + cli/README.md | 22 + cli/TESTING.md | 14 + cli/src/commands/init/init.ts | 113 +- cli/src/guidance/writer.ts | 269 +++++ cli/test-skills/README.md | 103 -- cli/test-skills/bun.lock | 61 - cli/test-skills/package.json | 16 - cli/test-skills/src/skill-invocation.test.ts | 91 -- cli/test-skills/src/test-utils.ts | 137 --- cli/test-skills/tsconfig.json | 17 - cli/test/guidance_writer_unit.test.ts | 148 +++ docs/failing-tests.md | 33 + docs/system-prompt-testing-plan.md | 1000 +++++++++++++++++ docs/system-prompt-testing-status.md | 129 +++ 
.../copilot/chat/AIChatManager.svelte.ts | 7 +- .../chat/__tests__/app/appChat.eval.test.ts | 303 ----- .../chat/__tests__/app/appEvalComparison.ts | 171 --- .../chat/__tests__/app/appEvalHelpers.ts | 147 --- .../chat/__tests__/app/appEvalRunner.ts | 177 --- .../chat/__tests__/app/appResultsWriter.ts | 247 ---- .../chat/__tests__/app/variants/baseline.ts | 12 - .../chat/__tests__/app/variants/index.ts | 6 - .../__tests__/app/variants/streamlined.ts | 144 --- .../chat/__tests__/flow/flowChat.eval.test.ts | 449 -------- .../chat/__tests__/flow/flowEvalComparison.ts | 68 -- .../chat/__tests__/flow/flowEvalHelpers.ts | 104 -- .../chat/__tests__/flow/flowEvalRunner.ts | 186 --- .../chat/__tests__/flow/variants/baseline.ts | 12 - .../chat/__tests__/flow/variants/index.ts | 6 - .../flow/variants/minimal-single-tool.ts | 402 ------- .../chat/__tests__/shared/baseLLMEvaluator.ts | 135 --- .../__tests__/shared/baseResultsWriter.ts | 169 --- .../chat/__tests__/shared/baseVariants.ts | 108 -- .../copilot/chat/__tests__/shared/index.ts | 28 - .../copilot/chat/__tests__/shared/types.ts | 107 -- .../lib/components/copilot/chat/anthropic.ts | 15 +- .../lib/components/copilot/chat/chatLoop.ts | 21 +- .../copilot/chat/flow/FlowAIChat.svelte | 13 +- .../lib/components/copilot/chat/flow/core.ts | 24 +- .../copilot/chat/flow/inlineScriptsUtils.ts | 106 +- .../copilot/chat/openai-responses.ts | 18 +- .../src/lib/components/copilot/chat/shared.ts | 2 - .../lib/components/copilot/chat/tokenUsage.ts | 73 ++ frontend/src/lib/components/copilot/lib.ts | 28 +- 166 files changed, 8640 insertions(+), 3647 deletions(-) create mode 100644 ai_evals/.gitignore create mode 100644 ai_evals/AGENTS.md create mode 100644 ai_evals/CLAUDE.md create mode 100644 ai_evals/README.md create mode 100644 ai_evals/adapters/cli/runtime.test.ts create mode 100644 ai_evals/adapters/cli/runtime.ts create mode 100644 ai_evals/adapters/frontend/benchmarkRunner.ts create mode 100644 
ai_evals/adapters/frontend/core/app/appEvalRunner.ts rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/adapters/frontend/core}/app/appFixtureLoader.ts (97%) create mode 100644 ai_evals/adapters/frontend/core/app/fileHelpers.ts create mode 100644 ai_evals/adapters/frontend/core/flow/fileHelpers.ts create mode 100644 ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts create mode 100644 ai_evals/adapters/frontend/core/script/fileHelpers.ts create mode 100644 ai_evals/adapters/frontend/core/script/preview.ts create mode 100644 ai_evals/adapters/frontend/core/script/scriptEvalRunner.ts rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/adapters/frontend/core}/shared/baseEvalRunner.ts (66%) create mode 100644 ai_evals/adapters/frontend/core/shared/index.ts create mode 100644 ai_evals/adapters/frontend/core/shared/providerConfig.test.ts create mode 100644 ai_evals/adapters/frontend/core/shared/providerConfig.ts create mode 100644 ai_evals/adapters/frontend/core/shared/types.ts create mode 100644 ai_evals/adapters/frontend/mockBackend.ts create mode 100644 ai_evals/adapters/frontend/progress.ts create mode 100644 ai_evals/adapters/frontend/runtime.ts create mode 100644 ai_evals/adapters/frontend/vitest.config.ts create mode 100644 ai_evals/adapters/frontend/vitestAdapter.test.ts create mode 100644 ai_evals/bun.lock create mode 100644 ai_evals/cases/app.yaml create mode 100644 ai_evals/cases/cli.yaml create mode 100644 ai_evals/cases/flow.yaml create mode 100644 ai_evals/cases/script.yaml create mode 100644 ai_evals/cli/index.ts create mode 100644 ai_evals/core/cases.ts create mode 100644 ai_evals/core/files.ts create mode 100644 ai_evals/core/judge.ts create mode 100644 ai_evals/core/models.test.ts create mode 100644 ai_evals/core/models.ts create mode 100644 ai_evals/core/results.ts create mode 100644 ai_evals/core/runSuite.ts create mode 100644 ai_evals/core/types.ts create mode 100644 ai_evals/core/validators.test.ts create 
mode 100644 ai_evals/core/validators.ts create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/flow.yaml create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/hello.ts create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-script-uppercase/f/evals/hello.ts create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-script/f/evals/hello.ts create mode 100644 ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/evals/reuse_greeting__flow/flow.yaml create mode 100644 ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/lib/format_greeting.ts create mode 100644 ai_evals/fixtures/cli/expected/python-add-numbers-script/f/evals/add_numbers.py create mode 100644 ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml create mode 100644 ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts create mode 100644 ai_evals/fixtures/cli/initial/bun-hello-script-uppercase/f/evals/hello.ts create mode 100644 ai_evals/fixtures/cli/initial/flow-reuse-existing-script/f/lib/format_greeting.ts rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/createFolder/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/createFolder/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/deleteItem/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/deleteItem/meta.json (100%) rename 
{frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/listFiles/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/listFiles/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/listFolders/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/listFolders/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/moveItem/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/moveItem/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/renameItem/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/renameItem/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/components/Breadcrumb.tsx (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/components/FileItem.tsx (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/components/FileList.tsx (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/components/FolderTree.tsx (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/components/Toolbar.tsx (100%) rename 
{frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/index.tsx (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/addToCart/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/addToCart/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/calculateTotal/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/calculateTotal/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/getProducts/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/getProducts/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/removeFromCart/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/removeFromCart/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/frontend/components/Cart.tsx (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/frontend/components/ProductCard.tsx (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/frontend/components/ProductList.tsx (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/frontend/index.tsx (100%) rename 
{frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/test1_counter_app/backend/decrementCounter/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/test1_counter_app/backend/decrementCounter/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/test1_counter_app/backend/incrementCounter/main.ts (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/test1_counter_app/backend/incrementCounter/meta.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/test1_counter_app/frontend/index.tsx (100%) create mode 100644 ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test1.json (100%) create mode 100644 ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json create mode 100644 ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json create mode 100644 ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json create mode 100644 ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test2.json (100%) create mode 100644 ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test3.json (100%) create mode 100644 ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test4.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => 
ai_evals/fixtures/frontend}/flow/expected/test5_modify_simple.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test6_modify_medium.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test7_modify_complex.json (100%) create mode 100644 ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json create mode 100644 ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/initial/test5_initial.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/initial/test6_initial.json (100%) rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/initial/test7_initial.json (100%) create mode 100644 ai_evals/fixtures/frontend/script/expected/test1_greet_user.json create mode 100644 ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json create mode 100644 ai_evals/history/app.jsonl create mode 100644 ai_evals/history/cli.jsonl create mode 100644 ai_evals/history/flow.jsonl create mode 100644 ai_evals/history/script.jsonl create mode 100644 ai_evals/modes/app.ts create mode 100644 ai_evals/modes/cli.ts create mode 100644 ai_evals/modes/flow.ts create mode 100644 ai_evals/modes/frontendCommon.test.ts create mode 100644 ai_evals/modes/frontendCommon.ts create mode 100644 ai_evals/modes/script.ts create mode 100644 ai_evals/package.json create mode 100644 cli/src/guidance/writer.ts delete mode 100644 cli/test-skills/README.md delete mode 100644 cli/test-skills/bun.lock delete mode 100644 cli/test-skills/package.json delete mode 100644 cli/test-skills/src/skill-invocation.test.ts delete mode 100644 cli/test-skills/src/test-utils.ts delete mode 100644 cli/test-skills/tsconfig.json create mode 100644 
cli/test/guidance_writer_unit.test.ts create mode 100644 docs/failing-tests.md create mode 100644 docs/system-prompt-testing-plan.md create mode 100644 docs/system-prompt-testing-status.md delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/appChat.eval.test.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/appEvalComparison.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/appEvalHelpers.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/appEvalRunner.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/appResultsWriter.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/variants/baseline.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/variants/index.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/variants/streamlined.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/flowChat.eval.test.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalComparison.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalHelpers.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalRunner.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/variants/baseline.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/variants/index.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/variants/minimal-single-tool.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/shared/baseLLMEvaluator.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/shared/baseResultsWriter.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/shared/baseVariants.ts delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/shared/index.ts delete 
mode 100644 frontend/src/lib/components/copilot/chat/__tests__/shared/types.ts create mode 100644 frontend/src/lib/components/copilot/chat/tokenUsage.ts diff --git a/.gitignore b/.gitignore index a080d58795..b2741131a5 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,8 @@ rust-client/Cargo.toml backend/target frontend/node_modules typescript-client/node_modules +ai_evals/node_modules +ai_evals/results/ frontend/.svelte-kit backend/chrome_profiler.json .fast-check/ diff --git a/ai_evals/.gitignore b/ai_evals/.gitignore new file mode 100644 index 0000000000..9263598939 --- /dev/null +++ b/ai_evals/.gitignore @@ -0,0 +1,2 @@ +.env +results/ diff --git a/ai_evals/AGENTS.md b/ai_evals/AGENTS.md new file mode 100644 index 0000000000..096baf5b58 --- /dev/null +++ b/ai_evals/AGENTS.md @@ -0,0 +1,172 @@ +# AI Evals Authoring Guide + +This folder contains black-box benchmark cases for: + +- `flow` +- `app` +- `script` +- `cli` + +The goal is to test the current production prompts and guidance with realistic user requests, not to test one exact implementation shape. + +## Core rules + +1. Write prompts like a real user request. +2. Prefer behavior, inputs, constraints, and outcomes over internal implementation details. +3. Keep deterministic validation narrow and hard. +4. Put semantic expectations in `judgeChecklist`. +5. Use `expected` fixtures only when exact structure really matters. + +## Prompt writing + +Prompts should sound like something a user would naturally ask. + +Good: + +- "Create a flow that routes support requests based on customer tier." +- "Add a reset button that sets the counter back to 0." +- "Create a flow that reuses the existing greeting script instead of duplicating the logic." + +Bad: + +- "Use `branchone` with 3 branches and a default branch." +- "Create a `rawscript` step with this exact topology." +- "This is a benchmark harness." 
+ +Do not write prompts as if the user knows Windmill internals unless the case is explicitly testing a power-user workflow. + +## Flow-specific rules + +This is the main principle you asked for: + +- flow prompts should read like requests from a user who does not know the product internals +- the user should ask for behavior, not for `branchone`, `branchall`, `rawscript`, `preprocessor_module`, `failure_module`, exact graph topology, or other internal constructs + +That means: + +- creation cases should describe the business behavior and expected result +- modification cases may mention existing step names, because the user can see the current flow +- only mention special Windmill constructs when the case is explicitly about those constructs + +Examples: + +- acceptable creation prompt: + "Create a purchase approval flow that pauses for approval and asks the approver for a comment." +- avoid: + "Create a suspend step with one required event and a resume form." + +For flow cases, do not fail a case just because the model chose a different valid topology. + +## App-specific rules + +App prompts should focus on user-visible behavior: + +- what the UI should let the user do +- what should persist +- what backend behavior is needed + +Avoid prompting in terms of React structure, component names, or implementation unless the case is specifically about editing an existing app. + +## CLI-specific rules + +CLI prompts can be more explicit about paths and file names because real CLI users often do specify them. + +Still, avoid benchmark phrasing. The prompt should read like a repo task, not a harness instruction. + +When relevant, ask the assistant to tell the user which `wmill` commands to run next. That is part of the benchmarked behavior. 
+ +## Deterministic validation + +Use deterministic validation only for hard failures such as: + +- missing required files +- unexpected extra files when the prompt says not to create them +- syntax errors +- unresolved flow refs +- missing required special modules or suspend config +- obvious artifact corruption + +Do not use deterministic validation to enforce one preferred implementation for broad creation tasks. + +Examples of bad hard checks: + +- exact step topology for a creation flow +- exact branch structure when the prompt only asked for routing behavior +- exact input shape when multiple reasonable shapes are acceptable + +## Judge checklist + +Every non-trivial case should have a `judgeChecklist`. + +The checklist should capture: + +- the user-visible behavior that must be present +- important constraints +- key completion criteria + +The checklist should not duplicate low-level implementation details unless they are truly required by the task. + +Good checklist items: + +- "the flow calculates the order total with 8% tax" +- "the app persists recipes appropriately for a raw Windmill app" +- "the flow reuses the existing workspace script instead of rewriting the logic" + +Bad checklist items: + +- "uses `branchone`" +- "contains a `rawscript` node" + +## When to use `expected` + +Use `expected` fixtures when the case is structure-sensitive, for example: + +- exact file creation +- exact script content +- modification cases where a specific file must change in a specific way +- cases where preserving an existing structure is part of the requirement + +Do not use a full `expected` artifact as the semantic oracle for broad creation tasks when multiple valid outputs should pass. + +## When to use `initial` + +Use `initial` when the benchmark is about: + +- editing an existing artifact +- reusing existing workspace assets +- preserving existing behavior while adding a change + +If the case is greenfield, prefer no `initial`. 
+ +## Case design ladder + +Prefer suites that get gradually harder: + +1. trivial create case +2. realistic create case +3. reuse-existing-assets case +4. modification case +5. refactor case +6. edge-case or niche product behavior + +The last cases in a suite should cover unusual or product-specific behavior. + +## Anti-patterns + +Avoid these: + +- benchmark framing in prompts +- over-specified internal topology for creation tasks +- judge checklists that just restate implementation details +- deterministic validation that encodes one preferred solution +- fixtures that are so minimal or brittle that they create false negatives + +## Before adding a case + +Ask: + +1. Would a real user plausibly write this prompt? +2. If the model solves it in a different valid way, would the case still pass? +3. Are the hard deterministic checks only catching objectively broken output? +4. Does the `judgeChecklist` describe the real success criteria? +5. If this case fails, will the reason be understandable from the saved artifacts? diff --git a/ai_evals/CLAUDE.md b/ai_evals/CLAUDE.md new file mode 100644 index 0000000000..eef4bd20cf --- /dev/null +++ b/ai_evals/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md \ No newline at end of file diff --git a/ai_evals/README.md b/ai_evals/README.md new file mode 100644 index 0000000000..353bee9dc3 --- /dev/null +++ b/ai_evals/README.md @@ -0,0 +1,176 @@ +# AI Evals + +Small benchmark runner for the four Windmill AI generation modes: + +- `cli` +- `flow` +- `script` +- `app` + +The benchmark always tests the current production prompts, tools, and guidance in this checkout. + +Each attempt runs: + +1. the real production path +2. deterministic validation +3. 
LLM judging + +## Install + +```bash +cd ai_evals +bun install +``` + +Frontend modes also require frontend dependencies: + +```bash +cd frontend +bun install +``` + +## Commands + +List model aliases: + +```bash +cd ai_evals +bun run cli -- models +``` + +List cases: + +```bash +cd ai_evals +bun run cli -- cases +bun run cli -- cases flow +``` + +Run benchmarks: + +```bash +cd ai_evals +bun run cli -- run flow +bun run cli -- run flow flow-test4-order-processing-loop --model opus +bun run cli -- run flow flow-test0-sum-two-numbers --models haiku,opus,4o +bun run cli -- run flow flow-test0-sum-two-numbers --runs 3 --verbose +bun run cli -- run flow --record +bun run cli -- run cli bun-hello-script +``` + +Public CLI surface: + +- `models` +- `cases [mode]` +- `run <mode> [caseIds...]` + +`run` options: + +- `--runs <n>`: repeat each case `n` times +- `--output <path>`: custom result JSON path +- `--model <alias>`: choose the model under test +- `--models <aliases>`: run the same cases sequentially against several comma-separated model aliases +- `--verbose`: stream assistant output for frontend runs +- `--record`: append a compact tracked summary line to `ai_evals/history/<mode>.jsonl` for full-suite runs only + +## Models + +Use `bun run cli -- models` to see the current aliases. + +Today: + +- `haiku` +- `sonnet` +- `opus` +- `4o` +- `gemini-flash` +- `gemini-pro` +- `gemini-3-flash-preview` +- `gemini-3.1-pro-preview` + +Notes: + +- the command also prints accepted alias spellings such as `gpt-4o`, `claude-opus-4.6`, and `claude-haiku-4.5` +- frontend modes (`flow`, `script`, `app`) can use Anthropic, OpenAI, and Gemini-backed aliases +- `cli` mode always uses the Anthropic agent SDK, so only Anthropic aliases are valid there +- the judge model is separate and currently defaults to `claude-sonnet-4-6` + +## Case Format + +Cases live in one YAML file per mode under `ai_evals/cases/`. 
+ +Minimal shape: + +```yaml +- id: flow-test0-sum-two-numbers + prompt: |- + Create a flow that takes two numbers, `a` and `b`, and returns their sum. + initial: ai_evals/fixtures/... + expected: ai_evals/fixtures/... +``` + +Optional fields: + +- `initial`: starting state fixture +- `expected`: expected artifact fixture +- `validate`: extra deterministic validation rules + +For `flow` mode, `validate` can express requirements such as: + +- accepted input schema shapes +- required `results.*` reference validity +- required module/code/input characteristics + +For `flow` mode, an `initial` fixture can also include a benchmark workspace catalog of +existing scripts and flows. That lets the real `search_workspace` and +`get_runnable_details` tools discover reusable workspace runnables during evals. + +## Results And Artifacts + +Every run writes: + +- a summary JSON under `ai_evals/results/` +- generated artifacts in a sibling directory + +If `--record` is used, the CLI also appends one compact JSON line to: + +- `ai_evals/history/flow.jsonl` +- `ai_evals/history/script.jsonl` +- `ai_evals/history/app.jsonl` +- `ai_evals/history/cli.jsonl` + +Each recorded line contains: + +- run metadata (`createdAt`, `gitSha`, `mode`, `runModel`, `judgeModel`) +- suite totals (`caseCount`, `attemptCount`, `passedAttempts`, `passRate`, `averageDurationMs`, `averageJudgeScore`) +- average token usage (`averageTokenUsagePerAttempt`) +- per-case metrics under `cases[]` (`averageDurationMs`, `averageJudgeScore`, `averageTokenUsagePerAttempt`, pass rate) +- `failedCaseIds` + +Example: + +- summary: `ai_evals/results/2026-04-09T09-40-33.051Z__flow.json` +- artifacts: `ai_evals/results/2026-04-09T09-40-33.051Z__flow/` + +Typical artifacts by mode: + +- `flow`: `flow.json` +- `script`: `script.json` plus the generated script file +- `app`: `app.json` plus frontend/backend files +- `cli`: `assistant-output.txt` plus generated workspace files + +## Layout + +- `cases/`: one YAML file per mode 
+- `fixtures/`: initial and expected fixtures +- `core/`: shared loading, model resolution, validation, judging, and result writing +- `modes/`: one runner per mode +- `history/`: optional tracked pass-rate history written by `run --record`, one JSONL file per mode +- `results/`: local benchmark output and artifacts + +## Notes + +- Frontend modes reuse the production frontend chat code through the Vitest bridge. +- CLI mode creates an isolated workspace, writes the current checkout guidance into it, and benchmarks the real skills / `AGENTS.md` flow. +- Frontend progress streams live while the benchmark is running. +- Deterministic validators should stay focused on real correctness constraints, not one exact implementation shape. diff --git a/ai_evals/adapters/cli/runtime.test.ts b/ai_evals/adapters/cli/runtime.test.ts new file mode 100644 index 0000000000..aedbcba58d --- /dev/null +++ b/ai_evals/adapters/cli/runtime.test.ts @@ -0,0 +1,72 @@ +import { describe, expect, it } from "bun:test"; +import { + anthropicUsageToBenchmarkTokenUsage, + extractCliResultTokenUsage, +} from "./runtime"; + +describe("anthropicUsageToBenchmarkTokenUsage", () => { + it("includes cache tokens in prompt usage", () => { + expect( + anthropicUsageToBenchmarkTokenUsage({ + input_tokens: 120, + output_tokens: 45, + cache_creation_input_tokens: 30, + cache_read_input_tokens: 5, + }) + ).toEqual({ + prompt: 155, + completion: 45, + total: 200, + }); + }); + + it("returns null when usage is absent", () => { + expect(anthropicUsageToBenchmarkTokenUsage(null)).toBeNull(); + }); +}); + +describe("extractCliResultTokenUsage", () => { + it("reads aggregate usage from the SDK result event", () => { + expect( + extractCliResultTokenUsage({ + type: "result", + usage: { + input_tokens: 400, + output_tokens: 120, + cache_creation_input_tokens: 50, + cache_read_input_tokens: 25, + }, + }) + ).toEqual({ + prompt: 475, + completion: 120, + total: 595, + }); + }); + + it("falls back to modelUsage when 
aggregate usage is unavailable", () => { + expect( + extractCliResultTokenUsage({ + type: "result", + modelUsage: { + opus: { + inputTokens: 200, + outputTokens: 60, + cacheCreationInputTokens: 10, + cacheReadInputTokens: 5, + }, + haiku: { + inputTokens: 80, + outputTokens: 20, + cacheCreationInputTokens: 0, + cacheReadInputTokens: 15, + }, + }, + }) + ).toEqual({ + prompt: 310, + completion: 80, + total: 390, + }); + }); +}); diff --git a/ai_evals/adapters/cli/runtime.ts b/ai_evals/adapters/cli/runtime.ts new file mode 100644 index 0000000000..3e184bae8d --- /dev/null +++ b/ai_evals/adapters/cli/runtime.ts @@ -0,0 +1,199 @@ +import { query, type Options } from "@anthropic-ai/claude-agent-sdk"; +import { join } from "path"; +import { fileURLToPath } from "url"; +import { getCliEvalModel, resolveEvalModel, type CliEvalModelConfig } from "../../core/models"; +import type { BenchmarkTokenUsage } from "../../core/types"; + +export interface ToolInvocation { + tool: string; + input: Record; + timestamp: number; +} + +export interface PromptRunResult { + toolsUsed: ToolInvocation[]; + skillsInvoked: string[]; + output: string; + durationMs: number; + assistantMessageCount: number; + tokenUsage: BenchmarkTokenUsage | null; +} + +interface AnthropicUsageLike { + input_tokens?: number | null; + output_tokens?: number | null; + cache_creation_input_tokens?: number | null; + cache_read_input_tokens?: number | null; +} + +interface AnthropicModelUsageLike { + inputTokens?: number | null; + outputTokens?: number | null; + cacheCreationInputTokens?: number | null; + cacheReadInputTokens?: number | null; +} + +interface CliResultMessageLike { + type?: string; + usage?: AnthropicUsageLike | null; + modelUsage?: Record | null; +} + +const REPO_ROOT = fileURLToPath(new URL("../../../", import.meta.url)); +export const DEFAULT_CLI_EVAL_MODEL: CliEvalModelConfig = getCliEvalModel(resolveEvalModel("cli")); + +export function getGeneratedSkillsSource(): string { + return 
join(REPO_ROOT, "system_prompts", "auto-generated", "skills"); +} + +export function anthropicUsageToBenchmarkTokenUsage( + usage: AnthropicUsageLike | null | undefined +): BenchmarkTokenUsage | null { + if (!usage) { + return null; + } + + const prompt = + (usage.input_tokens ?? 0) + + (usage.cache_creation_input_tokens ?? 0) + + (usage.cache_read_input_tokens ?? 0); + const completion = usage.output_tokens ?? 0; + + return { + prompt, + completion, + total: prompt + completion, + }; +} + +export function extractCliResultTokenUsage(message: unknown): BenchmarkTokenUsage | null { + if (!message || typeof message !== "object") { + return null; + } + + const resultMessage = message as CliResultMessageLike; + if (resultMessage.type !== "result") { + return null; + } + + const usage = anthropicUsageToBenchmarkTokenUsage(resultMessage.usage); + if (usage) { + return usage; + } + + if (!resultMessage.modelUsage || typeof resultMessage.modelUsage !== "object") { + return null; + } + + let prompt = 0; + let completion = 0; + let sawModelUsage = false; + + for (const modelUsage of Object.values(resultMessage.modelUsage)) { + if (!modelUsage || typeof modelUsage !== "object") { + continue; + } + + prompt += + (modelUsage.inputTokens ?? 0) + + (modelUsage.cacheCreationInputTokens ?? 0) + + (modelUsage.cacheReadInputTokens ?? 0); + completion += modelUsage.outputTokens ?? 
0; + sawModelUsage = true; + } + + if (!sawModelUsage) { + return null; + } + + return { + prompt, + completion, + total: prompt + completion, + }; +} + +export async function runPromptAndCapture( + prompt: string, + cwd: string, + maxTurns: number = 3, + modelConfig: CliEvalModelConfig = DEFAULT_CLI_EVAL_MODEL +): Promise { + const toolsUsed: ToolInvocation[] = []; + const skillsInvoked: string[] = []; + let output = ""; + let assistantMessageCount = 0; + let tokenUsage: BenchmarkTokenUsage | null = null; + const startedAt = Date.now(); + + const options: Options = { + cwd, + model: modelConfig.model, + maxTurns, + settingSources: ["project"], + allowedTools: ["Skill", "Read", "Glob", "Grep", "Bash", "Write", "Edit"] + }; + + for await (const message of query({ prompt, options })) { + if (message.type === "assistant") { + assistantMessageCount += 1; + const content = message.message?.content; + if (Array.isArray(content)) { + for (const block of content) { + if (block.type === "tool_use") { + toolsUsed.push({ + tool: block.name, + input: block.input as Record, + timestamp: Date.now() + }); + + if (block.name === "Skill" && typeof block.input === "object" && block.input !== null) { + const skillInput = block.input as { skill?: string }; + if (skillInput.skill) { + skillsInvoked.push(skillInput.skill); + } + } + } else if (block.type === "text") { + output += block.text; + } + } + } + } else if (message.type === "result") { + const resultMessage = message as { result?: string }; + tokenUsage = extractCliResultTokenUsage(message) ?? 
tokenUsage; + if (typeof resultMessage.result === "string") { + output += resultMessage.result; + } + } + } + + return { + toolsUsed, + skillsInvoked, + output, + durationMs: Date.now() - startedAt, + assistantMessageCount, + tokenUsage, + }; +} + +export function wasSkillInvoked(result: PromptRunResult, skillName: string): boolean { + return result.skillsInvoked.some((skill) => skill === skillName || skill.includes(skillName)); +} + +export function wasToolUsed(result: PromptRunResult, toolName: string): boolean { + return result.toolsUsed.some((tool) => tool.tool === toolName); +} + +export function formatCliRunModelLabel(modelConfig: CliEvalModelConfig): string { + return `${modelConfig.provider}:${modelConfig.model}`; +} + +export function getToolInputs( + result: PromptRunResult, + toolName: string +): Record[] { + return result.toolsUsed + .filter((tool) => tool.tool === toolName) + .map((tool) => tool.input); +} diff --git a/ai_evals/adapters/frontend/benchmarkRunner.ts b/ai_evals/adapters/frontend/benchmarkRunner.ts new file mode 100644 index 0000000000..33b1555654 --- /dev/null +++ b/ai_evals/adapters/frontend/benchmarkRunner.ts @@ -0,0 +1,87 @@ +import { loadSelectedCases } from "../../core/cases"; +import { + formatRunModelLabel, + getFrontendEvalModel, + resolveEvalModel, +} from "../../core/models"; +import { buildRunResult } from "../../core/results"; +import { runSuite } from "../../core/runSuite"; +import type { BenchmarkRunResult, ModeRunner } from "../../core/types"; +import { emitFrontendBenchmarkProgress } from "./progress"; +import { createAppModeRunner } from "../../modes/app"; +import { createFlowModeRunner } from "../../modes/flow"; +import { createScriptModeRunner } from "../../modes/script"; +import { DEFAULT_JUDGE_MODEL } from "../../core/judge"; + +export type FrontendBenchmarkMode = "flow" | "app" | "script"; + +export async function runFrontendBenchmarkFromEnv(): Promise { + const mode = 
parseMode(process.env.WMILL_FRONTEND_AI_EVAL_MODE); + const caseIds = parseOptionalJsonStringArray(process.env.WMILL_FRONTEND_AI_EVAL_CASE_IDS); + const runs = parsePositiveInteger(process.env.WMILL_FRONTEND_AI_EVAL_RUNS, "WMILL_FRONTEND_AI_EVAL_RUNS"); + const emitProgress = process.env.WMILL_FRONTEND_AI_EVAL_PROGRESS === "1"; + const verbose = process.env.WMILL_FRONTEND_AI_EVAL_VERBOSE === "1"; + const model = resolveEvalModel(mode, process.env.WMILL_FRONTEND_AI_EVAL_MODEL); + + const selectedCases = await loadSelectedCases(mode, caseIds); + const modeRunner = getModeRunner(mode, getFrontendEvalModel(model)); + const runModel = formatRunModelLabel(mode, model); + const caseResults = await runSuite({ + modeRunner, + cases: selectedCases, + runs, + runModel, + judgeModel: DEFAULT_JUDGE_MODEL, + concurrency: verbose ? 1 : undefined, + verbose, + onProgress: emitProgress ? (event) => emitFrontendBenchmarkProgress(event) : undefined, + }); + + return buildRunResult({ + mode, + runs, + runModel, + judgeModel: DEFAULT_JUDGE_MODEL, + caseResults, + }); +} + +function getModeRunner( + mode: FrontendBenchmarkMode, + model: ReturnType +): ModeRunner { + switch (mode) { + case "flow": + return createFlowModeRunner(model); + case "app": + return createAppModeRunner(model); + case "script": + return createScriptModeRunner(model); + } +} + +function parseMode(value: string | undefined): FrontendBenchmarkMode { + if (value === "flow" || value === "app" || value === "script") { + return value; + } + throw new Error(`Unsupported frontend benchmark mode: ${String(value)}`); +} + +function parseOptionalJsonStringArray(value: string | undefined): string[] { + if (!value) { + return []; + } + const parsed = JSON.parse(value) as unknown; + if (!Array.isArray(parsed) || parsed.some((entry) => typeof entry !== "string")) { + throw new Error("WMILL_FRONTEND_AI_EVAL_CASE_IDS must be a JSON string array"); + } + return parsed; +} + +function parsePositiveInteger(value: string | undefined, 
envName: string): number { + const parsed = Number(value); + if (!Number.isInteger(parsed) || parsed <= 0) { + throw new Error(`${envName} must be a positive integer`); + } + return parsed; +} diff --git a/ai_evals/adapters/frontend/core/app/appEvalRunner.ts b/ai_evals/adapters/frontend/core/app/appEvalRunner.ts new file mode 100644 index 0000000000..f0d0ce91d3 --- /dev/null +++ b/ai_evals/adapters/frontend/core/app/appEvalRunner.ts @@ -0,0 +1,92 @@ +import { mkdtemp } from 'fs/promises' +import { tmpdir } from 'os' +import { join } from 'path' +import type { + AppFiles, + BackendRunnable, + AppAIChatHelpers +} from '../../../../../frontend/src/lib/components/copilot/chat/app/core' +import { + getAppTools, + prepareAppSystemMessage, + prepareAppUserMessage +} from '../../../../../frontend/src/lib/components/copilot/chat/app/core' +import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared' +import { createAppFileHelpers } from './fileHelpers' +import { runEval } from '../shared' +import type { AIProvider } from '$lib/gen/types.gen' +import type { ModeRunContext } from '../../../../core/types' +import type { TokenUsage } from '../shared/types' + +export interface AppEvalResult { + success: boolean + files: AppFiles + error?: string + assistantMessageCount: number + toolCallCount: number + toolsUsed: string[] + tokenUsage: TokenUsage +} + +export interface AppEvalOptions { + initialFrontend?: Record + initialBackend?: Record + model?: string + maxIterations?: number + provider?: AIProvider + workspaceRoot?: string + runContext?: ModeRunContext +} + +export async function runAppEval( + userPrompt: string, + apiKey: string, + options?: AppEvalOptions +): Promise { + const workspaceRoot = + options?.workspaceRoot ?? + (await mkdtemp(join(tmpdir(), 'wmill-frontend-app-benchmark-'))) + const { helpers, getFiles, cleanup } = await createAppFileHelpers( + options?.initialFrontend ?? {}, + options?.initialBackend ?? 
{}, + workspaceRoot + ) + + try { + const systemMessage = prepareAppSystemMessage() + const tools = getAppTools() as ProductionTool[] + const model = options?.model ?? 'claude-haiku-4-5-20251001' + const userMessage = prepareAppUserMessage(userPrompt, helpers.getSelectedContext()) + + const rawResult = await runEval({ + userPrompt, + systemMessage, + userMessage, + tools, + helpers, + apiKey, + getOutput: getFiles, + onAssistantMessageStart: options?.runContext?.onAssistantMessageStart, + onAssistantToken: options?.runContext?.onAssistantChunk, + onAssistantMessageEnd: options?.runContext?.onAssistantMessageEnd, + options: { + maxIterations: options?.maxIterations, + model, + workspace: workspaceRoot, + provider: options?.provider + } + }) + + return { + files: rawResult.output, + success: rawResult.success, + error: rawResult.error, + assistantMessageCount: rawResult.iterations, + toolCallCount: rawResult.toolCallsCount, + toolsUsed: rawResult.toolsCalled, + tokenUsage: rawResult.tokenUsage + } + } finally { + await cleanup() + } +} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appFixtureLoader.ts b/ai_evals/adapters/frontend/core/app/appFixtureLoader.ts similarity index 97% rename from frontend/src/lib/components/copilot/chat/__tests__/app/appFixtureLoader.ts rename to ai_evals/adapters/frontend/core/app/appFixtureLoader.ts index 8d3be427e4..3a1cacdb3e 100644 --- a/frontend/src/lib/components/copilot/chat/__tests__/app/appFixtureLoader.ts +++ b/ai_evals/adapters/frontend/core/app/appFixtureLoader.ts @@ -1,4 +1,8 @@ -import type { AppFiles, BackendRunnable, InlineScript } from '../../app/core' +import type { + AppFiles, + BackendRunnable, + InlineScript +} from '../../../../../frontend/src/lib/components/copilot/chat/app/core' /** * Backend runnable metadata stored in meta.json files. 
diff --git a/ai_evals/adapters/frontend/core/app/fileHelpers.ts b/ai_evals/adapters/frontend/core/app/fileHelpers.ts new file mode 100644 index 0000000000..02bfe799c5 --- /dev/null +++ b/ai_evals/adapters/frontend/core/app/fileHelpers.ts @@ -0,0 +1,255 @@ +import { mkdir, rm, writeFile } from 'fs/promises' +import { dirname, join } from 'path' +import type { + AppAIChatHelpers, + AppFiles, + BackendRunnable, + DataTableSchema, + LintResult, + SelectedContext +} from '../../../../../frontend/src/lib/components/copilot/chat/app/core' + +function createEmptyLintResult(): LintResult { + return { + errorCount: 0, + warningCount: 0, + errors: { frontend: {}, backend: {} }, + warnings: { frontend: {}, backend: {} } + } +} + +async function writeFrontendFile( + workspaceRoot: string | undefined, + path: string, + content: string +): Promise { + if (!workspaceRoot) { + return + } + const relativePath = path.startsWith('/') ? path.slice(1) : path + const fullPath = join(workspaceRoot, 'frontend', relativePath) + await mkdir(dirname(fullPath), { recursive: true }) + await writeFile(fullPath, content, 'utf8') +} + +async function removeFrontendFile(workspaceRoot: string | undefined, path: string): Promise { + if (!workspaceRoot) { + return + } + const relativePath = path.startsWith('/') ? path.slice(1) : path + await rm(join(workspaceRoot, 'frontend', relativePath), { force: true }) +} + +async function writeBackendRunnable( + workspaceRoot: string | undefined, + key: string, + runnable: BackendRunnable +): Promise { + if (!workspaceRoot) { + return + } + const runnableDir = join(workspaceRoot, 'backend', key) + await mkdir(runnableDir, { recursive: true }) + + const meta: { name: string; language?: string; type?: string; path?: string } = { + name: runnable.name + } + + if (runnable.type === 'inline' && runnable.inlineScript) { + meta.language = runnable.inlineScript.language + const extension = runnable.inlineScript.language === 'python3' ? 
'py' : 'ts' + await writeFile( + join(runnableDir, `main.${extension}`), + runnable.inlineScript.content, + 'utf8' + ) + } else { + meta.type = runnable.type + if (runnable.path) { + meta.path = runnable.path + } + } + + await writeFile(join(runnableDir, 'meta.json'), JSON.stringify(meta, null, 2) + '\n', 'utf8') +} + +async function removeBackendRunnable(workspaceRoot: string | undefined, key: string): Promise { + if (!workspaceRoot) { + return + } + await rm(join(workspaceRoot, 'backend', key), { recursive: true, force: true }) +} + +async function persistDatatables( + workspaceRoot: string | undefined, + datatables: DataTableSchema[] +): Promise { + if (!workspaceRoot) { + return + } + await writeFile( + join(workspaceRoot, 'datatables.json'), + JSON.stringify(datatables, null, 2) + '\n', + 'utf8' + ) +} + +export async function createAppFileHelpers( + initialFrontend: Record = {}, + initialBackend: Record = {}, + workspaceRoot?: string +): Promise<{ + helpers: AppAIChatHelpers + getFiles: () => AppFiles + getFrontend: () => Record + getBackend: () => Record + cleanup: () => Promise + workspaceDir: string | null +}> { + let frontend = { ...initialFrontend } + let backend = { ...initialBackend } + let snapshotId = 0 + const snapshots = new Map< + number, + { frontend: Record; backend: Record } + >() + const datatables: DataTableSchema[] = [] + + for (const [path, content] of Object.entries(frontend)) { + await writeFrontendFile(workspaceRoot, path, content) + } + for (const [key, runnable] of Object.entries(backend)) { + await writeBackendRunnable(workspaceRoot, key, runnable) + } + await persistDatatables(workspaceRoot, datatables) + + const helpers: AppAIChatHelpers = { + listFrontendFiles: () => Object.keys(frontend), + getFrontendFile: (path: string) => frontend[path], + getFrontendFiles: () => ({ ...frontend }), + setFrontendFile: (path: string, content: string) => { + frontend[path] = content + void writeFrontendFile(workspaceRoot, path, content) + return 
createEmptyLintResult() + }, + deleteFrontendFile: (path: string) => { + delete frontend[path] + void removeFrontendFile(workspaceRoot, path) + }, + listBackendRunnables: () => + Object.entries(backend).map(([key, runnable]) => ({ + key, + name: runnable.name + })), + getBackendRunnable: (key: string) => backend[key], + getBackendRunnables: () => ({ ...backend }), + setBackendRunnable: async (key: string, runnable: BackendRunnable) => { + backend[key] = runnable + await writeBackendRunnable(workspaceRoot, key, runnable) + return createEmptyLintResult() + }, + deleteBackendRunnable: (key: string) => { + delete backend[key] + void removeBackendRunnable(workspaceRoot, key) + }, + getFiles: (): AppFiles => ({ + frontend: { ...frontend }, + backend: { ...backend } + }), + getSelectedContext: (): SelectedContext => ({ type: 'none' }), + snapshot: () => { + const id = ++snapshotId + snapshots.set(id, { + frontend: { ...frontend }, + backend: { ...backend } + }) + return id + }, + revertToSnapshot: (id: number) => { + const snapshot = snapshots.get(id) + if (!snapshot) { + return + } + frontend = { ...snapshot.frontend } + backend = { ...snapshot.backend } + void syncWorkspace() + }, + lint: () => createEmptyLintResult(), + getDatatables: async () => structuredClone(datatables), + getAvailableDatatableNames: () => datatables.map((datatable) => datatable.datatable_name), + execDatatableSql: async ( + datatableName: string, + sql: string, + newTable?: { schema: string; name: string } + ) => { + if (newTable) { + datatables.push({ + datatable_name: datatableName, + schemas: { + [newTable.schema]: { + [newTable.name]: {} + } + } + }) + await persistDatatables(workspaceRoot, datatables) + } + return { + success: true, + result: [ + { + datatableName, + sql + } + ] + } + }, + addTableToWhitelist: (datatableName: string, schemaName: string, tableName: string) => { + const existing = datatables.find((entry) => entry.datatable_name === datatableName) + if (existing) { + 
existing.schemas[schemaName] ??= {} + existing.schemas[schemaName][tableName] ??= {} + } else { + datatables.push({ + datatable_name: datatableName, + schemas: { + [schemaName]: { + [tableName]: {} + } + } + }) + } + void persistDatatables(workspaceRoot, datatables) + } + } + + async function syncWorkspace(): Promise { + if (!workspaceRoot) { + return + } + await rm(join(workspaceRoot, 'frontend'), { recursive: true, force: true }) + await rm(join(workspaceRoot, 'backend'), { recursive: true, force: true }) + for (const [path, content] of Object.entries(frontend)) { + await writeFrontendFile(workspaceRoot, path, content) + } + for (const [key, runnable] of Object.entries(backend)) { + await writeBackendRunnable(workspaceRoot, key, runnable) + } + await persistDatatables(workspaceRoot, datatables) + } + + return { + helpers, + getFiles: () => ({ + frontend: { ...frontend }, + backend: { ...backend } + }), + getFrontend: () => ({ ...frontend }), + getBackend: () => ({ ...backend }), + cleanup: async () => { + if (workspaceRoot) { + await rm(workspaceRoot, { recursive: true, force: true }) + } + }, + workspaceDir: workspaceRoot ?? 
null + } +} diff --git a/ai_evals/adapters/frontend/core/flow/fileHelpers.ts b/ai_evals/adapters/frontend/core/flow/fileHelpers.ts new file mode 100644 index 0000000000..22f6587de3 --- /dev/null +++ b/ai_evals/adapters/frontend/core/flow/fileHelpers.ts @@ -0,0 +1,161 @@ +import { mkdir, rm, writeFile } from 'fs/promises' +import { dirname, join } from 'path' +import type { FlowModule, InputTransform } from '../../../../../frontend/src/lib/gen' +import type { ExtendedOpenFlow } from '../../../../../frontend/src/lib/components/flows/types' +import type { FlowAIChatHelpers } from '../../../../../frontend/src/lib/components/copilot/chat/flow/core' +import type { ScriptLintResult } from '../../../../../frontend/src/lib/components/copilot/chat/shared' +import { findModuleById } from '../../../../../frontend/src/lib/components/copilot/chat/shared' +import { + createInlineScriptSession +} from '../../../../../frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils' +import { + registerBenchmarkWorkspace, + registerBenchmarkWorkspaceRunnables, + unregisterBenchmarkWorkspaceRunnables, + createBenchmarkCompletedJob, + type BenchmarkWorkspaceFlow, + type BenchmarkWorkspaceScript +} from '../../mockBackend' + +const EMPTY_SCRIPT_LINT_RESULT: ScriptLintResult = { + errorCount: 0, + warningCount: 0, + errors: [], + warnings: [] +} + +export interface FlowWorkspaceFixtures { + scripts?: BenchmarkWorkspaceScript[] + flows?: BenchmarkWorkspaceFlow[] +} + +export async function createFlowFileHelpers( + initialModules: FlowModule[] = [], + initialSchema?: Record, + workspaceRoot?: string, + workspaceFixtures?: FlowWorkspaceFixtures +): Promise<{ + helpers: FlowAIChatHelpers + getFlow: () => ExtendedOpenFlow + getModules: () => FlowModule[] + cleanup: () => Promise + workspaceDir: string | null +}> { + let flow: ExtendedOpenFlow = { + value: { modules: structuredClone(initialModules) }, + summary: '', + schema: initialSchema ?? 
{ + $schema: 'https://json-schema.org/draft/2020-12/schema', + properties: {}, + required: [], + type: 'object' + } + } + const inlineScriptSession = createInlineScriptSession() + + const flowFilePath = workspaceRoot ? join(workspaceRoot, 'flow.json') : null + + async function persistFlow(): Promise { + if (!flowFilePath) { + return + } + await mkdir(dirname(flowFilePath), { recursive: true }) + await writeFile(flowFilePath, JSON.stringify(flow, null, 2) + '\n', 'utf8') + } + + await persistFlow() + + if (workspaceRoot) { + registerBenchmarkWorkspace(workspaceRoot) + if (workspaceFixtures) { + registerBenchmarkWorkspaceRunnables(workspaceRoot, workspaceFixtures) + } + } + + const helpers: FlowAIChatHelpers = { + getFlowAndSelectedId: () => ({ flow, selectedId: '' }), + getModules: (id?: string) => { + if (!id) return flow.value.modules + const module = findModuleById(flow.value.modules, id) + return module ? [module] : [] + }, + inlineScriptSession, + setSnapshot: () => {}, + revertToSnapshot: () => {}, + setCode: async (id: string, code: string) => { + const module = findModuleById(flow.value.modules, id) + if (module && module.value.type === 'rawscript') { + module.value.content = code + } + inlineScriptSession.set(id, code) + await persistFlow() + }, + setFlowJson: async ( + modules: FlowModule[] | undefined, + schema: Record | undefined + ) => { + if (modules) { + flow.value.modules = inlineScriptSession.restoreInlineScriptReferences(modules) + const unresolvedRefs = inlineScriptSession.findUnresolvedInlineScriptRefs(flow.value.modules) + if (unresolvedRefs.length > 0) { + throw new Error( + `Unresolved inline script references: ${unresolvedRefs.join(', ')}` + ) + } + } + if (schema !== undefined) { + flow.schema = schema + } + await persistFlow() + }, + getFlowInputsSchema: async () => flow.schema ?? 
{}, + updateExprsToSet: (_id: string, _inputTransforms: Record) => {}, + acceptAllModuleActions: () => {}, + rejectAllModuleActions: () => {}, + hasPendingChanges: () => false, + selectStep: (_id: string) => {}, + testFlow: async (args?: Record) => { + if (workspaceRoot) { + const runPath = join(workspaceRoot, 'test-run.json') + await writeFile( + runPath, + JSON.stringify( + { + requestedArgs: args ?? {}, + modules: flow.value.modules.map((module) => module.id) + }, + null, + 2 + ) + '\n', + 'utf8' + ) + } + return createBenchmarkCompletedJob({ + workspace: workspaceRoot ?? 'benchmark', + jobKind: 'flowpreview', + result: { + requestedArgs: args ?? {}, + modules: flow.value.modules.map((module) => module.id), + mocked: true + }, + logs: 'Mock benchmark flow test run completed successfully.' + }) + }, + getLintErrors: async () => EMPTY_SCRIPT_LINT_RESULT + } + + return { + helpers, + getFlow: () => flow, + getModules: () => flow.value.modules, + cleanup: async () => { + if (workspaceRoot) { + unregisterBenchmarkWorkspaceRunnables(workspaceRoot) + } + if (workspaceRoot) { + await rm(workspaceRoot, { recursive: true, force: true }) + } + }, + workspaceDir: workspaceRoot ?? 
null + } +} diff --git a/ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts b/ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts new file mode 100644 index 0000000000..a07e22f584 --- /dev/null +++ b/ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts @@ -0,0 +1,103 @@ +import { mkdtemp } from 'fs/promises' +import { tmpdir } from 'os' +import { join } from 'path' +import type { FlowModule } from '$lib/gen' +import type { AIProvider } from '$lib/gen/types.gen' +import type { ExtendedOpenFlow } from '$lib/components/flows/types' +import { + flowTools, + prepareFlowSystemMessage, + prepareFlowUserMessage, + type FlowAIChatHelpers +} from '../../../../../frontend/src/lib/components/copilot/chat/flow/core' +import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared' +import { createFlowFileHelpers, type FlowWorkspaceFixtures } from './fileHelpers' +import { runEval } from '../shared' +import type { ModeRunContext } from '../../../../core/types' +import type { TokenUsage } from '../shared/types' + +export interface FlowFixture { + value?: { + modules?: FlowModule[] + } + schema?: Record +} + +export interface FlowEvalResult { + success: boolean + flow: ExtendedOpenFlow + error?: string + assistantMessageCount: number + toolCallCount: number + toolsUsed: string[] + tokenUsage: TokenUsage +} + +export interface FlowEvalOptions { + initialFlow?: FlowFixture + workspaceFixtures?: FlowWorkspaceFixtures + model?: string + maxIterations?: number + provider?: AIProvider + workspaceRoot?: string + runContext?: ModeRunContext +} + +export async function runFlowEval( + userPrompt: string, + apiKey: string, + options?: FlowEvalOptions +): Promise { + const workspaceRoot = + options?.workspaceRoot ?? + (await mkdtemp(join(tmpdir(), 'wmill-frontend-flow-benchmark-'))) + const { helpers, getFlow, cleanup } = await createFlowFileHelpers( + options?.initialFlow?.value?.modules ?? 
[], + options?.initialFlow?.schema, + workspaceRoot, + options?.workspaceFixtures + ) + + try { + const systemMessage = prepareFlowSystemMessage() + const tools = flowTools as ProductionTool[] + const model = options?.model ?? 'claude-haiku-4-5-20251001' + const userMessage = prepareFlowUserMessage( + userPrompt, + helpers.getFlowAndSelectedId(), + [], + helpers.inlineScriptSession + ) + + const rawResult = await runEval({ + userPrompt, + systemMessage, + userMessage, + tools, + helpers, + apiKey, + getOutput: getFlow, + onAssistantMessageStart: options?.runContext?.onAssistantMessageStart, + onAssistantToken: options?.runContext?.onAssistantChunk, + onAssistantMessageEnd: options?.runContext?.onAssistantMessageEnd, + options: { + maxIterations: options?.maxIterations, + model, + workspace: workspaceRoot, + provider: options?.provider + } + }) + + return { + flow: rawResult.output, + success: rawResult.success, + error: rawResult.error, + assistantMessageCount: rawResult.iterations, + toolCallCount: rawResult.toolCallsCount, + toolsUsed: rawResult.toolsCalled, + tokenUsage: rawResult.tokenUsage + } + } finally { + await cleanup() + } +} diff --git a/ai_evals/adapters/frontend/core/script/fileHelpers.ts b/ai_evals/adapters/frontend/core/script/fileHelpers.ts new file mode 100644 index 0000000000..3d6f3b1139 --- /dev/null +++ b/ai_evals/adapters/frontend/core/script/fileHelpers.ts @@ -0,0 +1,73 @@ +import { mkdir, rm, writeFile } from 'fs/promises' +import { dirname, join } from 'path' +import type { ScriptLang } from '../../../../../frontend/src/lib/gen/types.gen' +import type { ReviewChangesOpts } from '../../../../../frontend/src/lib/components/copilot/chat/monaco-adapter' +import type { ScriptChatHelpers } from '../../../../../frontend/src/lib/components/copilot/chat/script/core' +import { buildScriptLintResult } from './preview' +import { registerBenchmarkWorkspace, unregisterBenchmarkWorkspace } from '../../mockBackend' + +export interface ScriptEvalState { + 
/**
 * Creates the mutable script state and chat helpers used by a script eval run.
 *
 * The initial script is deep-cloned so the caller's object is never mutated.
 * When `workspaceRoot` is truthy, the script code is written to
 * `<workspaceRoot>/<script.path>` immediately and again after every applied
 * edit, and the root is registered with the mock backend.
 *
 * @param initialScript - Starting script state (code, language, path, args).
 * @param workspaceRoot - Optional directory that mirrors the script on disk.
 * @returns `helpers` for the chat tools, `getScript` (returns a deep copy of
 *   the current state), `cleanup` (unregisters the workspace and recursively
 *   removes `workspaceRoot`), and `workspaceDir` (`workspaceRoot ?? null`).
 */
export async function createScriptFileHelpers(
	initialScript: ScriptEvalState,
	workspaceRoot?: string
): Promise<{
	helpers: ScriptChatHelpers
	getScript: () => ScriptEvalState
	cleanup: () => Promise<void>
	workspaceDir: string | null
}> {
	let current = structuredClone(initialScript)
	const targetFile = workspaceRoot ? join(workspaceRoot, current.path) : null

	// Mirrors the current code to disk; a no-op when no workspace root was given.
	const flushToDisk = async (): Promise<void> => {
		if (!targetFile) {
			return
		}
		await mkdir(dirname(targetFile), { recursive: true })
		await writeFile(targetFile, current.code, 'utf8')
	}

	await flushToDisk()

	if (workspaceRoot) {
		registerBenchmarkWorkspace(workspaceRoot)
	}

	const applyCode = async (code: string, opts?: ReviewChangesOpts): Promise<void> => {
		// Revert requests are ignored: the eval keeps the last applied code.
		if (opts?.mode === 'revert') {
			return
		}
		current = { ...current, code }
		await flushToDisk()
	}

	const cleanup = async (): Promise<void> => {
		if (!workspaceRoot) {
			return
		}
		unregisterBenchmarkWorkspace(workspaceRoot)
		await rm(workspaceRoot, { recursive: true, force: true })
	}

	const helpers: ScriptChatHelpers = {
		getScriptOptions: () => ({
			code: current.code,
			lang: current.lang,
			path: current.path,
			args: structuredClone(current.args)
		}),
		applyCode,
		getLintErrors: () => buildScriptLintResult(current.code, current.lang)
	}

	return {
		helpers,
		getScript: () => structuredClone(current),
		cleanup,
		workspaceDir: workspaceRoot ?? null
	}
}
/**
 * Builds a lint result for a script without a running language server.
 *
 * For TypeScript-like languages the code is passed through
 * `ts.transpileModule` and every reported diagnostic becomes an error entry
 * (severity 8). Independently of the language, a missing exported `main` or
 * `preprocessor` function is reported as an error at position 1:1.
 *
 * @param code - Script source to check.
 * @param lang - Script language; languages without compiler options only get
 *   the entrypoint check.
 * @returns Error entries plus counts; warnings are always empty.
 */
export function buildScriptLintResult(
	code: string,
	lang: ScriptPreviewLanguage
): ScriptLintResult {
	const diagnostics: ScriptLintResult['errors'] = []
	const compilerOptions = compilerOptionsForLanguage(lang)

	if (compilerOptions) {
		// The source file is only used to supply the file name to transpileModule.
		const sourceFile = ts.createSourceFile(
			'script.ts',
			code,
			ts.ScriptTarget.ES2022,
			true,
			JS_LIKE_LANGUAGES.has(lang) ? ts.ScriptKind.TS : ts.ScriptKind.JS
		)
		const output = ts.transpileModule(code, {
			compilerOptions,
			fileName: sourceFile.fileName,
			reportDiagnostics: true
		})

		for (const diagnostic of output.diagnostics ?? []) {
			const start = diagnostic.start ?? 0
			const length = Math.max(1, diagnostic.length ?? 1)
			const { line, column } = getLineAndColumn(code, start)
			// A span that crosses a newline must end on a later line; keeping the
			// start line would produce an end column beyond that line's length.
			const crossesNewline = code.slice(start, start + length).includes('\n')
			const end = crossesNewline
				? getLineAndColumn(code, start + length)
				: { line, column: column + length }
			const message = ts.flattenDiagnosticMessageText(diagnostic.messageText, '\n')
			diagnostics.push({
				startLineNumber: line,
				startColumn: column,
				endLineNumber: end.line,
				endColumn: end.column,
				message,
				severity: 8
			} as ScriptLintResult['errors'][number])
		}
	}

	if (!hasSupportedEntrypoint(code)) {
		diagnostics.push({
			startLineNumber: 1,
			startColumn: 1,
			endLineNumber: 1,
			endColumn: 1,
			message: 'Script must export a main or preprocessor function.',
			severity: 8
		} as ScriptLintResult['errors'][number])
	}

	return {
		errorCount: diagnostics.length,
		warningCount: 0,
		errors: diagnostics,
		warnings: []
	}
}
/**
 * Pairs a model id with the provider that serves it.
 *
 * An explicit `provider` always wins. Otherwise the provider is inferred from
 * the model id prefix: `claude*` → anthropic, `gemini*` → googleai, anything
 * else → openai. The `gemini` rule keeps this helper in line with
 * `resolveEvalModelProvider`, so Gemini models are not routed to OpenAI.
 *
 * @param model - Model identifier, e.g. `claude-haiku-4-5-20251001`.
 * @param provider - Optional explicit provider override.
 * @returns The `{ provider, model }` pair.
 */
function resolveModelProvider(
	model: string,
	provider?: AIProvider
): AIProviderModel {
	if (provider) {
		return { provider, model }
	}
	if (model.startsWith('claude')) {
		return { provider: 'anthropic', model }
	}
	if (model.startsWith('gemini')) {
		return { provider: 'googleai', model }
	}
	return { provider: 'openai', model }
}
'claude-haiku-4-5-20251001' + const modelProvider = resolveModelProvider(model, options.provider) + const selectedContext: ContextElement[] = [] + const systemMessage = prepareScriptSystemMessage( + modelProvider, + options.initialScript.lang, + {} + ) + const tools = prepareScriptTools( + modelProvider, + options.initialScript.lang, + selectedContext + ) as ProductionTool[] + const userMessage = prepareScriptUserMessage(userPrompt, selectedContext) + + const rawResult = await runEval({ + userPrompt, + systemMessage, + userMessage, + tools, + helpers, + apiKey, + getOutput: getScript, + onAssistantMessageStart: options.runContext?.onAssistantMessageStart, + onAssistantToken: options.runContext?.onAssistantChunk, + onAssistantMessageEnd: options.runContext?.onAssistantMessageEnd, + options: { + maxIterations: options.maxIterations, + model, + workspace: workspaceRoot, + provider: modelProvider.provider + } + }) + + return { + script: rawResult.output, + success: rawResult.success, + error: rawResult.error, + assistantMessageCount: rawResult.iterations, + toolCallCount: rawResult.toolCallsCount, + toolsUsed: rawResult.toolsCalled, + tokenUsage: rawResult.tokenUsage + } + } finally { + await cleanup() + } +} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseEvalRunner.ts b/ai_evals/adapters/frontend/core/shared/baseEvalRunner.ts similarity index 66% rename from frontend/src/lib/components/copilot/chat/__tests__/shared/baseEvalRunner.ts rename to ai_evals/adapters/frontend/core/shared/baseEvalRunner.ts index f46acb9108..964a349785 100644 --- a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseEvalRunner.ts +++ b/ai_evals/adapters/frontend/core/shared/baseEvalRunner.ts @@ -1,29 +1,19 @@ -import OpenAI from 'openai' -import Anthropic from '@anthropic-ai/sdk' import type { ChatCompletionMessageParam, ChatCompletionSystemMessageParam } from 'openai/resources/chat/completions.mjs' -import type { AIProvider, AIProviderModel } from 
'$lib/gen/types.gen' -import type { TokenUsage, ToolCallDetail, EvalRunnerOptions } from './types' -import type { Tool } from './baseVariants' -import { runChatLoop, type ChatClients } from '../../chatLoop' -import type { Tool as ProductionTool, ToolCallbacks } from '../../shared' - -/** - * Result from a single eval run (before domain-specific evaluation). - */ -export interface RawEvalResult { - success: boolean - output: TOutput - error?: string - tokenUsage: TokenUsage - toolCallsCount: number - toolsCalled: string[] - toolCallDetails: ToolCallDetail[] - iterations: number - messages: ChatCompletionMessageParam[] -} +import type { AIProviderModel } from '$lib/gen/types.gen' +import type { TokenUsage, ToolCallDetail, EvalRunnerOptions, RawEvalResult } from './types' +import { runChatLoop, type ChatClients } from '../../../../../frontend/src/lib/components/copilot/chat/chatLoop' +import type { + Tool as ProductionTool, + ToolCallbacks +} from '../../../../../frontend/src/lib/components/copilot/chat/shared' +import { + createEvalClients, + type FrontendEvalProvider, + resolveEvalModelProvider +} from './providerConfig' /** * Parameters for running a base evaluation. @@ -38,7 +28,7 @@ export interface RunEvalParams { /** Tool definitions for the LLM API (unused — derived from tools) */ toolDefs?: unknown /** Full tool implementations for execution */ - tools: Tool[] + tools: ProductionTool[] /** Domain-specific helpers for tool execution */ helpers: THelpers /** API key for the provider */ @@ -47,35 +37,9 @@ export interface RunEvalParams { getOutput: () => TOutput /** Optional configuration */ options?: EvalRunnerOptions -} - -/** - * Creates SDK clients for the given provider. 
- */ -function createEvalClients(provider: AIProvider, apiKey: string): ChatClients { - if (provider === 'anthropic') { - return { - openai: new OpenAI({ apiKey: 'unused' }), - anthropic: new Anthropic({ apiKey }) - } - } - return { - openai: new OpenAI({ apiKey }), - anthropic: new Anthropic({ apiKey: 'unused' }) - } -} - -/** - * Resolves model string to AIProviderModel. - */ -function resolveModelProvider( - model: string, - provider?: AIProvider -): AIProviderModel { - if (provider) return { provider, model } - if (model.startsWith('claude')) return { provider: 'anthropic', model } - if (model.startsWith('gpt') || model.startsWith('o')) return { provider: 'openai', model } - return { provider: 'openai', model } + onAssistantMessageStart?: () => void + onAssistantToken?: (token: string) => void + onAssistantMessageEnd?: () => void } /** @@ -92,16 +56,23 @@ export async function runEval( helpers, apiKey, getOutput, - options + options, + onAssistantMessageStart, + onAssistantToken, + onAssistantMessageEnd } = params + let shouldEmitMessageStart = true const model = options?.model ?? 'gpt-4o' const maxIterations = options?.maxIterations ?? 20 const workspace = options?.workspace ?? 
'test-workspace' const provider = options?.provider - const modelProvider = resolveModelProvider(model, provider) - const clients = createEvalClients(modelProvider.provider, apiKey) + const modelProvider = resolveEvalModelProvider( + model, + provider as FrontendEvalProvider | undefined + ) as AIProviderModel + const clients = createEvalClients(modelProvider.provider, apiKey) as ChatClients const messages: ChatCompletionMessageParam[] = [userMessage] let toolCallsCount = 0 @@ -128,7 +99,7 @@ export async function runEval( } return tool.fn(p) } - })) as ProductionTool[] + })) // No-op callbacks for eval const callbacks: ToolCallbacks & { @@ -137,8 +108,19 @@ export async function runEval( } = { setToolStatus: () => {}, removeToolStatus: () => {}, - onNewToken: () => {}, - onMessageEnd: () => {} + onNewToken: (token: string) => { + if (shouldEmitMessageStart) { + onAssistantMessageStart?.() + shouldEmitMessageStart = false + } + onAssistantToken?.(token) + }, + onMessageEnd: () => { + if (!shouldEmitMessageStart) { + onAssistantMessageEnd?.() + } + shouldEmitMessageStart = true + } } const abortController = new AbortController() @@ -161,7 +143,7 @@ export async function runEval( return { success: true, output: getOutput(), - tokenUsage: { prompt: 0, completion: 0, total: 0 }, + tokenUsage: result.tokenUsage, toolCallsCount, toolsCalled, toolCallDetails, diff --git a/ai_evals/adapters/frontend/core/shared/index.ts b/ai_evals/adapters/frontend/core/shared/index.ts new file mode 100644 index 0000000000..290abc8b0f --- /dev/null +++ b/ai_evals/adapters/frontend/core/shared/index.ts @@ -0,0 +1,3 @@ +export type { TokenUsage, ToolCallDetail, EvalRunnerOptions, RawEvalResult } from './types' +export type { RunEvalParams } from './baseEvalRunner' +export { runEval } from './baseEvalRunner' diff --git a/ai_evals/adapters/frontend/core/shared/providerConfig.test.ts b/ai_evals/adapters/frontend/core/shared/providerConfig.test.ts new file mode 100644 index 0000000000..ad0ef7652a 
--- /dev/null +++ b/ai_evals/adapters/frontend/core/shared/providerConfig.test.ts @@ -0,0 +1,41 @@ +import { describe, expect, it } from "bun:test"; +import { + buildOpenAICompatibleClientOptions, + resolveEvalModelProvider, +} from "./providerConfig"; + +describe("buildOpenAICompatibleClientOptions", () => { + it("adds Gemini's OpenAI-compatible base URL and client header", () => { + const options = buildOpenAICompatibleClientOptions("googleai", "gemini-test-key"); + + expect(options).toMatchObject({ + apiKey: "gemini-test-key", + baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/", + defaultHeaders: { + "x-goog-api-client": "windmill-ai-evals/1.0", + }, + }); + }); + + it("keeps the default OpenAI-compatible config for OpenAI", () => { + expect(buildOpenAICompatibleClientOptions("openai", "openai-test-key")).toEqual({ + apiKey: "openai-test-key", + }); + }); +}); + +describe("resolveEvalModelProvider", () => { + it("infers googleai from Gemini model ids", () => { + expect(resolveEvalModelProvider("gemini-2.5-flash")).toEqual({ + provider: "googleai", + model: "gemini-2.5-flash", + }); + }); + + it("preserves an explicit provider", () => { + expect(resolveEvalModelProvider("gemini-2.5-pro", "googleai")).toEqual({ + provider: "googleai", + model: "gemini-2.5-pro", + }); + }); +}); diff --git a/ai_evals/adapters/frontend/core/shared/providerConfig.ts b/ai_evals/adapters/frontend/core/shared/providerConfig.ts new file mode 100644 index 0000000000..44a698b2af --- /dev/null +++ b/ai_evals/adapters/frontend/core/shared/providerConfig.ts @@ -0,0 +1,71 @@ +import Anthropic from "@anthropic-ai/sdk"; +import OpenAI from "openai"; +import type { FrontendEvalModelConfig } from "../../../../core/models"; + +export type FrontendEvalProvider = FrontendEvalModelConfig["provider"]; + +export interface EvalClients { + openai: OpenAI; + anthropic: Anthropic; +} + +export interface ResolvedEvalModelProvider { + provider: FrontendEvalProvider; + model: string; +} + 
/**
 * Resolves the provider for a frontend eval model.
 *
 * A truthy `provider` is returned unchanged. Otherwise the provider is
 * inferred from the model id prefix: `claude*` → anthropic,
 * `gemini*` → googleai, and every other id (including `gpt*` and `o*`)
 * → openai.
 *
 * @param model - Model identifier.
 * @param provider - Optional explicit provider.
 * @returns The resolved `{ provider, model }` pair.
 */
export function resolveEvalModelProvider(
  model: string,
  provider?: FrontendEvalProvider
): ResolvedEvalModelProvider {
  if (provider) {
    return { provider, model };
  }

  const inferred: FrontendEvalProvider = model.startsWith("claude")
    ? "anthropic"
    : model.startsWith("gemini")
      ? "googleai"
      : "openai";

  return { provider: inferred, model };
}
/**
 * Forgets everything the mock backend holds for `workspace`: its
 * registration, its runnables, and every completed job recorded for it.
 *
 * @param workspace - Workspace identifier to drop.
 */
export function unregisterBenchmarkWorkspace(workspace: string): void {
	benchmarkWorkspaces.delete(workspace)
	benchmarkWorkspaceRunnables.delete(workspace)

	// Deleting the entry currently being visited is safe for a Map.
	benchmarkJobs.forEach((record, id) => {
		if (record.workspace === workspace) {
			benchmarkJobs.delete(id)
		}
	})
}
buildBenchmarkFlow(flow) : null +} + +export function createBenchmarkCompletedJob(input: { + workspace: string + jobKind: CompletedJob['job_kind'] + success?: boolean + result?: unknown + logs?: string + scriptPath?: string + scriptHash?: string + args?: Record +}): string { + const jobId = `benchmark-job-${randomUUID()}` + const now = new Date().toISOString() + const job: BenchmarkCompletedJob = { + type: 'CompletedJob', + id: jobId, + workspace_id: input.workspace, + created_by: 'ai-evals', + created_at: now, + started_at: now, + completed_at: now, + duration_ms: 0, + success: input.success ?? true, + script_path: input.scriptPath, + script_hash: input.scriptHash, + args: input.args, + result: input.result, + logs: input.logs, + canceled: false, + job_kind: input.jobKind, + permissioned_as: 'u/ai-evals', + is_flow_step: false, + is_skipped: false, + email: 'ai-evals@local', + visible_to_owner: true, + tag: 'benchmark' + } + + benchmarkJobs.set(jobId, { workspace: input.workspace, job }) + return jobId +} + +export function getBenchmarkCompletedJob( + workspace: string, + jobId: string +): BenchmarkCompletedJob | null { + const entry = benchmarkJobs.get(jobId) + if (!entry || entry.workspace !== workspace) { + return null + } + return structuredClone(entry.job) +} + +export function runBenchmarkScriptPreview(input: { + workspace: string + requestBody: { + content?: string + language?: ScriptLang | 'bunnative' + args?: Record + path?: string + } +}): string { + const content = input.requestBody.content ?? '' + const language = input.requestBody.language ?? 'bun' + const lintResult = buildScriptLintResult(content, language) + const success = lintResult.errorCount === 0 + + return createBenchmarkCompletedJob({ + workspace: input.workspace, + jobKind: 'preview', + success, + scriptPath: input.requestBody.path, + args: input.requestBody.args, + result: success + ? { + path: input.requestBody.path, + args: input.requestBody.args ?? 
/**
 * Simulates running a flow by path inside a benchmark workspace.
 *
 * No flow is executed. A completed `flowpreview` job is recorded, and it
 * succeeds only when the flow exists in the workspace's registered runnables.
 *
 * @param input - Workspace, flow path and optional run arguments.
 * @returns The id of the recorded completed job.
 */
export function runBenchmarkFlowByPath(input: {
	workspace: string
	path: string
	args?: Record<string, unknown>
}): string {
	const { workspace, path, args } = input
	const exists = getBenchmarkFlowByPath(workspace, path) !== null

	if (!exists) {
		return createBenchmarkCompletedJob({
			workspace,
			jobKind: 'flowpreview',
			success: false,
			args,
			result: {
				error: `Flow "${path}" not found in benchmark workspace`
			},
			logs: `Flow "${path}" not found in benchmark workspace.`
		})
	}

	return createBenchmarkCompletedJob({
		workspace,
		jobKind: 'flowpreview',
		success: true,
		args,
		result: {
			path,
			args: args ?? {},
			mocked: true
		},
		logs: 'Mock benchmark flow run completed successfully.'
	})
}
{}, + edited_by: 'benchmark', + edited_at: BENCHMARK_TIMESTAMP, + archived: false, + extra_perms: {} + } as Flow +} diff --git a/ai_evals/adapters/frontend/progress.ts b/ai_evals/adapters/frontend/progress.ts new file mode 100644 index 0000000000..5ec414c9fd --- /dev/null +++ b/ai_evals/adapters/frontend/progress.ts @@ -0,0 +1,133 @@ +export type FrontendBenchmarkProgressSurface = 'flow' | 'app' | 'script' + +export type FrontendBenchmarkProgressEvent = + | { + type: 'run-start' + surface: FrontendBenchmarkProgressSurface + totalCases: number + runs: number + concurrency: number + } + | { + type: 'attempt-start' + surface: FrontendBenchmarkProgressSurface + caseId: string + caseNumber: number + totalCases: number + attempt: number + runs: number + } + | { + type: 'attempt-finish' + surface: FrontendBenchmarkProgressSurface + caseId: string + caseNumber: number + totalCases: number + attempt: number + runs: number + passed: boolean + durationMs: number + judgeScore: number | null + error: string | null + } + | { + type: 'assistant-message-start' + surface: FrontendBenchmarkProgressSurface + caseId: string + caseNumber: number + totalCases: number + attempt: number + runs: number + } + | { + type: 'assistant-chunk' + surface: FrontendBenchmarkProgressSurface + caseId: string + caseNumber: number + totalCases: number + attempt: number + runs: number + chunk: string + } + | { + type: 'assistant-message-end' + surface: FrontendBenchmarkProgressSurface + caseId: string + caseNumber: number + totalCases: number + attempt: number + runs: number + } + +export const FRONTEND_BENCHMARK_PROGRESS_PREFIX = 'WMILL_FRONTEND_AI_EVAL_PROGRESS ' + +export function emitFrontendBenchmarkProgress(event: FrontendBenchmarkProgressEvent): void { + process.stderr.write( + `${FRONTEND_BENCHMARK_PROGRESS_PREFIX}${JSON.stringify(event)}\n` + ) +} + +export function parseFrontendBenchmarkProgressLine( + line: string +): FrontendBenchmarkProgressEvent | null { + if 
/**
 * Renders a progress event as a single human-readable status line.
 *
 * Assistant streaming events (`assistant-message-start`, `assistant-chunk`,
 * `assistant-message-end`) yield an empty string here.
 *
 * @param event - Progress event to describe.
 * @returns The status line, or `''` for assistant streaming events.
 */
export function formatFrontendBenchmarkProgressEvent(
	event: FrontendBenchmarkProgressEvent
): string {
	switch (event.type) {
		case 'assistant-message-start':
		case 'assistant-chunk':
		case 'assistant-message-end':
			return ''
		case 'run-start': {
			const plural = event.runs === 1 ? '' : 's'
			return `Running ${event.surface}: ${event.totalCases} cases x ${event.runs} run${plural}, concurrency ${event.concurrency}`
		}
		case 'attempt-start': {
			const prefix = formatCasePrefix(event.caseNumber, event.totalCases)
			return `${prefix} ${event.caseId} attempt ${event.attempt}/${event.runs}...`
		}
		case 'attempt-finish': {
			const prefix = formatCasePrefix(event.caseNumber, event.totalCases)
			const verdict = event.passed ? 'pass' : 'fail'
			const segments = [
				`${prefix} ${event.caseId} attempt ${event.attempt}/${event.runs} ${verdict}`,
				formatDuration(event.durationMs),
				// Optional segments: judge score when present, then a shortened error.
				...(event.judgeScore !== null ? [`judge ${formatNumber(event.judgeScore)}`] : []),
				...(event.error ? [truncateSingleLine(event.error, 120)] : [])
			]
			return segments.join(' | ')
		}
	}
}
/**
 * Collapses all whitespace runs in `value` to single spaces, trims the result,
 * and shortens it to at most `maxLength` characters. When shortening is
 * needed, the last three characters of the budget are replaced by `...`.
 *
 * @param value - Text that may contain newlines or repeated whitespace.
 * @param maxLength - Maximum length of the returned string.
 * @returns The single-line, possibly truncated text.
 */
function truncateSingleLine(value: string, maxLength: number): string {
	const flattened = value.split(/\s+/).join(' ').trim()
	const fits = flattened.length <= maxLength
	if (fits) {
		return flattened
	}

	// Reserve three characters for the ellipsis; never use a negative budget.
	const keep = Math.max(0, maxLength - 3)
	return flattened.slice(0, keep) + '...'
}
/**
 * Consumes every complete (newline-terminated) line in `buffer`.
 *
 * Progress lines are rendered straight to this process's stderr: assistant
 * events are streamed as a header, raw chunks and a closing newline, and all
 * other events are printed as one formatted status line. Non-progress lines
 * are echoed to stderr and also collected in `passthrough`, unless they match
 * the suppression list.
 *
 * There is deliberately a single signature. An extra one-parameter overload
 * would hide the `initialAssistantStreamOpen` parameter from callers and make
 * the two-argument call sites fail type-checking.
 *
 * @param buffer - Pending stderr text, possibly ending in a partial line.
 * @param initialAssistantStreamOpen - Whether an assistant message is
 *   currently being streamed to stderr without a trailing newline.
 * @returns `remainder` (the trailing partial line), `passthrough` (the
 *   non-progress lines that were echoed) and `nextAssistantStreamOpen`
 *   (the updated streaming state).
 */
function drainProgressLines(
	buffer: string,
	initialAssistantStreamOpen: boolean
): {
	remainder: string
	passthrough: string
	nextAssistantStreamOpen: boolean
} {
	let remainder = buffer
	let passthrough = ''
	let assistantStreamOpen = initialAssistantStreamOpen

	while (true) {
		const newlineIndex = remainder.indexOf('\n')
		if (newlineIndex === -1) {
			// Only a partial line (or nothing) is left; keep it for the next chunk.
			return { remainder, passthrough, nextAssistantStreamOpen: assistantStreamOpen }
		}

		// Strip a trailing CR so CRLF-terminated output parses the same way.
		const line = remainder.slice(0, newlineIndex).replace(/\r$/, '')
		remainder = remainder.slice(newlineIndex + 1)

		const progressEvent = parseFrontendBenchmarkProgressLine(line)
		if (progressEvent) {
			if (progressEvent.type === 'assistant-message-start') {
				// Close a still-open stream before printing the next header.
				if (assistantStreamOpen) {
					process.stderr.write('\n')
				}
				process.stderr.write(
					`${formatCasePrefix(progressEvent.caseNumber, progressEvent.totalCases)} ${progressEvent.caseId} attempt ${progressEvent.attempt}/${progressEvent.runs} assistant:\n`
				)
				assistantStreamOpen = true
				continue
			}

			if (progressEvent.type === 'assistant-chunk') {
				process.stderr.write(progressEvent.chunk)
				continue
			}

			if (progressEvent.type === 'assistant-message-end') {
				if (assistantStreamOpen) {
					process.stderr.write('\n')
				}
				assistantStreamOpen = false
				continue
			}

			// Any other event interrupts an open assistant stream.
			if (assistantStreamOpen) {
				process.stderr.write('\n')
				assistantStreamOpen = false
			}
			process.stderr.write(`${formatFrontendBenchmarkProgressEvent(progressEvent)}\n`)
			continue
		}

		if (shouldSuppressFrontendStderrLine(line)) {
			continue
		}

		passthrough += `${line}\n`
		process.stderr.write(`${line}\n`)
	}
}
') || + line.startsWith('Browserslist: browsers data (caniuse-lite) is ') || + line.includes('update-browserslist-db@latest') || + line.includes('update-db#readme') + ) +} + +function toErrorMessage(error: unknown): string { + if (error instanceof Error) { + return error.message + } + return String(error) +} diff --git a/ai_evals/adapters/frontend/vitest.config.ts b/ai_evals/adapters/frontend/vitest.config.ts new file mode 100644 index 0000000000..daed7749c1 --- /dev/null +++ b/ai_evals/adapters/frontend/vitest.config.ts @@ -0,0 +1,28 @@ +import { fileURLToPath } from 'node:url' +import frontendConfig from '../../../frontend/vite.config.js' + +const FRONTEND_VITE_CONFIG_PATH = fileURLToPath(new URL('../../../frontend/vite.config.js', import.meta.url)) +const FRONTEND_TEST_SETUP_PATH = fileURLToPath( + new URL('../../../frontend/src/lib/test-setup.ts', import.meta.url) +) +const ADAPTER_TEST_PATH = fileURLToPath(new URL('./vitestAdapter.test.ts', import.meta.url)) + +const config = { + ...frontendConfig, + test: { + ...frontendConfig.test, + projects: [ + { + extends: FRONTEND_VITE_CONFIG_PATH, + test: { + name: 'server', + environment: 'node', + include: [ADAPTER_TEST_PATH], + setupFiles: [FRONTEND_TEST_SETUP_PATH] + } + } + ] + } +} + +export default config diff --git a/ai_evals/adapters/frontend/vitestAdapter.test.ts b/ai_evals/adapters/frontend/vitestAdapter.test.ts new file mode 100644 index 0000000000..9256bc9334 --- /dev/null +++ b/ai_evals/adapters/frontend/vitestAdapter.test.ts @@ -0,0 +1,165 @@ +import { expect, it, vi } from 'vitest' +// @ts-ignore - Node.js fs/promises +import { mkdir, writeFile } from 'fs/promises' +// @ts-ignore - Node.js path +import { dirname, resolve } from 'path' + +vi.mock('monaco-editor', () => ({ + editor: {}, + languages: {}, + KeyCode: {}, + Uri: { + parse: (value: string) => ({ toString: () => value }) + }, + MarkerSeverity: { + Error: 8, + Warning: 4, + Info: 2, + Hint: 1 + } +})) + 
+vi.mock('@codingame/monaco-vscode-standalone-typescript-language-features', () => ({ + getTypeScriptWorker: async () => async () => ({}), + typescriptVersion: 'test' +})) + +vi.mock('@codingame/monaco-vscode-languages-service-override', () => ({ + default: () => ({}) +})) + +vi.mock('$lib/components/vscode', () => ({})) + +vi.mock('$lib/gen', async () => { + const actual = await vi.importActual('$lib/gen') + const { + getBenchmarkCompletedJob, + getBenchmarkFlowByPath, + getBenchmarkScriptByHash, + getBenchmarkScriptByPath, + hasBenchmarkWorkspace, + listBenchmarkFlows, + listBenchmarkScripts, + runBenchmarkFlowByPath, + runBenchmarkScriptPreview + } = await import('./mockBackend') + + function wrapService(target: T, overrides: Record): T { + return new Proxy(target, { + get(source, property, receiver) { + if (typeof property === 'string' && property in overrides) { + return overrides[property] + } + return Reflect.get(source, property, receiver) + } + }) + } + + return { + ...actual, + ScriptService: wrapService(actual.ScriptService, { + listScripts: async (data: { workspace: string }) => + hasBenchmarkWorkspace(data.workspace) + ? (listBenchmarkScripts(data.workspace) ?? 
[]) + : actual.ScriptService.listScripts(data), + getScriptByPath: async (data: { workspace: string; path: string }) => { + if (hasBenchmarkWorkspace(data.workspace)) { + const script = getBenchmarkScriptByPath(data.workspace, data.path) + if (!script) { + throw new Error(`Script "${data.path}" not found in benchmark workspace`) + } + return script + } + return actual.ScriptService.getScriptByPath(data) + }, + getScriptByHash: async (data: { workspace: string; hash: string }) => { + if (hasBenchmarkWorkspace(data.workspace)) { + const script = getBenchmarkScriptByHash(data.workspace, data.hash) + if (!script) { + throw new Error(`Script hash "${data.hash}" not found in benchmark workspace`) + } + return script + } + return actual.ScriptService.getScriptByHash(data) + } + }), + FlowService: wrapService(actual.FlowService, { + listFlows: async (data: { workspace: string }) => + hasBenchmarkWorkspace(data.workspace) + ? (listBenchmarkFlows(data.workspace) ?? []) + : actual.FlowService.listFlows(data), + getFlowByPath: async (data: { workspace: string; path: string }) => { + if (hasBenchmarkWorkspace(data.workspace)) { + const flow = getBenchmarkFlowByPath(data.workspace, data.path) + if (!flow) { + throw new Error(`Flow "${data.path}" not found in benchmark workspace`) + } + return flow + } + return actual.FlowService.getFlowByPath(data) + } + }), + JobService: wrapService(actual.JobService, { + runScriptPreview: async (data: { + workspace: string + requestBody?: { + content?: string + language?: string + args?: Record + path?: string + } + }) => + hasBenchmarkWorkspace(data.workspace) + ? runBenchmarkScriptPreview({ + workspace: data.workspace, + requestBody: data.requestBody ?? {} + }) + : actual.JobService.runScriptPreview(data), + runFlowByPath: async (data: { + workspace: string + path: string + requestBody?: Record + }) => + hasBenchmarkWorkspace(data.workspace) + ? 
runBenchmarkFlowByPath({ + workspace: data.workspace, + path: data.path, + args: data.requestBody + }) + : actual.JobService.runFlowByPath(data), + getJob: async (data: { workspace: string; id: string }) => { + if (hasBenchmarkWorkspace(data.workspace)) { + const job = getBenchmarkCompletedJob(data.workspace, data.id) + if (!job) { + throw new Error(`Job "${data.id}" not found in benchmark workspace`) + } + return job + } + return actual.JobService.getJob(data) + } + }) + } +}) + +const benchmarkOutputPath = process.env.WMILL_FRONTEND_AI_EVAL_OUTPUT_PATH +const benchmarkIt = benchmarkOutputPath ? it : it.skip + +benchmarkIt( + 'runs the frontend benchmark adapter from environment input', + async () => { + const { resetBenchmarkMockBackend } = await import('./mockBackend') + resetBenchmarkMockBackend() + const { runFrontendBenchmarkFromEnv } = await import('./benchmarkRunner') + try { + const payload = await runFrontendBenchmarkFromEnv() + const absoluteOutputPath = resolve(benchmarkOutputPath!) 
+ await mkdir(dirname(absoluteOutputPath), { recursive: true }) + await writeFile(absoluteOutputPath, JSON.stringify(payload, null, 2) + '\n', 'utf8') + + expect(payload.cases.length).toBeGreaterThan(0) + } finally { + resetBenchmarkMockBackend() + } + }, + 600_000 +) diff --git a/ai_evals/bun.lock b/ai_evals/bun.lock new file mode 100644 index 0000000000..eaed1db99a --- /dev/null +++ b/ai_evals/bun.lock @@ -0,0 +1,313 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "windmill-ai-evals", + "dependencies": { + "@anthropic-ai/claude-agent-sdk": "^0.2.25", + "@anthropic-ai/sdk": "^0.39.0", + "commander": "^14.0.3", + "openai": "^6.9.1", + "yaml": "^2.8.3", + }, + "devDependencies": { + "@types/bun": "latest", + "typescript": "^5.0.0", + }, + }, + }, + "packages": { + "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.87", "", { "dependencies": { "@anthropic-ai/sdk": "^0.74.0", "@modelcontextprotocol/sdk": "^1.27.1" }, "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-WWmgBPxPhBOvNT0ujI8vPTI2lK+w5YEkEZ/y1mH0EDkK/0kBnxVJNhCtG5vnueiAViwLoUOFn66pbkDiivijdA=="], + + "@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.39.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" } }, "sha512-eMyDIPRZbt1CCLErRCi3exlAvNkBtRe+kW5vvJyef93PmNr/clstYgHhtvmkxN82nlKgzyGPCyGxrm0JQ1ZIdg=="], + + "@babel/runtime": ["@babel/runtime@7.29.2", "", {}, 
"sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g=="], + + "@hono/node-server": ["@hono/node-server@1.19.12", "", { "peerDependencies": { "hono": "^4" } }, "sha512-txsUW4SQ1iilgE0l9/e9VQWmELXifEFvmdA1j6WFh/aFPj99hIntrSsq/if0UWyGVkmrRPKA1wCeP+UCr1B9Uw=="], + + "@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.2.4" }, "os": "darwin", "cpu": "arm64" }, "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w=="], + + "@img/sharp-darwin-x64": ["@img/sharp-darwin-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-x64": "1.2.4" }, "os": "darwin", "cpu": "x64" }, "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw=="], + + "@img/sharp-libvips-darwin-arm64": ["@img/sharp-libvips-darwin-arm64@1.2.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g=="], + + "@img/sharp-libvips-darwin-x64": ["@img/sharp-libvips-darwin-x64@1.2.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg=="], + + "@img/sharp-libvips-linux-arm": ["@img/sharp-libvips-linux-arm@1.2.4", "", { "os": "linux", "cpu": "arm" }, "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A=="], + + "@img/sharp-libvips-linux-arm64": ["@img/sharp-libvips-linux-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw=="], + + "@img/sharp-libvips-linux-x64": ["@img/sharp-libvips-linux-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw=="], + + "@img/sharp-libvips-linuxmusl-arm64": 
["@img/sharp-libvips-linuxmusl-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw=="], + + "@img/sharp-libvips-linuxmusl-x64": ["@img/sharp-libvips-linuxmusl-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg=="], + + "@img/sharp-linux-arm": ["@img/sharp-linux-arm@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm": "1.2.4" }, "os": "linux", "cpu": "arm" }, "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw=="], + + "@img/sharp-linux-arm64": ["@img/sharp-linux-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg=="], + + "@img/sharp-linux-x64": ["@img/sharp-linux-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ=="], + + "@img/sharp-linuxmusl-arm64": ["@img/sharp-linuxmusl-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg=="], + + "@img/sharp-linuxmusl-x64": ["@img/sharp-linuxmusl-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q=="], + + "@img/sharp-win32-arm64": ["@img/sharp-win32-arm64@0.34.5", "", { "os": "win32", "cpu": "arm64" }, "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g=="], + + 
"@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.34.5", "", { "os": "win32", "cpu": "x64" }, "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw=="], + + "@modelcontextprotocol/sdk": ["@modelcontextprotocol/sdk@1.29.0", "", { "dependencies": { "@hono/node-server": "^1.19.9", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.2.1", "express-rate-limit": "^8.2.1", "hono": "^4.11.4", "jose": "^6.1.3", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.1" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ=="], + + "@types/bun": ["@types/bun@1.3.11", "", { "dependencies": { "bun-types": "1.3.11" } }, "sha512-5vPne5QvtpjGpsGYXiFyycfpDF2ECyPcTSsFBMa0fraoxiQyMJ3SmuQIGhzPg2WJuWxVBoxWJ2kClYTcw/4fAg=="], + + "@types/node": ["@types/node@18.19.130", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg=="], + + "@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="], + + "abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="], + + "accepts": ["accepts@2.0.0", "", { "dependencies": { "mime-types": "^3.0.0", "negotiator": "^1.0.0" } }, "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng=="], + + "agentkeepalive": 
["agentkeepalive@4.6.0", "", { "dependencies": { "humanize-ms": "^1.2.1" } }, "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ=="], + + "ajv": ["ajv@8.18.0", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A=="], + + "ajv-formats": ["ajv-formats@3.0.1", "", { "dependencies": { "ajv": "^8.0.0" } }, "sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ=="], + + "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="], + + "body-parser": ["body-parser@2.2.2", "", { "dependencies": { "bytes": "^3.1.2", "content-type": "^1.0.5", "debug": "^4.4.3", "http-errors": "^2.0.0", "iconv-lite": "^0.7.0", "on-finished": "^2.4.1", "qs": "^6.14.1", "raw-body": "^3.0.1", "type-is": "^2.0.1" } }, "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA=="], + + "bun-types": ["bun-types@1.3.11", "", { "dependencies": { "@types/node": "*" } }, "sha512-1KGPpoxQWl9f6wcZh57LvrPIInQMn2TQ7jsgxqpRzg+l0QPOFvJVH7HmvHo/AiPgwXy+/Thf6Ov3EdVn1vOabg=="], + + "bytes": ["bytes@3.1.2", "", {}, "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg=="], + + "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="], + + "call-bound": ["call-bound@1.0.4", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "get-intrinsic": "^1.3.0" } }, "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg=="], + + "combined-stream": ["combined-stream@1.0.8", "", { 
"dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="], + + "commander": ["commander@14.0.3", "", {}, "sha512-H+y0Jo/T1RZ9qPP4Eh1pkcQcLRglraJaSLoyOtHxu6AapkjWVCy2Sit1QQ4x3Dng8qDlSsZEet7g5Pq06MvTgw=="], + + "content-disposition": ["content-disposition@1.0.1", "", {}, "sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q=="], + + "content-type": ["content-type@1.0.5", "", {}, "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA=="], + + "cookie": ["cookie@0.7.2", "", {}, "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w=="], + + "cookie-signature": ["cookie-signature@1.2.2", "", {}, "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg=="], + + "cors": ["cors@2.8.6", "", { "dependencies": { "object-assign": "^4", "vary": "^1" } }, "sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw=="], + + "cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="], + + "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="], + + "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="], + + "depd": ["depd@2.0.0", "", {}, "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw=="], + + "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, 
"sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="], + + "ee-first": ["ee-first@1.1.1", "", {}, "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow=="], + + "encodeurl": ["encodeurl@2.0.0", "", {}, "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg=="], + + "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], + + "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="], + + "es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="], + + "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="], + + "escape-html": ["escape-html@1.0.3", "", {}, "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow=="], + + "etag": ["etag@1.8.1", "", {}, "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg=="], + + "event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="], + + "eventsource": ["eventsource@3.0.7", "", { "dependencies": { "eventsource-parser": "^3.0.1" } }, "sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA=="], + + "eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="], + + 
"express": ["express@5.2.1", "", { "dependencies": { "accepts": "^2.0.0", "body-parser": "^2.2.1", "content-disposition": "^1.0.0", "content-type": "^1.0.5", "cookie": "^0.7.1", "cookie-signature": "^1.2.1", "debug": "^4.4.0", "depd": "^2.0.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "finalhandler": "^2.1.0", "fresh": "^2.0.0", "http-errors": "^2.0.0", "merge-descriptors": "^2.0.0", "mime-types": "^3.0.0", "on-finished": "^2.4.1", "once": "^1.4.0", "parseurl": "^1.3.3", "proxy-addr": "^2.0.7", "qs": "^6.14.0", "range-parser": "^1.2.1", "router": "^2.2.0", "send": "^1.1.0", "serve-static": "^2.2.0", "statuses": "^2.0.1", "type-is": "^2.0.1", "vary": "^1.1.2" } }, "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw=="], + + "express-rate-limit": ["express-rate-limit@8.3.2", "", { "dependencies": { "ip-address": "10.1.0" }, "peerDependencies": { "express": ">= 4.11" } }, "sha512-77VmFeJkO0/rvimEDuUC5H30oqUC4EyOhyGccfqoLebB0oiEYfM7nwPrsDsBL1gsTpwfzX8SFy2MT3TDyRq+bg=="], + + "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="], + + "fast-uri": ["fast-uri@3.1.0", "", {}, "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA=="], + + "finalhandler": ["finalhandler@2.1.1", "", { "dependencies": { "debug": "^4.4.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "on-finished": "^2.4.1", "parseurl": "^1.3.3", "statuses": "^2.0.1" } }, "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA=="], + + "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="], + + "form-data-encoder": 
["form-data-encoder@1.7.2", "", {}, "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="], + + "formdata-node": ["formdata-node@4.4.1", "", { "dependencies": { "node-domexception": "1.0.0", "web-streams-polyfill": "4.0.0-beta.3" } }, "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ=="], + + "forwarded": ["forwarded@0.2.0", "", {}, "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow=="], + + "fresh": ["fresh@2.0.0", "", {}, "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A=="], + + "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="], + + "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], + + "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="], + + "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], + + "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], + + "has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="], + + "hasown": ["hasown@2.0.2", "", { 
"dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], + + "hono": ["hono@4.12.9", "", {}, "sha512-wy3T8Zm2bsEvxKZM5w21VdHDDcwVS1yUFFY6i8UobSsKfFceT7TOwhbhfKsDyx7tYQlmRM5FLpIuYvNFyjctiA=="], + + "http-errors": ["http-errors@2.0.1", "", { "dependencies": { "depd": "~2.0.0", "inherits": "~2.0.4", "setprototypeof": "~1.2.0", "statuses": "~2.0.2", "toidentifier": "~1.0.1" } }, "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ=="], + + "humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="], + + "iconv-lite": ["iconv-lite@0.7.2", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw=="], + + "inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="], + + "ip-address": ["ip-address@10.1.0", "", {}, "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q=="], + + "ipaddr.js": ["ipaddr.js@1.9.1", "", {}, "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g=="], + + "is-promise": ["is-promise@4.0.0", "", {}, "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ=="], + + "isexe": ["isexe@2.0.0", "", {}, "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="], + + "jose": ["jose@6.2.2", "", {}, "sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ=="], + + "json-schema-to-ts": ["json-schema-to-ts@3.1.1", "", { "dependencies": { "@babel/runtime": "^7.18.3", "ts-algebra": "^2.0.0" } }, 
"sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g=="], + + "json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="], + + "json-schema-typed": ["json-schema-typed@8.0.2", "", {}, "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA=="], + + "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], + + "media-typer": ["media-typer@1.1.0", "", {}, "sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw=="], + + "merge-descriptors": ["merge-descriptors@2.0.0", "", {}, "sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g=="], + + "mime-db": ["mime-db@1.54.0", "", {}, "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ=="], + + "mime-types": ["mime-types@3.0.2", "", { "dependencies": { "mime-db": "^1.54.0" } }, "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A=="], + + "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], + + "negotiator": ["negotiator@1.0.0", "", {}, "sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg=="], + + "node-domexception": ["node-domexception@1.0.0", "", {}, "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ=="], + + "node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="], + + "object-assign": ["object-assign@4.1.1", "", {}, 
"sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg=="], + + "object-inspect": ["object-inspect@1.13.4", "", {}, "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew=="], + + "on-finished": ["on-finished@2.4.1", "", { "dependencies": { "ee-first": "1.1.1" } }, "sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg=="], + + "once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="], + + "openai": ["openai@6.34.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-yEr2jdGf4tVFYG6ohmr3pF6VJuveP0EA/sS8TBx+4Eq5NT10alu5zg2dmxMXMgqpihRDQlFGpRt2XwsGj+Fyxw=="], + + "parseurl": ["parseurl@1.3.3", "", {}, "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="], + + "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="], + + "path-to-regexp": ["path-to-regexp@8.4.1", "", {}, "sha512-fvU78fIjZ+SBM9YwCknCvKOUKkLVqtWDVctl0s7xIqfmfb38t2TT4ZU2gHm+Z8xGwgW+QWEU3oQSAzIbo89Ggw=="], + + "pkce-challenge": ["pkce-challenge@5.0.1", "", {}, "sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ=="], + + "proxy-addr": ["proxy-addr@2.0.7", "", { "dependencies": { "forwarded": "0.2.0", "ipaddr.js": "1.9.1" } }, "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg=="], + + "qs": ["qs@6.15.0", "", { "dependencies": { "side-channel": "^1.1.0" } }, "sha512-mAZTtNCeetKMH+pSjrb76NAM8V9a05I9aBZOHztWy/UqcJdQYNsf59vrRKWnojAT9Y+GbIvoTBC++CPHqpDBhQ=="], + + "range-parser": ["range-parser@1.2.1", "", {}, 
"sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg=="], + + "raw-body": ["raw-body@3.0.2", "", { "dependencies": { "bytes": "~3.1.2", "http-errors": "~2.0.1", "iconv-lite": "~0.7.0", "unpipe": "~1.0.0" } }, "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA=="], + + "require-from-string": ["require-from-string@2.0.2", "", {}, "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw=="], + + "router": ["router@2.2.0", "", { "dependencies": { "debug": "^4.4.0", "depd": "^2.0.0", "is-promise": "^4.0.0", "parseurl": "^1.3.3", "path-to-regexp": "^8.0.0" } }, "sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ=="], + + "safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="], + + "send": ["send@1.2.1", "", { "dependencies": { "debug": "^4.4.3", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "fresh": "^2.0.0", "http-errors": "^2.0.1", "mime-types": "^3.0.2", "ms": "^2.1.3", "on-finished": "^2.4.1", "range-parser": "^1.2.1", "statuses": "^2.0.2" } }, "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ=="], + + "serve-static": ["serve-static@2.2.1", "", { "dependencies": { "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "parseurl": "^1.3.3", "send": "^1.2.0" } }, "sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw=="], + + "setprototypeof": ["setprototypeof@1.2.0", "", {}, "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw=="], + + "shebang-command": ["shebang-command@2.0.0", "", { "dependencies": { "shebang-regex": "^3.0.0" } }, "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA=="], + + "shebang-regex": ["shebang-regex@3.0.0", 
"", {}, "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A=="], + + "side-channel": ["side-channel@1.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3", "side-channel-list": "^1.0.0", "side-channel-map": "^1.0.1", "side-channel-weakmap": "^1.0.2" } }, "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw=="], + + "side-channel-list": ["side-channel-list@1.0.0", "", { "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3" } }, "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA=="], + + "side-channel-map": ["side-channel-map@1.0.1", "", { "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", "get-intrinsic": "^1.2.5", "object-inspect": "^1.13.3" } }, "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA=="], + + "side-channel-weakmap": ["side-channel-weakmap@1.0.2", "", { "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", "get-intrinsic": "^1.2.5", "object-inspect": "^1.13.3", "side-channel-map": "^1.0.1" } }, "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A=="], + + "statuses": ["statuses@2.0.2", "", {}, "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw=="], + + "toidentifier": ["toidentifier@1.0.1", "", {}, "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA=="], + + "tr46": ["tr46@0.0.3", "", {}, "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="], + + "ts-algebra": ["ts-algebra@2.0.0", "", {}, "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw=="], + + "type-is": ["type-is@2.0.1", "", { "dependencies": { "content-type": "^1.0.5", "media-typer": "^1.1.0", "mime-types": "^3.0.0" } }, 
"sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw=="], + + "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], + + "undici-types": ["undici-types@5.26.5", "", {}, "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="], + + "unpipe": ["unpipe@1.0.0", "", {}, "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ=="], + + "vary": ["vary@1.1.2", "", {}, "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg=="], + + "web-streams-polyfill": ["web-streams-polyfill@4.0.0-beta.3", "", {}, "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug=="], + + "webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="], + + "whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", "webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="], + + "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="], + + "wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="], + + "yaml": ["yaml@2.8.3", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-AvbaCLOO2Otw/lW5bmh9d/WEdcDFdQp2Z2ZUH3pX9U2ihyUY0nvLv7J6TrWowklRGPYbB/IuIMfYgxaCPg5Bpg=="], + + "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], + + "zod-to-json-schema": ["zod-to-json-schema@3.25.2", 
"", { "peerDependencies": { "zod": "^3.25.28 || ^4" } }, "sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA=="], + + "@anthropic-ai/claude-agent-sdk/@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.74.0", "", { "dependencies": { "json-schema-to-ts": "^3.1.1" }, "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" }, "optionalPeers": ["zod"], "bin": { "anthropic-ai-sdk": "bin/cli" } }, "sha512-srbJV7JKsc5cQ6eVuFzjZO7UR3xEPJqPamHFIe29bs38Ij2IripoAhC0S5NslNbaFUYqBKypmmpzMTpqfHEUDw=="], + + "@types/node-fetch/@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="], + + "bun-types/@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="], + + "form-data/mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="], + + "@types/node-fetch/@types/node/undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="], + + "bun-types/@types/node/undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="], + + "form-data/mime-types/mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="], + } +} diff --git a/ai_evals/cases/app.yaml b/ai_evals/cases/app.yaml new file mode 100644 index 0000000000..4eed8b443e --- /dev/null +++ b/ai_evals/cases/app.yaml @@ -0,0 +1,93 @@ +- id: app-test1-counter-create + prompt: |- + Create a simple counter app with increment and decrement buttons. 
+ judgeChecklist: + - shows the current count in the UI + - includes an increment button + - includes a decrement button + - clicking the buttons updates the count correctly + +- id: app-test2-counter-reset + prompt: |- + Add a reset button that sets the counter back to 0 + initial: ai_evals/fixtures/frontend/app/initial/test1_counter_app + judgeChecklist: + - adds a reset control to the existing counter app + - clicking reset sets the count back to 0 + - keeps the existing increment and decrement behavior working + +- id: app-test3-shopping-cart-quantity + prompt: |- + Add a quantity selector (+ and - buttons) to each cart item so users can adjust quantities without removing and re-adding items + initial: ai_evals/fixtures/frontend/app/initial/shopping_cart + judgeChecklist: + - each cart item has visible plus and minus quantity controls + - users can increase quantity without re-adding the product + - users can decrease quantity from the cart UI + - cart totals stay in sync with quantity changes + +- id: app-test4-shopping-cart-discount + prompt: |- + Add a discount code input field in the cart. 
+ When the code "SAVE10" is entered, apply a 10% discount to the total + initial: ai_evals/fixtures/frontend/app/initial/shopping_cart + judgeChecklist: + - adds a discount code input to the cart + - recognizes the code SAVE10 + - applies a 10 percent discount to the displayed total + - keeps the rest of the cart behavior intact + +- id: app-test5-file-manager-search + prompt: |- + Add a search bar in the toolbar that filters files and folders by name as the user types + initial: ai_evals/fixtures/frontend/app/initial/file_manager + judgeChecklist: + - adds a search input in the toolbar + - filters files and folders by name as the user types + - updates the visible file list from the search query + - keeps the rest of the file manager usable + +- id: app-test6-file-manager-inline-rename + prompt: |- + Let users rename files and folders directly from the file list without leaving the page. + initial: ai_evals/fixtures/frontend/app/initial/file_manager + judgeChecklist: + - adds a visible rename action or inline edit mode in the file list + - lets users edit an item's name directly from the list + - saves the renamed item through the app's existing rename behavior + - refreshes the displayed name after a successful rename + +- id: app-test7-file-manager-select-all + prompt: |- + Add a "Select All" checkbox in the file list header and individual checkboxes for each file. + Add a "Delete Selected" button that appears when items are selected + initial: ai_evals/fixtures/frontend/app/initial/file_manager + judgeChecklist: + - adds a select-all control in the file list header + - adds per-item selection controls + - shows a delete-selected action only when there is a selection + - deleting selected items updates the visible list + +- id: app-test8-inventory-tracker-create + prompt: |- + Create an inventory tracker app for a small store. + Users should be able to add items with a name, sku, quantity, and price, search items by name or sku, and delete items. 
+ The inventory should persist between sessions. + judgeChecklist: + - includes a form to add inventory items with name, sku, quantity, and price + - shows a list or table of saved inventory items + - supports searching or filtering by name or sku + - lets users delete existing inventory items + - persists the inventory data appropriately for a raw Windmill app + +- id: app-test9-recipe-book-create + prompt: |- + Create a recipe book app where users can add recipes with a name, ingredients list, and instructions. + Include a search bar to filter recipes by name and the ability to delete recipes. + Recipes should persist between sessions. + judgeChecklist: + - includes a form to add recipes with name, ingredients, and instructions + - shows saved recipes in the app + - supports searching recipes by name + - lets users delete recipes + - persists recipes appropriately for a raw Windmill app diff --git a/ai_evals/cases/cli.yaml b/ai_evals/cases/cli.yaml new file mode 100644 index 0000000000..735976781c --- /dev/null +++ b/ai_evals/cases/cli.yaml @@ -0,0 +1,66 @@ +- id: bun-hello-script + prompt: |- + Create a Windmill Bun script at `f/evals/hello.ts`. + It should take a `name` input and return a greeting object like `{ greeting: "Hello, Alice!" }`. + expected: ai_evals/fixtures/cli/expected/bun-hello-script + judgeChecklist: + - creates the requested Bun script at f/evals/hello.ts + - takes a name input + - returns an object containing the greeting + +- id: bun-hello-flow + prompt: |- + Create a Windmill flow at `f/evals/hello__flow`. + It should take a `name` input and return a greeting object like `{ greeting: "Hello, Alice!" }`. + Put the step code in `hello.ts`. 
+ expected: ai_evals/fixtures/cli/expected/bun-hello-flow + judgeChecklist: + - creates the requested flow folder with flow.yaml and hello.ts + - wires the name input into the flow step + - returns the greeting object + +- id: python-add-numbers-script + prompt: |- + Add a Windmill Python script at `f/evals/add_numbers.py`. + It should take `a` and `b` as inputs and return `{ "total": a + b }`. + expected: ai_evals/fixtures/cli/expected/python-add-numbers-script + judgeChecklist: + - creates the requested Python script at f/evals/add_numbers.py + - takes `a` and `b` as inputs + - returns an object with total equal to a plus b + +- id: bun-hello-script-uppercase + prompt: |- + Update `f/evals/hello.ts` so it accepts an optional `uppercase` boolean. + Keep returning `{ greeting: ... }`, but when `uppercase` is true the greeting should be uppercased before returning it. + initial: ai_evals/fixtures/cli/initial/bun-hello-script-uppercase + expected: ai_evals/fixtures/cli/expected/bun-hello-script-uppercase + judgeChecklist: + - updates the existing hello.ts file rather than creating a new script + - accepts an optional uppercase boolean input + - keeps returning an object with greeting + - uppercases the greeting when uppercase is true + +- id: bun-hello-flow-punctuation + prompt: |- + Update the existing flow in `f/evals/hello__flow` so it also accepts an optional `punctuation` input. + The greeting should use that punctuation and default to `!` when it is missing. 
+ initial: ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation + expected: ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation + judgeChecklist: + - updates the existing hello flow instead of creating a new one + - adds an optional punctuation input to the flow + - updates the step code so the returned greeting uses punctuation + - defaults punctuation to an exclamation mark when omitted + +- id: flow-reuse-existing-script + prompt: |- + There is already a reusable greeting script at `f/lib/format_greeting.ts`. + Create a flow at `f/evals/reuse_greeting__flow` that takes a `name` input and reuses that existing script instead of duplicating the logic inline. + initial: ai_evals/fixtures/cli/initial/flow-reuse-existing-script + expected: ai_evals/fixtures/cli/expected/flow-reuse-existing-script + judgeChecklist: + - creates the requested flow at f/evals/reuse_greeting__flow + - reuses the existing script from f/lib by path + - does not duplicate the greeting logic in a new inline script + - wires the name input into the reused script diff --git a/ai_evals/cases/flow.yaml b/ai_evals/cases/flow.yaml new file mode 100644 index 0000000000..cdb53c696a --- /dev/null +++ b/ai_evals/cases/flow.yaml @@ -0,0 +1,246 @@ +- id: flow-test0-sum-two-numbers + prompt: |- + Create a flow that takes two numbers, `a` and `b`, and returns their sum. + Keep it simple and use a single step named `sum_numbers`. + expected: ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json + judgeChecklist: + - "the flow takes `a` and `b` as inputs" + - "the main step is named `sum_numbers`" + - the flow returns the sum of the two numbers + +- id: flow-test1-reuse-existing-script + prompt: |- + I need a flow that adds two numbers. + If there is already a script in the workspace that does that, reuse it instead of rewriting the logic. + The flow should take `a` and `b` as inputs and use a single step named `sum_numbers`. 
+ initial: ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json + expected: ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json + judgeChecklist: + - "the flow takes `a` and `b` as inputs" + - "the main step is named `sum_numbers`" + - the flow reuses the existing workspace script instead of rewriting the addition logic + +- id: flow-test2-call-existing-subflow + prompt: |- + Create a parent flow that adds two numbers by reusing an existing flow in the workspace if one already exists. + The parent flow should take `a` and `b` as inputs and delegate the calculation instead of inlining it. + Use a single step named `call_add_numbers`. + initial: ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json + expected: ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json + judgeChecklist: + - "the parent flow takes `a` and `b` as inputs" + - "the main step is named `call_add_numbers`" + - the parent flow delegates to an existing workspace subflow instead of inlining the addition logic + +- id: flow-test3-branchone-routing + prompt: |- + Create a flow that routes incoming support requests based on the customer's tier. + The input should contain a string field named `tier`. + Free, pro, and enterprise requests should go to different queues, and unknown tiers should fall back to a default queue. + Name the main routing step `route_by_tier`. + expected: ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json + judgeChecklist: + - "the input schema includes a string field named `tier`" + - "the main routing step is named `route_by_tier`" + - free requests go to a free queue + - pro requests go to a pro queue + - enterprise requests go to an enterprise queue + - unknown tiers fall back to a default queue + +- id: flow-test4-order-processing-loop + prompt: |- + Build an order-processing flow. 
+ + The input should include an order with: + - an `items` array containing `name`, `price`, and `quantity` + - `customer_email` + - `shipping_address` + + The flow should: + - validate that every item has a positive price and quantity + - calculate the order total with 8% tax + - check inventory for each item using placeholder availability data + - create a shipment if everything is in stock, otherwise create a backorder + - send a confirmation using placeholder email logic + - return a final order summary with the status + validate: + schemaAnyOf: + - requiredPaths: + - order + - order.items + - order.customer_email + - order.shipping_address + - requiredPaths: + - items + - customer_email + - shipping_address + resolveResultsRefs: true + judgeChecklist: + - the flow validates that every item has a positive price and quantity + - the flow calculates the order total with 8% tax + - the flow checks inventory for each item using placeholder availability data + - the flow creates a shipment if everything is in stock, otherwise a backorder + - the flow sends a confirmation using placeholder email logic + - the flow returns a final order summary with the resulting status + +- id: flow-test5-parallel-data-pipeline + prompt: |- + Create a data-processing flow for three external data sources. 
+ + It should: + - load a small placeholder configuration listing the three sources + - fetch placeholder records from each source + - clean and validate each source's records + - combine everything into one dataset + - compute an overall quality score + - store the result differently depending on the score: + - 90 or above goes to the primary database + - 70 to 89 goes to a secondary database with a warning + - below 70 goes to quarantine and triggers an alert + - return a processing report with total records, quality score, and destination + judgeChecklist: + - the flow loads a placeholder configuration listing three external sources + - the flow fetches placeholder records from each source + - the flow cleans and validates each source's records + - the flow combines everything into one dataset + - the flow computes an overall quality score + - scores of 90 or above go to the primary database + - scores from 70 to 89 go to a secondary database with a warning + - scores below 70 go to quarantine and trigger an alert + - the final report includes total records, quality score, and destination + +- id: flow-test6-ai-agent-tools + prompt: |- + Create a customer support flow. + + The input should include `customer_id` and `query_text`. + The flow should load the customer's profile and order history, then use an AI assistant to help with the request. + The assistant should be able to: + - look up orders + - check refund eligibility + - search FAQs + - open a support ticket when needed + + After that, log the interaction and return the assistant's response along with any actions it took. 
+ judgeChecklist: + - "the input schema includes `customer_id` and `query_text`" + - the flow loads the customer's profile and order history + - the flow uses an AI assistant step + - the assistant can look up orders + - the assistant can check refund eligibility + - the assistant can search FAQs + - the assistant can open a support ticket + - the flow logs the interaction + - the final output returns the assistant response along with any actions taken or resulting support action details + +- id: flow-test7-simple-modification + prompt: |- + Update this flow so it validates processed data before saving it. + + After `process_data`, add a `validate_data` step that checks the data array is not empty. + If the array is empty, it should return an error object with the message `No data to save`. + If validation passes, let the save continue normally. + Update `save_results` so it handles the validation result correctly. + initial: ai_evals/fixtures/frontend/flow/initial/test5_initial.json + expected: ai_evals/fixtures/frontend/flow/expected/test5_modify_simple.json + judgeChecklist: + - the updated flow keeps the original fetch and process steps intact + - "a `validate_data` step is added after `process_data`" + - "`validate_data` checks that the processed data array is not empty" + - "empty data returns an error object with the message `No data to save`" + - "`save_results` handles the validation result correctly" + +- id: flow-test8-branching-in-loop + prompt: |- + Update the order-processing logic inside `loop_orders` so different order types are handled differently. + + For `express`, mark the order as priority and use a shipping cost of $15.99. + For `standard`, use a shipping cost of $5.99. + For `pickup`, mark it as no shipping required with a cost of $0. + Keep the existing processing as a fallback for unknown order types. + Each path should return the orderId, shipping cost, and shipping type. 
+ initial: ai_evals/fixtures/frontend/flow/initial/test6_initial.json + judgeChecklist: + - "the existing `loop_orders` flow still handles per-order processing" + - exact branching topology is not required as long as `loop_orders` handles the order types correctly + - express orders are marked as priority and use a shipping cost of 15.99 + - standard orders use a shipping cost of 5.99 + - pickup orders use a shipping cost of 0 and are treated as no shipping required + - unknown order types still follow a fallback path + - "each processed order returns `orderId`, `shippingCost`, and `shippingType`" + +- id: flow-test9-parallel-refactor + prompt: |- + Refactor this flow so the enrichment work no longer runs one step at a time. + + `enrich_price`, `enrich_inventory`, and `enrich_reviews` should run independently. + Each one should return a fallback value if it fails. + Update `combine_data` so it merges the enrichment results and sets a `hasFallbacks` flag when any fallback was used. + Keep `get_item` as the first step and `return_result` as the last step. + initial: ai_evals/fixtures/frontend/flow/initial/test7_initial.json + expected: ai_evals/fixtures/frontend/flow/expected/test7_modify_complex.json + judgeChecklist: + - "the updated flow keeps `get_item` as the first step" + - "the updated flow keeps `return_result` as the last step" + - "`enrich_price`, `enrich_inventory`, and `enrich_reviews` run independently rather than sequentially" + - each enrichment path returns a fallback value if it fails + - "`combine_data` merges the enrichment results" + - "`combine_data` sets `hasFallbacks` when any fallback was used" + +- id: flow-test10-while-loop-counter + prompt: |- + Create a flow that keeps incrementing a counter until it reaches a target value. + The input should include a number field named `target`. + Name the looping step `count_until_target`. + Once the target is reached, return the final counter value. 
+ expected: ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json + judgeChecklist: + - "the input schema includes a number field named `target`" + - "the looping step is named `count_until_target`" + - the flow keeps incrementing a counter until the target is reached + - the final output returns the final counter value + +- id: flow-test11-preprocessor-and-failure-handler + prompt: |- + Create an event-processing flow for a string payload. + + Before the main processing runs, trim the payload and reject empty strings. + The main step should be named `process_event` and return a simple success object. + If anything fails, return a compact error object with the error message and the failing step id. + expected: ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json + validate: + requireSpecialModules: + - preprocessor_module + - failure_module + judgeChecklist: + - the flow trims the payload before the main processing runs + - the flow rejects empty payload strings + - "the main step is named `process_event`" + - "`process_event` returns a simple success object" + - failures return a compact error object with the error message and failing step id + +- id: flow-test12-approval-step + prompt: |- + Create a purchase approval flow. + + The input should include `requester_email` and `amount`. + Add an approval step named `request_approval` that pauses the flow and asks the approver for a comment. + One approval should be enough to continue. + After approval, add a final step named `finalize_purchase` that returns an approved status object. 
+ expected: ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json + validate: + schemaRequiredPaths: + - requester_email + - amount + requireSuspendSteps: + - id: request_approval + requiredEvents: 1 + resumeRequiredStringFieldAnyOf: + - comment + - approver_comment + judgeChecklist: + - "the flow includes an approval step named `request_approval`" + - "`request_approval` pauses the flow and asks the approver for a comment" + - one approval is enough to continue + - "the flow includes a final step named `finalize_purchase`" + - "`finalize_purchase` returns an approved status object after approval" diff --git a/ai_evals/cases/script.yaml b/ai_evals/cases/script.yaml new file mode 100644 index 0000000000..6fc20199e3 --- /dev/null +++ b/ai_evals/cases/script.yaml @@ -0,0 +1,11 @@ +- id: script-test1-greet-user + prompt: |- + Update the current Bun script so it takes the existing `name` input and returns a plain greeting string like `Hello, Alice!`. + Do not wrap the result in an object or array. + Keep it simple and do not add external dependencies. 
+ initial: ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json + expected: ai_evals/fixtures/frontend/script/expected/test1_greet_user.json + judgeChecklist: + - uses the existing `name` input + - returns a plain greeting string + - does not wrap the result in an object or array diff --git a/ai_evals/cli/index.ts b/ai_evals/cli/index.ts new file mode 100644 index 0000000000..d64a547f14 --- /dev/null +++ b/ai_evals/cli/index.ts @@ -0,0 +1,295 @@ +#!/usr/bin/env bun + +import { Command, InvalidArgumentError } from "commander"; +import { loadCases, loadSelectedCases } from "../core/cases"; +import { + EVAL_MODELS, + type EvalModelSpec, + formatRunModelLabel, + getCliEvalModel, + getEvalModelHelpText, + resolveEvalModel, +} from "../core/models"; +import { + appendHistoryRecord, + buildRunResult, + formatRunSummary, + resolveRunOutputPath, + writeRunArtifacts, + writeRunResult, +} from "../core/results"; +import { runSuite } from "../core/runSuite"; +import { EVAL_MODES, type EvalMode } from "../core/types"; +import { DEFAULT_JUDGE_MODEL } from "../core/judge"; +import { createCliModeRunner } from "../modes/cli"; +import { runFrontendBenchmarkAdapter } from "../adapters/frontend/runtime"; + +async function main() { + const program = new Command() + .name("bun run cli --") + .description("Run AI eval cases against the current production prompts and guidance") + .showHelpAfterError() + .showSuggestionAfterError() + .addHelpText( + "after", + [ + "", + "Examples:", + " bun run cli -- models", + " bun run cli -- cases", + " bun run cli -- cases flow", + " bun run cli -- run flow", + " bun run cli -- run flow --model 4o", + " bun run cli -- run flow --models haiku,opus,4o", + " bun run cli -- run flow flow-test0-sum-two-numbers --verbose", + " bun run cli -- run flow --record", + " bun run cli -- run flow flow-test7-simple-modification --runs 3", + " bun run cli -- run cli bun-hello-script", + "", + "Models:", + getEvalModelHelpText(), + ].join("\n") + ); + + 
program + .command("models") + .description("List available model aliases") + .action(() => { + handleModels(); + }); + + program + .command("cases") + .description("List available cases") + .argument("[mode]", "cli, flow, script, or app", parseOptionalMode) + .action(async (mode?: EvalMode) => { + await handleCases(mode); + }); + + program + .command("run") + .description("Run one benchmark mode") + .argument("<mode>", "cli, flow, script, or app", parseMode) + .argument("[caseIds...]", "specific case ids to run") + .option("--runs <count>", "number of attempts per case", parsePositiveInteger, 1) + .option("--output <path>", "write the result JSON to this path") + .option("--model <alias>", `model alias (${EVAL_MODELS.map((entry) => entry.id).join(", ")})`) + .option("--models <aliases>", "comma-separated model aliases to run sequentially") + .option("--verbose", "stream assistant output during frontend runs") + .option("--record", "append a compact summary line to ai_evals/history/<mode>.jsonl") + .action( + async ( + mode: EvalMode, + caseIds: string[], + options: { + runs: number; + output?: string; + model?: string; + models?: string; + verbose?: boolean; + record?: boolean; + } + ) => { + await handleRun({ + mode, + caseIds, + runs: options.runs, + outputPath: options.output, + model: options.model, + models: options.models, + verbose: options.verbose ?? false, + record: options.record ?? false, + }); + } + ); + + await program.parseAsync(process.argv); +} + +async function handleCases(mode?: EvalMode) { + const modes = mode ? [mode] : [...EVAL_MODES]; + + for (const entry of modes) { + const cases = await loadCases(entry); + process.stdout.write(`${entry} (${cases.length})\n`); + for (const evalCase of cases) { + process.stdout.write(`- ${evalCase.id}\n`); + } + process.stdout.write("\n"); + } +} + +function handleModels() { + process.stdout.write("Available models\n"); + for (const model of EVAL_MODELS) { + const supports = [ + ...(model.frontend ? ["flow", "script", "app"] : []), + ...(model.cli ? 
["cli"] : []), + ]; + const aliases = [model.id, ...model.aliases.filter((alias) => alias !== model.id)]; + process.stdout.write(`- ${model.id}: ${model.label}\n`); + process.stdout.write(` aliases: ${aliases.join(", ")}\n`); + process.stdout.write(` modes: ${supports.join(", ")}\n`); + } + process.stdout.write(`\nJudge model: ${DEFAULT_JUDGE_MODEL}\n`); +} + +async function handleRun(input: { + mode: EvalMode; + caseIds: string[]; + runs: number; + outputPath?: string; + model?: string; + models?: string; + verbose: boolean; + record: boolean; +}) { + if (input.record && input.caseIds.length > 0) { + throw new Error("--record only supports full-suite runs; omit case ids to record history"); + } + if (input.model && input.models) { + throw new Error("Use either --model or --models, not both"); + } + + const selectedCases = await loadSelectedCases(input.mode, input.caseIds); + const models = resolveRequestedModels(input.mode, input.model, input.models); + if (input.outputPath && models.length > 1) { + throw new Error("--output only supports a single model run"); + } + + const summaries: Array<{ label: string; passRate: number; averageDurationMs: number }> = []; + + for (const [index, model] of models.entries()) { + const runModel = formatRunModelLabel(input.mode, model); + if (models.length > 1) { + process.stdout.write( + `${index > 0 ? "\n" : ""}=== ${input.mode} ${model.id} (${runModel}) ===\n` + ); + } + process.stderr.write(`Starting ${input.mode} benchmark...\n`); + + const result = + input.mode === "cli" + ? await runCliBenchmark(selectedCases, input.runs, getCliEvalModel(model), runModel) + : await runFrontendBenchmarkAdapter({ + mode: input.mode, + caseIds: input.caseIds, + runs: input.runs, + model: model.id, + verbose: input.verbose, + }); + + const resolvedOutputPath = + models.length === 1 + ? 
resolveRunOutputPath(input.mode, input.outputPath) + : resolveRunOutputPath(input.mode); + const artifactsPath = await writeRunArtifacts(result, resolvedOutputPath); + const resultPath = await writeRunResult(result, resolvedOutputPath); + const historyPath = input.record ? await appendHistoryRecord(result) : null; + process.stdout.write(`${formatRunSummary(result)}\n`); + process.stdout.write(`Saved: ${resultPath}\n`); + if (artifactsPath) { + process.stdout.write(`Artifacts: ${artifactsPath}\n`); + } + if (historyPath) { + process.stdout.write(`Recorded: ${historyPath}\n`); + } + + summaries.push({ + label: `${model.id} (${runModel})`, + passRate: result.passRate, + averageDurationMs: result.averageDurationMs, + }); + } + + if (summaries.length > 1) { + process.stdout.write("\nModel summary\n"); + for (const summary of summaries) { + process.stdout.write( + `- ${summary.label}: ${formatPercent(summary.passRate)} | ${Math.round(summary.averageDurationMs)}ms\n` + ); + } + } +} + +async function runCliBenchmark( + cases: Awaited<ReturnType<typeof loadSelectedCases>>, + runs: number, + model: ReturnType<typeof getCliEvalModel>, + runModel: string +) { + const caseResults = await runSuite({ + modeRunner: createCliModeRunner(model), + cases, + runs, + runModel, + judgeModel: DEFAULT_JUDGE_MODEL, + }); + + return buildRunResult({ + mode: "cli", + runs, + runModel, + judgeModel: DEFAULT_JUDGE_MODEL, + caseResults, + }); +} + +function parseMode(value: string): EvalMode { + if (EVAL_MODES.includes(value as EvalMode)) { + return value as EvalMode; + } + throw new InvalidArgumentError(`mode must be one of: ${EVAL_MODES.join(", ")}`); +} + +function parseOptionalMode(value: string | undefined): EvalMode | undefined { + return value ? 
parseMode(value) : undefined; +} + +function parsePositiveInteger(value: string): number { + const parsed = Number(value); + if (!Number.isInteger(parsed) || parsed <= 0) { + throw new InvalidArgumentError("must be a positive integer"); + } + return parsed; +} + +function resolveRequestedModels( + mode: EvalMode, + singleModel?: string, + multipleModels?: string +): EvalModelSpec[] { + if (!multipleModels) { + return [resolveEvalModel(mode, singleModel)]; + } + + const aliases = multipleModels + .split(",") + .map((value) => value.trim()) + .filter(Boolean); + if (aliases.length === 0) { + throw new Error("--models requires at least one model alias"); + } + + const seen = new Set(); + const models: EvalModelSpec[] = []; + for (const alias of aliases) { + const model = resolveEvalModel(mode, alias); + if (seen.has(model.id)) { + continue; + } + seen.add(model.id); + models.push(model); + } + return models; +} + +function formatPercent(value: number): string { + return `${(value * 100).toFixed(1)}%`; +} + +void main().catch((error) => { + const message = error instanceof Error ? 
/**
 * Loads every eval case declared for `mode` from `<CASES_DIR>/<mode>.yaml`.
 *
 * The parsed YAML is untyped, so each entry is checked before conversion:
 * `id` and `prompt` must be non-empty strings, and ids must be unique
 * (`loadSelectedCases` indexes cases by id, so a duplicate would silently
 * shadow an earlier case). `initial` / `expected` are passed through
 * `resolveFixturePath`.
 *
 * @param mode - Eval mode whose case file should be read.
 * @returns The cases, in file order.
 * @throws Error when the file is not a YAML list or an entry is malformed.
 *   File-system and YAML parse errors propagate unchanged.
 */
export async function loadCases(mode: EvalMode): Promise<EvalCase[]> {
  const filePath = path.join(CASES_DIR, `${mode}.yaml`);
  const raw = await readFile(filePath, "utf8");
  const parsed: unknown = parse(raw);

  if (!Array.isArray(parsed)) {
    throw new Error(`Expected ${filePath} to contain a YAML list of cases`);
  }

  const seenIds = new Set<string>();
  return parsed.map(
    (entry: Partial<RawEvalCase> | null | undefined, index: number): EvalCase => {
      const id = entry?.id;
      const prompt = entry?.prompt;

      if (typeof id !== "string" || id.length === 0) {
        throw new Error(
          `Expected case #${index + 1} in ${filePath} to have a non-empty string "id"`
        );
      }
      if (typeof prompt !== "string" || prompt.length === 0) {
        throw new Error(
          `Expected case "${id}" in ${filePath} to have a non-empty string "prompt"`
        );
      }
      if (seenIds.has(id)) {
        throw new Error(`Duplicate case id "${id}" in ${filePath}`);
      }
      seenIds.add(id);

      return {
        id,
        prompt,
        initialPath: resolveFixturePath(entry?.initial),
        expectedPath: resolveFixturePath(entry?.expected),
        validate: entry?.validate,
        judgeChecklist: entry?.judgeChecklist,
      };
    }
  );
}
/**
 * Recursively copies the contents of `sourceDir` into `targetDir`.
 *
 * `targetDir` (and any missing parents) is created first, so empty source
 * directories are reproduced as empty target directories. Sub-directories are
 * copied by recursion; every other entry is copied with `copyFile`.
 *
 * @param sourceDir - Directory to read from.
 * @param targetDir - Directory to create and populate.
 * @throws Whatever `readdir`, `mkdir` or `copyFile` reject with.
 */
export async function copyDirectory(sourceDir: string, targetDir: string): Promise<void> {
  const entries = await readdir(sourceDir, { withFileTypes: true });
  // Created once up front: every entry below is a direct child of targetDir,
  // so no additional mkdir is needed before copying individual files.
  await mkdir(targetDir, { recursive: true });

  for (const entry of entries) {
    const sourcePath = path.join(sourceDir, entry.name);
    const targetPath = path.join(targetDir, entry.name);
    if (entry.isDirectory()) {
      await copyDirectory(sourcePath, targetPath);
    } else {
      await copyFile(sourcePath, targetPath);
    }
  }
}
`${relativeDir}/${entry.name}` : entry.name; + if (ignore.has(relativePath) || ignore.has(entry.name)) { + continue; + } + + const absolutePath = path.join(absoluteDir, entry.name); + if (entry.isDirectory()) { + await walkDirectory(absolutePath, relativePath, output, ignore); + continue; + } + + output[relativePath] = await readFile(absolutePath, "utf8"); + } +} diff --git a/ai_evals/core/judge.ts b/ai_evals/core/judge.ts new file mode 100644 index 0000000000..cae66721c6 --- /dev/null +++ b/ai_evals/core/judge.ts @@ -0,0 +1,149 @@ +import Anthropic from "@anthropic-ai/sdk"; +import type { EvalMode, JudgeResult } from "./types"; + +export const DEFAULT_JUDGE_MODEL = "claude-sonnet-4-6"; + +const JUDGE_TOOL_NAME = "submit_judgement"; + +export async function judgeOutput(input: { + mode: EvalMode; + prompt: string; + checklist?: string[]; + initial?: unknown; + expected?: unknown; + actual: unknown; + model?: string; +}): Promise { + const apiKey = process.env.ANTHROPIC_API_KEY; + if (!apiKey) { + return { + success: false, + score: 0, + summary: "Judge unavailable", + error: "ANTHROPIC_API_KEY is not set", + }; + } + + const client = new Anthropic({ apiKey }); + const model = input.model ?? DEFAULT_JUDGE_MODEL; + + const system = [ + "You evaluate benchmark outputs for Windmill AI generation.", + "Deterministic checks already run separately. 
Focus on whether the final output satisfies the user request.", + "If expected state is provided, treat it as a valid example and reward semantically equivalent outputs.", + "If a checklist is provided, treat it as the explicit acceptance criteria for this case.", + "Be strict about missing requested functionality.", + "When the prompt wording is ambiguous, prefer the checklist over inferred structural requirements.", + "Do not require exact ids, exact topology, or exact field names unless the prompt, checklist, or expected state clearly requires them.", + `Always respond by calling the ${JUDGE_TOOL_NAME} tool exactly once.`, + ].join("\n\n"); + + const user = [ + `Mode: ${input.mode}`, + "", + "User prompt:", + input.prompt, + "", + "Checklist:", + formatChecklist(input.checklist), + "", + "Initial state:", + formatJsonBlock(input.initial), + "", + "Expected state:", + formatJsonBlock(input.expected), + "", + "Actual result:", + formatJsonBlock(input.actual), + ].join("\n"); + + try { + const response = await client.messages.create({ + model, + max_tokens: 1024, + temperature: 0, + system, + messages: [{ role: "user", content: user }], + tools: [ + { + name: JUDGE_TOOL_NAME, + description: "Submit the benchmark judgement as structured data.", + input_schema: { + type: "object", + properties: { + score: { + type: "integer", + minimum: 0, + maximum: 100, + }, + summary: { + type: "string", + }, + }, + required: ["score", "summary"], + }, + }, + ], + tool_choice: { + type: "tool", + name: JUDGE_TOOL_NAME, + disable_parallel_tool_use: true, + }, + }); + + const toolUseBlock = response.content.find( + (block): block is Anthropic.ToolUseBlock => + block.type === "tool_use" && block.name === JUDGE_TOOL_NAME + ); + + if (!toolUseBlock) { + return { + success: false, + score: 0, + summary: "Judge returned no tool output", + error: "Expected structured tool output from judge", + }; + } + + const parsed = toolUseBlock.input as { + score: number; + summary: string; + }; + + 
/**
 * Renders a value as pretty-printed JSON for inclusion in the judge prompt.
 *
 * @param value - Any value; `undefined` means "not provided".
 * @returns Two-space-indented JSON, or "(none)" when there is nothing to
 *   serialize. `JSON.stringify` yields `undefined` rather than a string for
 *   top-level functions and symbols, so those also map to "(none)" to honor
 *   the `string` return type.
 * @throws TypeError propagated from `JSON.stringify` for cyclic structures or
 *   BigInt values.
 */
function formatJsonBlock(value: unknown): string {
  if (value === undefined) {
    return "(none)";
  }
  const rendered: string | undefined = JSON.stringify(value, null, 2);
  return rendered ?? "(none)";
}
index 0000000000..9cc0ab0597 --- /dev/null +++ b/ai_evals/core/models.ts @@ -0,0 +1,185 @@ +import type { EvalMode } from "./types"; + +export interface FrontendEvalModelConfig { + provider: "anthropic" | "openai" | "googleai"; + model: string; +} + +export interface CliEvalModelConfig { + provider: "anthropic"; + model: string; +} + +export interface EvalModelSpec { + id: string; + label: string; + aliases: string[]; + frontend?: FrontendEvalModelConfig; + cli?: CliEvalModelConfig; +} + +export const EVAL_MODELS: EvalModelSpec[] = [ + { + id: "haiku", + label: "Claude Haiku 4.5", + aliases: [ + "haiku", + "haiku-4.5", + "claude-haiku", + "claude-haiku-4.5", + "claude-haiku-4-5", + "claude-haiku-4-5-20251001", + ], + frontend: { + provider: "anthropic", + model: "claude-haiku-4-5-20251001", + }, + cli: { + provider: "anthropic", + model: "haiku", + }, + }, + { + id: "sonnet", + label: "Claude Sonnet 4.5", + aliases: [ + "sonnet", + "sonnet-4.5", + "claude-sonnet", + "claude-sonnet-4.5", + "claude-sonnet-4-5", + "claude-sonnet-4-5-20250929", + ], + frontend: { + provider: "anthropic", + model: "claude-sonnet-4-5-20250929", + }, + cli: { + provider: "anthropic", + model: "sonnet", + }, + }, + { + id: "opus", + label: "Claude Opus 4.6", + aliases: [ + "opus", + "opus-4.6", + "claude-opus", + "claude-opus-4.6", + "claude-opus-4-6", + ], + frontend: { + provider: "anthropic", + model: "claude-opus-4-6", + }, + cli: { + provider: "anthropic", + model: "opus", + }, + }, + { + id: "4o", + label: "GPT-4o", + aliases: ["4o", "gpt-4o"], + frontend: { + provider: "openai", + model: "gpt-4o", + }, + }, + { + id: "gemini-flash", + label: "Gemini 2.5 Flash", + aliases: ["gemini", "gemini-flash", "gemini-2.5-flash"], + frontend: { + provider: "googleai", + model: "gemini-2.5-flash", + }, + }, + { + id: "gemini-pro", + label: "Gemini 2.5 Pro", + aliases: ["gemini-pro", "gemini-2.5-pro"], + frontend: { + provider: "googleai", + model: "gemini-2.5-pro", + }, + }, + { + id: 
/**
 * Builds the model list shown in help output: one line per model with its id,
 * label and the modes it supports.
 *
 * The id column is padded to the longest id in the list (never less than 8
 * characters) so labels stay aligned; a fixed width of 8 is shorter than ids
 * such as "gemini-3.1-pro-preview".
 *
 * @param models - Models to describe; defaults to `EVAL_MODELS`.
 * @returns Newline-joined lines, or an empty string for an empty list.
 */
export function getEvalModelHelpText(models: readonly EvalModelSpec[] = EVAL_MODELS): string {
  const idWidth = models.reduce((width, model) => Math.max(width, model.id.length), 8);

  return models
    .map((model) => {
      // A model with a frontend config supports all three frontend surfaces.
      const modes = [
        ...(model.frontend ? ["flow", "script", "app"] : []),
        ...(model.cli ? ["cli"] : []),
      ];
      return ` ${model.id.padEnd(idWidth)} ${model.label} (${modes.join(", ")})`;
    })
    .join("\n");
}
/**
 * Looks up a model spec by id or alias, ignoring case and surrounding
 * whitespace in `alias`.
 *
 * @param alias - Name supplied by the caller.
 * @returns The first entry of `EVAL_MODELS` whose id or any alias matches, or
 *   `undefined` when nothing matches.
 */
function findEvalModel(alias: string): EvalModelSpec | undefined {
  const wanted = alias.trim().toLowerCase();

  for (const spec of EVAL_MODELS) {
    if (spec.id.toLowerCase() === wanted) {
      return spec;
    }
    if (spec.aliases.some((name) => name.toLowerCase() === wanted)) {
      return spec;
    }
  }
  return undefined;
}
[]; + if (artifactFiles.length === 0) { + attempt.artifactsPath = null; + continue; + } + + const attemptDir = path.join(artifactRoot, caseResult.id, `attempt-${attempt.attempt}`); + await writeArtifactFiles(attemptDir, artifactFiles); + attempt.artifactsPath = attemptDir; + wroteArtifacts = true; + } + } + + result.artifactsPath = wroteArtifacts ? artifactRoot : null; + return result.artifactsPath ?? null; +} + +export function buildRunResult(input: { + mode: EvalMode; + runs: number; + runModel: string | null; + judgeModel: string | null; + caseResults: BenchmarkCaseResult[]; +}): BenchmarkRunResult { + const attemptCount = input.caseResults.reduce((sum, entry) => sum + entry.attempts.length, 0); + const passedAttempts = input.caseResults.reduce( + (sum, entry) => sum + entry.attempts.filter((attempt) => attempt.passed).length, + 0 + ); + const durationTotal = input.caseResults.reduce( + (sum, entry) => sum + entry.attempts.reduce((inner, attempt) => inner + attempt.durationMs, 0), + 0 + ); + const tokenUsageTotal = input.caseResults.reduce( + (sum, entry) => { + for (const attempt of entry.attempts) { + if (!attempt.tokenUsage) { + continue; + } + sum ??= { prompt: 0, completion: 0, total: 0 }; + sum.prompt += attempt.tokenUsage.prompt; + sum.completion += attempt.tokenUsage.completion; + sum.total += attempt.tokenUsage.total; + } + return sum; + }, + null + ); + + return { + version: 1, + mode: input.mode, + createdAt: new Date().toISOString(), + gitSha: getGitSha(), + runs: input.runs, + runModel: input.runModel, + judgeModel: input.judgeModel, + caseCount: input.caseResults.length, + attemptCount, + passedAttempts, + passRate: attemptCount === 0 ? 0 : passedAttempts / attemptCount, + averageDurationMs: attemptCount === 0 ? 0 : durationTotal / attemptCount, + totalTokenUsage: tokenUsageTotal, + averageTokenUsagePerAttempt: + attemptCount === 0 || !tokenUsageTotal + ? 
/**
 * Produces one human-readable line per failed attempt in `result`.
 *
 * Each line reads `<caseId> attempt <n>: <reason>`, where the reason is the
 * comma-separated names of the failed checks, falling back to the attempt's
 * error text and finally to the literal "failed".
 *
 * @param result - Completed benchmark run.
 * @returns Failure lines in case order, then attempt order.
 */
function collectFailures(result: BenchmarkRunResult): string[] {
  return result.cases.flatMap((entry) =>
    entry.attempts
      .filter((attempt) => !attempt.passed)
      .map((attempt) => {
        const failedNames = attempt.checks
          .filter((check) => !check.passed)
          .map((check) => check.name)
          .join(", ");
        const reason = failedNames || attempt.error || "failed";
        return `${entry.id} attempt ${attempt.attempt}: ${reason}`;
      })
  );
}
/**
 * Converts an artifact file path into a safe, forward-slash relative path.
 *
 * Backslashes and forward slashes are both treated as separators, and empty
 * segments (leading, trailing or doubled separators) are dropped.
 *
 * @param filePath - Path reported for an artifact file.
 * @returns The remaining segments joined with "/".
 * @throws Error when no segment remains or any segment is "." or "..", which
 *   keeps the result from pointing outside the directory it is joined onto.
 */
function normalizeArtifactPath(filePath: string): string {
  const segments: string[] = [];

  for (const segment of filePath.split(/[\\/]/)) {
    if (segment === "") {
      continue;
    }
    if (segment === "." || segment === "..") {
      throw new Error(`Invalid artifact path: ${filePath}`);
    }
    segments.push(segment);
  }

  if (segments.length === 0) {
    throw new Error(`Invalid artifact path: ${filePath}`);
  }
  return segments.join("/");
}
null, + failedCaseIds: Array.from( + new Set( + result.cases + .filter((caseResult) => caseResult.attempts.some((attempt) => !attempt.passed)) + .map((caseResult) => caseResult.id) + ) + ), + cases: result.cases.map((caseResult) => { + const attemptCount = caseResult.attempts.length; + const passedAttempts = caseResult.attempts.filter((attempt) => attempt.passed).length; + const totalDurationMs = caseResult.attempts.reduce( + (sum, attempt) => sum + attempt.durationMs, + 0 + ); + const judgeScores = caseResult.attempts.flatMap((attempt) => + typeof attempt.judgeScore === "number" ? [attempt.judgeScore] : [] + ); + const totalTokenUsage = caseResult.attempts.reduce( + (sum, attempt) => { + if (!attempt.tokenUsage) { + return sum; + } + sum ??= { prompt: 0, completion: 0, total: 0 }; + sum.prompt += attempt.tokenUsage.prompt; + sum.completion += attempt.tokenUsage.completion; + sum.total += attempt.tokenUsage.total; + return sum; + }, + null + ); + + return { + id: caseResult.id, + attemptCount, + passedAttempts, + passRate: attemptCount === 0 ? 0 : passedAttempts / attemptCount, + averageDurationMs: attemptCount === 0 ? 0 : totalDurationMs / attemptCount, + averageJudgeScore: + judgeScores.length === 0 + ? null + : judgeScores.reduce((sum, score) => sum + score, 0) / judgeScores.length, + averageTokenUsagePerAttempt: + attemptCount === 0 || !totalTokenUsage + ? 
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param value - Ratio where 1 means 100%.
 * @returns For example "66.7%" for 2/3.
 */
function formatPercent(value: number): string {
  const scaled = value * 100;
  return scaled.toFixed(1) + "%";
}
input.modeRunner.concurrency); + const results = new Array(input.cases.length); + let cursor = 0; + + if (input.modeRunner.mode !== "cli") { + input.onProgress?.({ + type: "run-start", + surface: input.modeRunner.mode, + totalCases: input.cases.length, + runs: input.runs, + concurrency, + }); + } + + async function worker(): Promise { + while (true) { + const caseIndex = cursor++; + if (caseIndex >= input.cases.length) { + return; + } + const evalCase = input.cases[caseIndex]; + results[caseIndex] = { + id: evalCase.id, + prompt: evalCase.prompt, + initialPath: evalCase.initialPath, + expectedPath: evalCase.expectedPath, + attempts: await runCaseAttempts({ + caseIndex, + evalCase, + runs: input.runs, + judgeModel, + judgeThreshold: input.modeRunner.judgeThreshold ?? 80, + modeRunner: input.modeRunner, + totalCases: input.cases.length, + verbose: input.verbose ?? false, + onProgress: input.onProgress, + }), + }; + } + } + + await Promise.all( + Array.from({ length: Math.min(concurrency, input.cases.length) }, () => worker()) + ); + + return results; +} + +async function runCaseAttempts(input: { + caseIndex: number; + evalCase: EvalCase; + runs: number; + judgeModel: string; + judgeThreshold: number; + modeRunner: ModeRunner; + totalCases: number; + verbose: boolean; + onProgress?: (event: FrontendBenchmarkProgressEvent) => void; +}): Promise { + const attempts: BenchmarkAttemptResult[] = []; + const surface = input.modeRunner.mode === "cli" ? 
null : input.modeRunner.mode; + + for (let attempt = 1; attempt <= input.runs; attempt += 1) { + if (surface) { + input.onProgress?.({ + type: "attempt-start", + surface, + caseId: input.evalCase.id, + caseNumber: input.caseIndex + 1, + totalCases: input.totalCases, + attempt, + runs: input.runs, + }); + } + + const startedAt = Date.now(); + + try { + const initial = await input.modeRunner.loadInitial(input.evalCase.initialPath); + const expected = await input.modeRunner.loadExpected(input.evalCase.expectedPath); + const run = await input.modeRunner.run(input.evalCase.prompt, initial, { + caseId: input.evalCase.id, + caseNumber: input.caseIndex + 1, + totalCases: input.totalCases, + attempt, + runs: input.runs, + verbose: input.verbose, + onAssistantMessageStart: input.verbose && surface + ? () => + input.onProgress?.({ + type: "assistant-message-start", + surface, + caseId: input.evalCase.id, + caseNumber: input.caseIndex + 1, + totalCases: input.totalCases, + attempt, + runs: input.runs, + }) + : undefined, + onAssistantChunk: input.verbose && surface + ? (chunk: string) => + input.onProgress?.({ + type: "assistant-chunk", + surface, + caseId: input.evalCase.id, + caseNumber: input.caseIndex + 1, + totalCases: input.totalCases, + attempt, + runs: input.runs, + chunk, + }) + : undefined, + onAssistantMessageEnd: input.verbose && surface + ? 
() => + input.onProgress?.({ + type: "assistant-message-end", + surface, + caseId: input.evalCase.id, + caseNumber: input.caseIndex + 1, + totalCases: input.totalCases, + attempt, + runs: input.runs, + }) + : undefined, + }); + const checks: BenchmarkCheck[] = [ + buildCheck("run succeeded", run.success, run.error), + ...input.modeRunner.validate({ + evalCase: input.evalCase, + prompt: input.evalCase.prompt, + initial, + expected, + actual: run.actual, + run, + }), + ]; + + let judgeScore: number | null = null; + let judgeSummary: string | null = null; + + if (run.success) { + const judge = await judgeOutput({ + mode: input.modeRunner.mode, + prompt: input.evalCase.prompt, + checklist: input.evalCase.judgeChecklist, + initial, + expected: input.modeRunner.mode === "cli" ? undefined : expected, + actual: run.actual, + model: input.judgeModel, + }); + + judgeScore = judge.success ? judge.score : null; + judgeSummary = judge.summary; + checks.push(buildCheck("judge succeeded", judge.success, judge.error)); + checks.push( + buildCheck( + `judge score >= ${input.judgeThreshold}`, + (judgeScore ?? 0) >= input.judgeThreshold, + judge.success ? `score=${judgeScore}` : judge.error + ) + ); + } + + const artifactFiles = input.modeRunner.buildArtifacts?.(run.actual) ?? []; + const attemptResult: BenchmarkAttemptResult = { + attempt, + passed: checks.every((check) => check.passed), + durationMs: Date.now() - startedAt, + assistantMessageCount: run.assistantMessageCount, + toolCallCount: run.toolCallCount, + toolsUsed: uniqueStrings(run.toolsUsed), + skillsInvoked: uniqueStrings(run.skillsInvoked), + checks, + judgeScore, + judgeSummary, + error: run.error ?? null, + tokenUsage: run.tokenUsage ?? 
null, + artifactsPath: null, + artifactFiles, + }; + + if (surface) { + input.onProgress?.({ + type: "attempt-finish", + surface, + caseId: input.evalCase.id, + caseNumber: input.caseIndex + 1, + totalCases: input.totalCases, + attempt, + runs: input.runs, + passed: attemptResult.passed, + durationMs: attemptResult.durationMs, + judgeScore: attemptResult.judgeScore, + error: attemptResult.error, + }); + } + + attempts.push(attemptResult); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + const failedAttempt: BenchmarkAttemptResult = { + attempt, + passed: false, + durationMs: Date.now() - startedAt, + assistantMessageCount: 0, + toolCallCount: 0, + toolsUsed: [], + skillsInvoked: [], + checks: [buildCheck("run crashed", false, message)], + judgeScore: null, + judgeSummary: null, + error: message, + tokenUsage: null, + }; + if (surface) { + input.onProgress?.({ + type: "attempt-finish", + surface, + caseId: input.evalCase.id, + caseNumber: input.caseIndex + 1, + totalCases: input.totalCases, + attempt, + runs: input.runs, + passed: false, + durationMs: failedAttempt.durationMs, + judgeScore: null, + error: message, + }); + } + attempts.push(failedAttempt); + } + } + + return attempts; +} + +function buildCheck(name: string, passed: boolean, details?: string): BenchmarkCheck { + return details ? 
/**
 * Removes duplicate strings while keeping first-occurrence order.
 *
 * @param values - Strings that may contain repeats.
 * @returns A new array; `values` is not modified.
 */
function uniqueStrings(values: string[]): string[] {
  const distinct = new Set(values);
  return Array.from(distinct);
}
loadInitial(path?: string): Promise; + loadExpected(path?: string): Promise; + run( + prompt: string, + initial: TInitial | undefined, + context: ModeRunContext + ): Promise>; + validate(input: { + evalCase: EvalCase; + prompt: string; + initial: TInitial | undefined; + expected: TExpected | undefined; + actual: TActual; + run: ModeRunOutput; + }): BenchmarkCheck[]; + buildArtifacts?(actual: TActual): BenchmarkArtifactFile[]; +} + +export interface BenchmarkAttemptResult { + attempt: number; + passed: boolean; + durationMs: number; + assistantMessageCount: number; + toolCallCount: number; + toolsUsed: string[]; + skillsInvoked: string[]; + checks: BenchmarkCheck[]; + judgeScore: number | null; + judgeSummary: string | null; + error: string | null; + tokenUsage?: BenchmarkTokenUsage | null; + artifactsPath?: string | null; + artifactFiles?: BenchmarkArtifactFile[]; +} + +export interface BenchmarkCaseResult { + id: string; + prompt: string; + initialPath?: string; + expectedPath?: string; + attempts: BenchmarkAttemptResult[]; +} + +export interface BenchmarkRunResult { + version: 1; + mode: EvalMode; + createdAt: string; + gitSha: string | null; + runs: number; + runModel: string | null; + judgeModel: string | null; + caseCount: number; + attemptCount: number; + passedAttempts: number; + passRate: number; + averageDurationMs: number; + totalTokenUsage?: BenchmarkTokenUsage | null; + averageTokenUsagePerAttempt?: BenchmarkTokenUsage | null; + artifactsPath?: string | null; + cases: BenchmarkCaseResult[]; +} + +export type FrontendBenchmarkProgressEvent = + | { + type: "run-start"; + surface: Exclude; + totalCases: number; + runs: number; + concurrency: number; + } + | { + type: "attempt-start"; + surface: Exclude; + caseId: string; + caseNumber: number; + totalCases: number; + attempt: number; + runs: number; + } + | { + type: "attempt-finish"; + surface: Exclude; + caseId: string; + caseNumber: number; + totalCases: number; + attempt: number; + runs: number; + 
passed: boolean; + durationMs: number; + judgeScore: number | null; + error: string | null; + } + | { + type: "assistant-message-start"; + surface: Exclude; + caseId: string; + caseNumber: number; + totalCases: number; + attempt: number; + runs: number; + } + | { + type: "assistant-chunk"; + surface: Exclude; + caseId: string; + caseNumber: number; + totalCases: number; + attempt: number; + runs: number; + chunk: string; + } + | { + type: "assistant-message-end"; + surface: Exclude; + caseId: string; + caseNumber: number; + totalCases: number; + attempt: number; + runs: number; + }; diff --git a/ai_evals/core/validators.test.ts b/ai_evals/core/validators.test.ts new file mode 100644 index 0000000000..93578f6c5f --- /dev/null +++ b/ai_evals/core/validators.test.ts @@ -0,0 +1,36 @@ +import { describe, expect, it } from "bun:test"; +import { validateScriptState } from "./validators"; + +describe("validateScriptState", () => { + it("accepts semantically equivalent script implementations", () => { + const checks = validateScriptState({ + actual: { + path: "f/evals/greet_user.ts", + lang: "bun", + code: "export async function main(name: string): Promise {\n return `Hello, ${name}!`;\n}\n", + }, + expected: { + path: "f/evals/greet_user.ts", + lang: "bun", + code: "export async function main(name: string) {\n\treturn `Hello, ${name}!`\n}\n", + }, + }); + + expect(checks.every((check) => check.passed)).toBe(true); + }); + + it("still requires an exported main entrypoint", () => { + const checks = validateScriptState({ + actual: { + path: "f/evals/greet_user.ts", + lang: "bun", + code: "async function main(name: string) {\n return `Hello, ${name}!`;\n}\n", + }, + }); + + expect(checks).toContainEqual({ + name: "script exports entrypoint", + passed: false, + }); + }); +}); diff --git a/ai_evals/core/validators.ts b/ai_evals/core/validators.ts new file mode 100644 index 0000000000..86ddc70566 --- /dev/null +++ b/ai_evals/core/validators.ts @@ -0,0 +1,997 @@ +import path from 
"node:path"; +import ts from "typescript"; +import type { BenchmarkCheck, FlowValidationSpec } from "./types"; + +export interface ScriptState { + path: string; + lang: string; + args?: Record; + code: string; +} + +export interface FlowState { + summary?: string; + value?: { + preprocessor_module?: Record; + failure_module?: Record; + modules?: Array>; + [key: string]: unknown; + }; + schema?: Record; +} + +export interface AppFilesState { + frontend: Record; + backend: Record; +} + +export interface AppRunnableState { + type?: string; + name?: string; + path?: string; + inlineScript?: { + language?: string; + content?: string; + }; +} + +const TS_LIKE_LANGUAGES = new Set(["bun", "deno", "nativets", "bunnative", "ts", "typescript"]); +const CONTROL_FLOW_MODULE_TYPES = new Set(["branchone", "branchall", "forloopflow", "whileloopflow"]); + +export function validateScriptState(input: { + actual: ScriptState; + initial?: ScriptState; + expected?: ScriptState; +}): BenchmarkCheck[] { + const checks: BenchmarkCheck[] = [ + check("script exports entrypoint", hasSupportedEntrypoint(input.actual.code)), + check("script has no syntax errors", getScriptSyntaxErrors(input.actual.code, input.actual.lang).length === 0), + ]; + + if (input.expected) { + checks.push( + check( + "script path matches expected", + input.actual.path === input.expected.path, + `expected ${input.expected.path}, got ${input.actual.path}` + ) + ); + checks.push( + check( + "script language matches expected", + input.actual.lang === input.expected.lang, + `expected ${input.expected.lang}, got ${input.actual.lang}` + ) + ); + } + + if (input.initial) { + checks.push( + check( + "script differs from initial", + normalizeText(input.actual.code) !== normalizeText(input.initial.code) + ) + ); + } + + return checks; +} + +export function validateFlowState(input: { + actual: FlowState; + initial?: FlowState; + expected?: FlowState; + validate?: FlowValidationSpec; +}): BenchmarkCheck[] { + const actualModules = 
getFlowModules(input.actual); + const placeholderModuleIds = getInlineScriptPlaceholderModuleIds(input.actual); + const checks: BenchmarkCheck[] = [ + check("flow has modules", actualModules.length > 0), + check( + "flow has no inline placeholder code", + placeholderModuleIds.length === 0, + placeholderModuleIds.length > 0 + ? `placeholder content in: ${placeholderModuleIds.join(", ")}` + : undefined + ), + ]; + + if (input.initial) { + checks.push( + check( + "flow differs from initial", + normalizeJson(input.actual) !== normalizeJson(input.initial) + ) + ); + } + + if (input.expected) { + checks.push(...validateFlowExpectedStructure(input.actual, input.expected)); + } + + if (input.validate) { + checks.push(...validateFlowRequirements(input.actual, input.validate)); + } + + return checks; +} + +export function validateAppState(input: { + actual: AppFilesState; + initial?: AppFilesState; + expected?: AppFilesState; +}): BenchmarkCheck[] { + const checks: BenchmarkCheck[] = []; + const frontendEntries = Object.entries(input.actual.frontend ?? {}); + const backendEntries = Object.entries(input.actual.backend ?? 
{}); + const frontendSyntaxProblems = getAppFrontendSyntaxProblems(input.actual.frontend); + const backendSyntaxProblems = getAppBackendSyntaxProblems(input.actual.backend); + const unresolvedBackendRefs = getUnresolvedBackendReferences( + input.actual.frontend, + input.actual.backend + ); + + checks.push(check("app has frontend entrypoint", Boolean(input.actual.frontend["/index.tsx"]))); + checks.push( + check( + "app has non-empty frontend files", + frontendEntries.some(([, content]) => content.trim().length > 0) + ) + ); + checks.push( + check( + "frontend files have no syntax errors", + frontendSyntaxProblems.length === 0, + summarizeProblems(frontendSyntaxProblems) + ) + ); + checks.push( + check( + "backend inline scripts have entrypoints", + backendEntries.every(([, runnable]) => { + if (runnable.type !== "inline") { + return true; + } + return hasSupportedEntrypoint(runnable.inlineScript?.content ?? ""); + }) + ) + ); + checks.push( + check( + "backend inline scripts have no syntax errors", + backendSyntaxProblems.length === 0, + summarizeProblems(backendSyntaxProblems) + ) + ); + checks.push( + check( + "frontend backend references resolve", + unresolvedBackendRefs.length === 0, + summarizeProblems(unresolvedBackendRefs) + ) + ); + + if (input.initial) { + checks.push(check("app differs from initial", !appStatesEqual(input.actual, input.initial))); + } + + if (input.expected) { + for (const [filePath, content] of Object.entries(input.expected.frontend)) { + checks.push( + check( + `frontend includes ${filePath}`, + normalizeText(input.actual.frontend[filePath] ?? 
"") === normalizeText(content) + ) + ); + } + for (const [runnableName, runnable] of Object.entries(input.expected.backend)) { + const actualRunnable = input.actual.backend[runnableName]; + checks.push(check(`backend includes ${runnableName}`, Boolean(actualRunnable))); + if (actualRunnable && runnable.inlineScript?.content) { + checks.push( + check( + `${runnableName} code matches expected`, + normalizeText(actualRunnable.inlineScript?.content ?? "") === + normalizeText(runnable.inlineScript.content) + ) + ); + } + } + } + + return checks; +} + +export function validateCliWorkspace(input: { + actualFiles: Record; + expectedFiles?: Record; + initialFiles?: Record; +}): BenchmarkCheck[] { + const checks: BenchmarkCheck[] = []; + + if (input.expectedFiles) { + for (const [filePath, expectedContent] of Object.entries(input.expectedFiles)) { + const actualContent = input.actualFiles[filePath]; + checks.push(check(`creates ${filePath}`, actualContent !== undefined)); + if (actualContent !== undefined) { + checks.push( + check( + `${filePath} contains expected content`, + cliFileContainsExpectedContent(actualContent, expectedContent) + ) + ); + } + } + + const expectedPaths = new Set(Object.keys(input.expectedFiles)); + const unexpectedPaths = Object.keys(input.actualFiles).filter((filePath) => !expectedPaths.has(filePath)); + checks.push( + check( + "workspace contains no unexpected files", + unexpectedPaths.length === 0, + summarizeProblems(unexpectedPaths) + ) + ); + } + + if (input.initialFiles) { + checks.push(check("workspace differs from initial", !fileMapsEqual(input.actualFiles, input.initialFiles))); + } + + return checks; +} + +function cliFileContainsExpectedContent(actualContent: string, expectedContent: string): boolean { + const expectedSnippets = expectedContent + .replace(/\r\n/g, "\n") + .split("\n") + .map((line) => line.trim()) + .filter((line) => line.length > 0); + + if (expectedSnippets.length === 0) { + return true; + } + + const normalizedActual 
= actualContent.replace(/\r\n/g, "\n"); + + return expectedSnippets.every((snippet) => normalizedActual.includes(snippet)); +} + +function check(name: string, passed: boolean, details?: string): BenchmarkCheck { + return !passed && details ? { name, passed, details } : { name, passed }; +} + +function normalizeText(value: string): string { + return value.replace(/\r\n/g, "\n").trim(); +} + +function normalizeJson(value: unknown): string { + return JSON.stringify(value); +} + +function summarizeProblems(problems: string[], limit = 5): string | undefined { + if (problems.length === 0) { + return undefined; + } + + if (problems.length <= limit) { + return problems.join("; "); + } + + return `${problems.slice(0, limit).join("; ")}; ...and ${problems.length - limit} more`; +} + +function hasSupportedEntrypoint(code: string): boolean { + return ( + /export\s+(async\s+)?function\s+main\s*\(/.test(code) || + /export\s+default\s+(async\s+)?function\s*\(/.test(code) + ); +} + +function getScriptSyntaxErrors(code: string, lang: string): string[] { + if (!TS_LIKE_LANGUAGES.has(lang)) { + return []; + } + + return getTypeScriptSyntaxErrors(code, "eval.ts"); +} + +function getTypeScriptSyntaxErrors(code: string, fileName: string): string[] { + const result = ts.transpileModule(code, { + compilerOptions: { + target: ts.ScriptTarget.ES2022, + module: ts.ModuleKind.ESNext, + jsx: ts.JsxEmit.ReactJSX, + }, + reportDiagnostics: true, + fileName, + }); + + return (result.diagnostics ?? 
[]).map((diagnostic) => + ts.flattenDiagnosticMessageText(diagnostic.messageText, "\n") + ); +} + +function getAppFrontendSyntaxProblems(frontend: Record): string[] { + const problems: string[] = []; + + for (const [filePath, content] of Object.entries(frontend)) { + if (!isFrontendCodeFile(filePath)) { + continue; + } + + const errors = getTypeScriptSyntaxErrors(content, filePath); + for (const error of errors) { + problems.push(`${filePath}: ${error}`); + } + } + + return problems; +} + +function getAppBackendSyntaxProblems(backend: Record): string[] { + const problems: string[] = []; + + for (const [key, runnable] of Object.entries(backend)) { + if (runnable.type !== "inline") { + continue; + } + + const language = runnable.inlineScript?.language ?? ""; + const content = runnable.inlineScript?.content ?? ""; + for (const error of getScriptSyntaxErrors(content, language)) { + problems.push(`${key}: ${error}`); + } + } + + return problems; +} + +function isFrontendCodeFile(filePath: string): boolean { + const extension = path.extname(filePath).toLowerCase(); + return extension === ".ts" || extension === ".tsx" || extension === ".js" || extension === ".jsx"; +} + +function getUnresolvedBackendReferences( + frontend: Record, + backend: Record +): string[] { + const backendKeys = new Set(Object.keys(backend)); + const unresolved = new Set(); + + for (const [filePath, content] of Object.entries(frontend)) { + for (const key of extractBackendCallKeys(content)) { + if (!backendKeys.has(key)) { + unresolved.add(`${filePath} references missing backend.${key}()`); + } + } + } + + return [...unresolved]; +} + +function extractBackendCallKeys(content: string): string[] { + const matches = content.matchAll(/\bbackend\.([A-Za-z_][A-Za-z0-9_]*)\s*\(/g); + return [...new Set([...matches].map((match) => match[1]))]; +} + +function getFlowModules(flow: FlowState): Array> { + return Array.isArray(flow.value?.modules) ? 
flow.value.modules : []; +} + +function validateFlowExpectedStructure( + actual: FlowState, + expected: FlowState +): BenchmarkCheck[] { + const checks: BenchmarkCheck[] = []; + const expectedTopLevelModules = getFlowModules(expected); + const actualTopLevelModules = getFlowModules(actual); + + const expectedSchemaFields = getTopLevelSchemaFields(expected.schema); + if (expectedSchemaFields.length > 0) { + checks.push( + check( + "flow schema includes expected top-level fields", + expectedSchemaFields.every((field) => hasSchemaPath(actual.schema, field)), + `missing one of: ${expectedSchemaFields.join(", ")}` + ) + ); + } + + if (expectedTopLevelModules.length > 0) { + const actualIds = actualTopLevelModules + .map((module) => (typeof module.id === "string" ? module.id : null)) + .filter((id): id is string => Boolean(id)); + const expectedIds = expectedTopLevelModules + .map((module) => (typeof module.id === "string" ? module.id : null)) + .filter((id): id is string => Boolean(id)); + + checks.push( + check( + "flow includes expected top-level step ids", + expectedIds.every((id) => actualIds.includes(id)), + `expected ids: ${expectedIds.join(", ")}; actual ids: ${actualIds.join(", ")}` + ) + ); + + checks.push( + check( + "flow preserves expected top-level step order", + preservesRelativeOrder(actualIds, expectedIds), + `expected order: ${expectedIds.join(" -> ")}; actual ids: ${actualIds.join(" -> ")}` + ) + ); + + for (const expectedModule of expectedTopLevelModules) { + const moduleId = typeof expectedModule.id === "string" ? 
expectedModule.id : null; + if (!moduleId) { + continue; + } + + const actualModule = actualTopLevelModules.find((module) => module.id === moduleId); + if (!actualModule) { + continue; + } + + const expectedType = getModuleType(expectedModule); + if (expectedType && !(hasSuspendConfig(expectedModule) || hasSuspendConfig(actualModule))) { + checks.push( + check( + `${moduleId} type matches expected`, + getModuleType(actualModule) === expectedType, + `expected ${expectedType}, got ${getModuleType(actualModule) ?? "(missing)"}` + ) + ); + } + + const expectedPath = getModulePath(expectedModule); + if (expectedPath) { + checks.push( + check( + `${moduleId} path matches expected`, + getModulePath(actualModule) === expectedPath, + `expected ${expectedPath}, got ${getModulePath(actualModule) ?? "(missing)"}` + ) + ); + } + } + } + + for (const specialModuleKey of ["preprocessor_module", "failure_module"] as const) { + const expectedSpecialModule = getSpecialFlowModule(expected, specialModuleKey); + if (!expectedSpecialModule) { + continue; + } + + const actualSpecialModule = getSpecialFlowModule(actual, specialModuleKey); + checks.push(check(`${specialModuleKey} matches expected presence`, Boolean(actualSpecialModule))); + + if (!actualSpecialModule) { + continue; + } + + const expectedType = getModuleType(expectedSpecialModule); + if (expectedType) { + checks.push( + check( + `${specialModuleKey} type matches expected`, + getModuleType(actualSpecialModule) === expectedType, + `expected ${expectedType}, got ${getModuleType(actualSpecialModule) ?? "(missing)"}` + ) + ); + } + } + + return checks; +} + +function validateFlowRequirements( + flow: FlowState, + validate: FlowValidationSpec +): BenchmarkCheck[] { + const checks: BenchmarkCheck[] = []; + + for (const requiredPath of validate.schemaRequiredPaths ?? 
[]) { + checks.push( + check( + `schema includes ${requiredPath}`, + hasSchemaPath(flow.schema, requiredPath), + `missing schema path ${requiredPath}` + ) + ); + } + + if (validate.schemaAnyOf && validate.schemaAnyOf.length > 0) { + const matchingVariant = validate.schemaAnyOf.find((variant) => + variant.requiredPaths.every((requiredPath) => hasSchemaPath(flow.schema, requiredPath)) + ); + + checks.push( + check( + "schema matches one accepted input shape", + Boolean(matchingVariant), + matchingVariant + ? undefined + : `expected one of: ${validate.schemaAnyOf + .map((variant) => `[${variant.requiredPaths.join(", ")}]`) + .join(" or ")}` + ) + ); + } + + if (validate.resolveResultsRefs) { + const unresolved = collectUnresolvedResultsRefs(flow); + checks.push( + check( + "results references resolve", + unresolved.length === 0, + unresolved.length > 0 ? unresolved.join("; ") : undefined + ) + ); + } + + for (const specialModule of validate.requireSpecialModules ?? []) { + checks.push( + check( + `${specialModule} exists`, + Boolean(getSpecialFlowModule(flow, specialModule)) + ) + ); + } + + for (const suspendStep of validate.requireSuspendSteps ?? []) { + const module = findFlowModuleById(flow, suspendStep.id); + checks.push(check(`${suspendStep.id} step exists`, Boolean(module))); + if (!module) { + continue; + } + + checks.push(check(`${suspendStep.id} includes suspend config`, hasSuspendConfig(module))); + if (!hasSuspendConfig(module)) { + continue; + } + + if (suspendStep.requiredEvents !== undefined) { + checks.push( + check( + `${suspendStep.id} requires ${suspendStep.requiredEvents} approval event${suspendStep.requiredEvents === 1 ? "" : "s"}`, + getSuspendRequiredEvents(module) === suspendStep.requiredEvents, + `expected ${suspendStep.requiredEvents}, got ${getSuspendRequiredEvents(module) ?? 
"(missing)"}` + ) + ); + } + + if ( + suspendStep.resumeRequiredStringFieldAnyOf && + suspendStep.resumeRequiredStringFieldAnyOf.length > 0 + ) { + const stringFields = getSuspendResumeStringFields(module); + checks.push( + check( + `${suspendStep.id} resume form includes one accepted comment field`, + suspendStep.resumeRequiredStringFieldAnyOf.some((field) => + stringFields.includes(field) + ), + `expected one of [${suspendStep.resumeRequiredStringFieldAnyOf.join(", ")}], got [${stringFields.join(", ")}]` + ) + ); + } + } + + return checks; +} + +function hasSchemaPath(schema: Record | undefined, dottedPath: string): boolean { + if (!schema || typeof schema !== "object") { + return false; + } + + const segments = dottedPath.split(".").filter(Boolean); + if (segments.length === 0) { + return false; + } + + let current: Record | undefined = schema; + for (const segment of segments) { + const properties = current?.properties; + if (!properties || typeof properties !== "object") { + return false; + } + + const next = (properties as Record)[segment]; + if (!next || typeof next !== "object") { + return false; + } + current = next as Record; + } + + return true; +} + +function getTopLevelSchemaFields(schema: Record | undefined): string[] { + if (!schema || typeof schema !== "object") { + return []; + } + + const properties = schema.properties; + if (!properties || typeof properties !== "object") { + return []; + } + + return Object.keys(properties as Record).filter((key) => key.length > 0); +} + +function preservesRelativeOrder(actualIds: string[], expectedIds: string[]): boolean { + if (expectedIds.length === 0) { + return true; + } + + let cursor = 0; + for (const actualId of actualIds) { + if (actualId === expectedIds[cursor]) { + cursor += 1; + if (cursor === expectedIds.length) { + return true; + } + } + } + + return false; +} + +function collectUnresolvedResultsRefs(flow: FlowState): string[] { + const unresolved = new Set(); + 
validateModuleSequence(getFlowModules(flow), new Map>(), unresolved); + return [...unresolved]; +} + +function validateModuleSequence( + modules: Array>, + parentVisibleModules: Map>, + unresolved: Set +): void { + const visibleModules = new Map(parentVisibleModules); + + for (const module of modules) { + validateResultsRefsInRecord(module, visibleModules, unresolved); + validateNestedModuleResultsRefs(module, visibleModules, unresolved); + + if (typeof module.id === "string" && module.id.length > 0) { + visibleModules.set(module.id, module); + } + } +} + +function validateNestedModuleResultsRefs( + module: Record, + visibleModules: Map>, + unresolved: Set +): void { + const value = isObjectRecord(module.value) ? module.value : null; + if (!value) { + return; + } + + const nestedSequences: Array>> = []; + + if (Array.isArray(value.modules)) { + nestedSequences.push(asModuleArray(value.modules)); + } + + if (Array.isArray(value.default)) { + nestedSequences.push(asModuleArray(value.default)); + } + + if (Array.isArray(value.branches)) { + for (const branch of value.branches) { + if (!isObjectRecord(branch)) { + continue; + } + if (typeof branch.expr === "string") { + validateResultsRefsInExpression( + branch.expr, + `branch ${module.id ?? 
"(unnamed)"}`, + visibleModules, + unresolved + ); + } + if (Array.isArray(branch.modules)) { + nestedSequences.push(asModuleArray(branch.modules)); + } + } + } + + for (const sequence of nestedSequences) { + validateModuleSequence(sequence, visibleModules, unresolved); + } +} + +function validateResultsRefsInRecord( + value: unknown, + visibleModules: Map>, + unresolved: Set, + context = "expression" +): void { + if (typeof value === "string") { + validateResultsRefsInExpression(value, context, visibleModules, unresolved); + return; + } + + if (Array.isArray(value)) { + for (const entry of value) { + validateResultsRefsInRecord(entry, visibleModules, unresolved, context); + } + return; + } + + if (!isObjectRecord(value)) { + return; + } + + for (const [key, entry] of Object.entries(value)) { + if (key === "content" || key === "modules" || key === "branches" || key === "default") { + continue; + } + validateResultsRefsInRecord(entry, visibleModules, unresolved, key); + } +} + +function validateResultsRefsInExpression( + expression: string, + context: string, + visibleModules: Map>, + unresolved: Set +): void { + for (const ref of extractResultsRefs(expression)) { + const module = visibleModules.get(ref.root); + if (!module) { + unresolved.add(`${context} references missing results.${ref.root}`); + continue; + } + validateNestedResultsRefPath(ref.root, ref.path, module, context, unresolved); + } +} + +function extractResultsRefs( + expression: string +): Array<{ root: string; path: string[] }> { + const matches = expression.matchAll(/\bresults\.([A-Za-z0-9_-]+)((?:\.[A-Za-z0-9_-]+)*)/g); + const refs = new Map(); + + for (const match of matches) { + const root = match[1]; + const path = match[2] + .split(".") + .filter(Boolean); + const key = `${root}:${path.join(".")}`; + refs.set(key, { root, path }); + } + + return [...refs.values()]; +} + +function validateNestedResultsRefPath( + rootId: string, + path: string[], + module: Record, + context: string, + 
unresolved: Set +): void { + if (path.length === 0) { + return; + } + + const moduleType = getModuleType(module); + if (!moduleType || !CONTROL_FLOW_MODULE_TYPES.has(moduleType)) { + return; + } + + const nestedIds = new Set(getImmediateNestedModuleIds(module)); + const [firstSegment] = path; + if (nestedIds.has(firstSegment)) { + unresolved.add( + `${context} references nested results.${rootId}.${firstSegment} inside ${moduleType} ${rootId}` + ); + } +} + +function getAllFlowModules(flow: FlowState): Array> { + const modules: Array> = []; + const specialModules = ["preprocessor_module", "failure_module"] as const; + + for (const key of specialModules) { + const specialModule = getSpecialFlowModule(flow, key); + if (specialModule) { + modules.push(specialModule); + modules.push(...collectNestedModules(specialModule)); + } + } + + for (const module of getFlowModules(flow)) { + modules.push(module); + modules.push(...collectNestedModules(module)); + } + + return modules; +} + +function collectNestedModules(module: Record): Array> { + const nested: Array> = []; + const value = isObjectRecord(module.value) ? 
module.value : null; + if (!value) { + return nested; + } + + if (Array.isArray(value.modules)) { + for (const child of asModuleArray(value.modules)) { + nested.push(child, ...collectNestedModules(child)); + } + } + + if (Array.isArray(value.default)) { + for (const child of asModuleArray(value.default)) { + nested.push(child, ...collectNestedModules(child)); + } + } + + if (Array.isArray(value.branches)) { + for (const branch of value.branches) { + if (!isObjectRecord(branch) || !Array.isArray(branch.modules)) { + continue; + } + for (const child of asModuleArray(branch.modules)) { + nested.push(child, ...collectNestedModules(child)); + } + } + } + + return nested; +} + +function findFlowModuleById(flow: FlowState, id: string): Record | null { + for (const module of getAllFlowModules(flow)) { + if (module.id === id) { + return module; + } + } + return null; +} + +function getInlineScriptPlaceholderModuleIds(flow: FlowState): string[] { + return getAllFlowModules(flow).flatMap((module) => { + const code = getModuleCode(module)?.trim(); + if (!code || !/^inline_script\.[A-Za-z0-9_-]+$/.test(code)) { + return []; + } + + if (typeof module.id === "string" && module.id.length > 0) { + return [module.id]; + } + + return ["(unnamed)"]; + }); +} + +function getImmediateNestedModuleIds(module: Record): string[] { + const ids: string[] = []; + const value = isObjectRecord(module.value) ? module.value : null; + if (!value) { + return ids; + } + + if (Array.isArray(value.modules)) { + ids.push(...asModuleArray(value.modules).flatMap((child) => (typeof child.id === "string" ? [child.id] : []))); + } + + if (Array.isArray(value.default)) { + ids.push(...asModuleArray(value.default).flatMap((child) => (typeof child.id === "string" ? 
[child.id] : []))); + } + + if (Array.isArray(value.branches)) { + for (const branch of value.branches) { + if (!isObjectRecord(branch) || !Array.isArray(branch.modules)) { + continue; + } + ids.push( + ...asModuleArray(branch.modules).flatMap((child) => (typeof child.id === "string" ? [child.id] : [])) + ); + } + } + + return ids; +} + +function getModuleCode(module: Record): string | null { + const value = isObjectRecord(module.value) ? module.value : null; + return typeof value?.content === "string" ? value.content : null; +} + +function asModuleArray(value: unknown[]): Array> { + return value.filter(isObjectRecord); +} + +function isObjectRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} + +function getSpecialFlowModule( + flow: FlowState, + key: "preprocessor_module" | "failure_module" +): Record | null { + if (!flow.value || typeof flow.value !== "object") { + return null; + } + const module = (flow.value as Record)[key]; + return module && typeof module === "object" ? (module as Record) : null; +} + +function getModuleType(module: Record): string | null { + const value = module.value; + if (!value || typeof value !== "object") { + return null; + } + return typeof (value as Record).type === "string" + ? ((value as Record).type) + : null; +} + +function getModulePath(module: Record): string | null { + const value = module.value; + if (!value || typeof value !== "object") { + return null; + } + + return typeof (value as Record).path === "string" + ? ((value as Record).path) + : null; +} + +function hasSuspendConfig(module: Record): boolean { + return typeof module.suspend === "object" && module.suspend !== null; +} + +function getSuspendRequiredEvents(module: Record): number | null { + const suspend = isObjectRecord(module.suspend) ? module.suspend : null; + return typeof suspend?.required_events === "number" ? 
suspend.required_events : null; +} + +function getSuspendResumeStringFields(module: Record): string[] { + const suspend = isObjectRecord(module.suspend) ? module.suspend : null; + const resumeForm = isObjectRecord(suspend?.resume_form) ? suspend.resume_form : null; + const schema = isObjectRecord(resumeForm?.schema) ? resumeForm.schema : null; + const properties = isObjectRecord(schema?.properties) ? schema.properties : null; + if (!properties) { + return []; + } + + return Object.entries(properties).flatMap(([field, property]) => { + if (!isObjectRecord(property) || property.type !== "string") { + return []; + } + return [field]; + }); +} + +function appStatesEqual(left: AppFilesState, right: AppFilesState): boolean { + return fileMapsEqual(left.frontend, right.frontend) && fileMapsEqual(stringifyBackend(left.backend), stringifyBackend(right.backend)); +} + +function stringifyBackend(backend: Record): Record { + const result: Record = {}; + for (const [key, value] of Object.entries(backend)) { + result[key] = JSON.stringify(value); + } + return result; +} + +function fileMapsEqual(left: Record, right: Record): boolean { + const leftEntries = Object.entries(left).sort(([a], [b]) => a.localeCompare(b)); + const rightEntries = Object.entries(right).sort(([a], [b]) => a.localeCompare(b)); + if (leftEntries.length !== rightEntries.length) { + return false; + } + return leftEntries.every(([key, value], index) => { + const [otherKey, otherValue] = rightEntries[index]; + return key === otherKey && normalizeText(value) === normalizeText(otherValue); + }); +} diff --git a/ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml b/ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts 
b/ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/flow.yaml b/ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/flow.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/hello.ts b/ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/hello.ts new file mode 100644 index 0000000000..8ce89048c9 --- /dev/null +++ b/ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/hello.ts @@ -0,0 +1,2 @@ +main(name: string) +greeting: `Hello, ${name}!` diff --git a/ai_evals/fixtures/cli/expected/bun-hello-script-uppercase/f/evals/hello.ts b/ai_evals/fixtures/cli/expected/bun-hello-script-uppercase/f/evals/hello.ts new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ai_evals/fixtures/cli/expected/bun-hello-script/f/evals/hello.ts b/ai_evals/fixtures/cli/expected/bun-hello-script/f/evals/hello.ts new file mode 100644 index 0000000000..e489a7b3fc --- /dev/null +++ b/ai_evals/fixtures/cli/expected/bun-hello-script/f/evals/hello.ts @@ -0,0 +1,3 @@ +export async function main(name: string) { + return { greeting: `Hello, ${name}!` }; +} diff --git a/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/evals/reuse_greeting__flow/flow.yaml b/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/evals/reuse_greeting__flow/flow.yaml new file mode 100644 index 0000000000..b24c7ba77a --- /dev/null +++ b/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/evals/reuse_greeting__flow/flow.yaml @@ -0,0 +1,2 @@ +type: script +path: f/lib/format_greeting diff --git a/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/lib/format_greeting.ts b/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/lib/format_greeting.ts new file mode 100644 index 0000000000..e489a7b3fc 
--- /dev/null +++ b/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/lib/format_greeting.ts @@ -0,0 +1,3 @@ +export async function main(name: string) { + return { greeting: `Hello, ${name}!` }; +} diff --git a/ai_evals/fixtures/cli/expected/python-add-numbers-script/f/evals/add_numbers.py b/ai_evals/fixtures/cli/expected/python-add-numbers-script/f/evals/add_numbers.py new file mode 100644 index 0000000000..cbf4ed11cb --- /dev/null +++ b/ai_evals/fixtures/cli/expected/python-add-numbers-script/f/evals/add_numbers.py @@ -0,0 +1,2 @@ +def main( +return {"total": a + b} diff --git a/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml b/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml new file mode 100644 index 0000000000..65a93ca42a --- /dev/null +++ b/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml @@ -0,0 +1,20 @@ +summary: Simple greeting flow +schema: + type: object + properties: + name: + type: string + description: Name to greet + required: + - name +value: + modules: + - id: hello_step + value: + type: rawscript + language: bun + content: !inline hello.ts + input_transforms: + name: + type: javascript + expr: flow_input.name diff --git a/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts b/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts new file mode 100644 index 0000000000..e489a7b3fc --- /dev/null +++ b/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts @@ -0,0 +1,3 @@ +export async function main(name: string) { + return { greeting: `Hello, ${name}!` }; +} diff --git a/ai_evals/fixtures/cli/initial/bun-hello-script-uppercase/f/evals/hello.ts b/ai_evals/fixtures/cli/initial/bun-hello-script-uppercase/f/evals/hello.ts new file mode 100644 index 0000000000..e489a7b3fc --- /dev/null +++ 
b/ai_evals/fixtures/cli/initial/bun-hello-script-uppercase/f/evals/hello.ts @@ -0,0 +1,3 @@ +export async function main(name: string) { + return { greeting: `Hello, ${name}!` }; +} diff --git a/ai_evals/fixtures/cli/initial/flow-reuse-existing-script/f/lib/format_greeting.ts b/ai_evals/fixtures/cli/initial/flow-reuse-existing-script/f/lib/format_greeting.ts new file mode 100644 index 0000000000..e489a7b3fc --- /dev/null +++ b/ai_evals/fixtures/cli/initial/flow-reuse-existing-script/f/lib/format_greeting.ts @@ -0,0 +1,3 @@ +export async function main(name: string) { + return { greeting: `Hello, ${name}!` }; +} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/createFolder/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/createFolder/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/createFolder/main.ts rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/createFolder/main.ts diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/createFolder/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/createFolder/meta.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/createFolder/meta.json rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/createFolder/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/deleteItem/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/deleteItem/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/deleteItem/main.ts rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/deleteItem/main.ts diff --git 
a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/deleteItem/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/deleteItem/meta.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/deleteItem/meta.json rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/deleteItem/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFiles/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFiles/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFiles/main.ts rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFiles/main.ts diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFiles/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFiles/meta.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFiles/meta.json rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFiles/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFolders/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFolders/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFolders/main.ts rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFolders/main.ts diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFolders/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFolders/meta.json similarity index 100% rename from 
frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFolders/meta.json rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFolders/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/moveItem/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/moveItem/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/moveItem/main.ts rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/moveItem/main.ts diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/moveItem/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/moveItem/meta.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/moveItem/meta.json rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/moveItem/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/renameItem/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/renameItem/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/renameItem/main.ts rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/renameItem/main.ts diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/renameItem/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/renameItem/meta.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/renameItem/meta.json rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/renameItem/meta.json diff --git 
a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/Breadcrumb.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/Breadcrumb.tsx similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/Breadcrumb.tsx rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/Breadcrumb.tsx diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FileItem.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FileItem.tsx similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FileItem.tsx rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FileItem.tsx diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FileList.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FileList.tsx similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FileList.tsx rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FileList.tsx diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FolderTree.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FolderTree.tsx similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FolderTree.tsx rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FolderTree.tsx diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/Toolbar.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/Toolbar.tsx 
similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/Toolbar.tsx rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/Toolbar.tsx diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/index.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/index.tsx similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/index.tsx rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/index.tsx diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/addToCart/main.ts b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/addToCart/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/addToCart/main.ts rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/addToCart/main.ts diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/addToCart/meta.json b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/addToCart/meta.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/addToCart/meta.json rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/addToCart/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/calculateTotal/main.ts b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/calculateTotal/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/calculateTotal/main.ts rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/calculateTotal/main.ts diff --git 
a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/calculateTotal/meta.json b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/calculateTotal/meta.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/calculateTotal/meta.json rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/calculateTotal/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/getProducts/main.ts b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/getProducts/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/getProducts/main.ts rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/getProducts/main.ts diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/getProducts/meta.json b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/getProducts/meta.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/getProducts/meta.json rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/getProducts/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/removeFromCart/main.ts b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/removeFromCart/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/removeFromCart/main.ts rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/removeFromCart/main.ts diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/removeFromCart/meta.json b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/removeFromCart/meta.json similarity index 100% rename from 
frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/removeFromCart/meta.json rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/removeFromCart/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/Cart.tsx b/ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/Cart.tsx similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/Cart.tsx rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/Cart.tsx diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/ProductCard.tsx b/ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/ProductCard.tsx similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/ProductCard.tsx rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/ProductCard.tsx diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/ProductList.tsx b/ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/ProductList.tsx similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/ProductList.tsx rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/ProductList.tsx diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/index.tsx b/ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/index.tsx similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/index.tsx rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/index.tsx diff --git 
a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/decrementCounter/main.ts b/ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/decrementCounter/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/decrementCounter/main.ts rename to ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/decrementCounter/main.ts diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/decrementCounter/meta.json b/ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/decrementCounter/meta.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/decrementCounter/meta.json rename to ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/decrementCounter/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/incrementCounter/main.ts b/ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/incrementCounter/main.ts similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/incrementCounter/main.ts rename to ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/incrementCounter/main.ts diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/incrementCounter/meta.json b/ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/incrementCounter/meta.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/incrementCounter/meta.json rename to ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/incrementCounter/meta.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/frontend/index.tsx 
b/ai_evals/fixtures/frontend/app/initial/test1_counter_app/frontend/index.tsx similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/frontend/index.tsx rename to ai_evals/fixtures/frontend/app/initial/test1_counter_app/frontend/index.tsx diff --git a/ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json b/ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json new file mode 100644 index 0000000000..30b2133004 --- /dev/null +++ b/ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json @@ -0,0 +1,31 @@ +{ + "summary": "", + "value": { + "modules": [ + { + "id": "sum_numbers", + "value": { + "type": "rawscript", + "language": "bun", + "content": "export async function main(a: number, b: number) {\n return a + b;\n}", + "input_transforms": { + "a": { + "type": "javascript", + "expr": "flow_input.a" + }, + "b": { + "type": "javascript", + "expr": "flow_input.b" + } + } + } + } + ] + }, + "schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "properties": {}, + "required": [], + "type": "object" + } +} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test1.json b/ai_evals/fixtures/frontend/flow/expected/test1.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test1.json rename to ai_evals/fixtures/frontend/flow/expected/test1.json diff --git a/ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json b/ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json new file mode 100644 index 0000000000..328cf34651 --- /dev/null +++ b/ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json @@ -0,0 +1,30 @@ +{ + "value": { + "modules": [ + { + "id": "count_until_target", + "value": { + "type": "whileloopflow" + } + }, + { + "id": "return_final_count", + "value": { + "type": "rawscript" + } + } + ] + }, + "schema": { + "$schema": 
"https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "target": { + "type": "number" + } + }, + "required": [ + "target" + ] + } +} diff --git a/ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json b/ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json new file mode 100644 index 0000000000..c1a79f269d --- /dev/null +++ b/ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json @@ -0,0 +1,36 @@ +{ + "value": { + "preprocessor_module": { + "id": "preprocessor", + "value": { + "type": "rawscript" + } + }, + "failure_module": { + "id": "failure", + "value": { + "type": "rawscript" + } + }, + "modules": [ + { + "id": "process_event", + "value": { + "type": "rawscript" + } + } + ] + }, + "schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "payload": { + "type": "string" + } + }, + "required": [ + "payload" + ] + } +} diff --git a/ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json b/ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json new file mode 100644 index 0000000000..5970c3ef1c --- /dev/null +++ b/ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json @@ -0,0 +1,44 @@ +{ + "value": { + "modules": [ + { + "id": "request_approval", + "suspend": { + "required_events": 1, + "resume_form": { + "schema": { + "approver_comment": { + "type": "string" + } + } + } + }, + "value": { + "type": "rawscript" + } + }, + { + "id": "finalize_purchase", + "value": { + "type": "rawscript" + } + } + ] + }, + "schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "requester_email": { + "type": "string" + }, + "amount": { + "type": "number" + } + }, + "required": [ + "requester_email", + "amount" + ] + } +} diff --git a/ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json 
b/ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json new file mode 100644 index 0000000000..f5ab58c476 --- /dev/null +++ b/ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json @@ -0,0 +1,39 @@ +{ + "value": { + "modules": [ + { + "id": "sum_numbers", + "value": { + "type": "script", + "path": "f/evals/add_two_numbers.ts", + "input_transforms": { + "a": { + "type": "javascript", + "expr": "flow_input.a" + }, + "b": { + "type": "javascript", + "expr": "flow_input.b" + } + } + } + } + ] + }, + "schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + } +} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test2.json b/ai_evals/fixtures/frontend/flow/expected/test2.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test2.json rename to ai_evals/fixtures/frontend/flow/expected/test2.json diff --git a/ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json b/ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json new file mode 100644 index 0000000000..33021252a9 --- /dev/null +++ b/ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json @@ -0,0 +1,39 @@ +{ + "value": { + "modules": [ + { + "id": "call_add_numbers", + "value": { + "type": "flow", + "path": "f/evals/add_numbers_flow", + "input_transforms": { + "a": { + "type": "javascript", + "expr": "flow_input.a" + }, + "b": { + "type": "javascript", + "expr": "flow_input.b" + } + } + } + } + ] + }, + "schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + } +} diff --git 
a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test3.json b/ai_evals/fixtures/frontend/flow/expected/test3.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test3.json rename to ai_evals/fixtures/frontend/flow/expected/test3.json diff --git a/ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json b/ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json new file mode 100644 index 0000000000..912919a435 --- /dev/null +++ b/ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json @@ -0,0 +1,24 @@ +{ + "value": { + "modules": [ + { + "id": "route_by_tier", + "value": { + "type": "branchone" + } + } + ] + }, + "schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "tier": { + "type": "string" + } + }, + "required": [ + "tier" + ] + } +} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test4.json b/ai_evals/fixtures/frontend/flow/expected/test4.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test4.json rename to ai_evals/fixtures/frontend/flow/expected/test4.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test5_modify_simple.json b/ai_evals/fixtures/frontend/flow/expected/test5_modify_simple.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test5_modify_simple.json rename to ai_evals/fixtures/frontend/flow/expected/test5_modify_simple.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test6_modify_medium.json b/ai_evals/fixtures/frontend/flow/expected/test6_modify_medium.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test6_modify_medium.json rename to ai_evals/fixtures/frontend/flow/expected/test6_modify_medium.json diff --git 
a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test7_modify_complex.json b/ai_evals/fixtures/frontend/flow/expected/test7_modify_complex.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test7_modify_complex.json rename to ai_evals/fixtures/frontend/flow/expected/test7_modify_complex.json diff --git a/ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json b/ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json new file mode 100644 index 0000000000..6540a36c26 --- /dev/null +++ b/ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json @@ -0,0 +1,29 @@ +{ + "workspace": { + "scripts": [ + { + "path": "f/evals/add_two_numbers.ts", + "summary": "Add two numbers", + "description": "Returns the sum of two numeric inputs.", + "language": "bun", + "schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + }, + "content": "export async function main(a: number, b: number) {\n return a + b;\n}\n" + } + ] + } +} diff --git a/ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json b/ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json new file mode 100644 index 0000000000..03a29c28e4 --- /dev/null +++ b/ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json @@ -0,0 +1,49 @@ +{ + "workspace": { + "flows": [ + { + "path": "f/evals/add_numbers_flow", + "summary": "Add two numbers in a subflow", + "description": "Takes two numeric inputs and returns their sum.", + "schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + }, + "value": { + 
"modules": [ + { + "id": "sum_numbers", + "value": { + "type": "rawscript", + "language": "bun", + "content": "export async function main(a: number, b: number) {\n return a + b;\n}", + "input_transforms": { + "a": { + "type": "javascript", + "expr": "flow_input.a" + }, + "b": { + "type": "javascript", + "expr": "flow_input.b" + } + } + } + } + ] + } + } + ] + } +} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test5_initial.json b/ai_evals/fixtures/frontend/flow/initial/test5_initial.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test5_initial.json rename to ai_evals/fixtures/frontend/flow/initial/test5_initial.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test6_initial.json b/ai_evals/fixtures/frontend/flow/initial/test6_initial.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test6_initial.json rename to ai_evals/fixtures/frontend/flow/initial/test6_initial.json diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test7_initial.json b/ai_evals/fixtures/frontend/flow/initial/test7_initial.json similarity index 100% rename from frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test7_initial.json rename to ai_evals/fixtures/frontend/flow/initial/test7_initial.json diff --git a/ai_evals/fixtures/frontend/script/expected/test1_greet_user.json b/ai_evals/fixtures/frontend/script/expected/test1_greet_user.json new file mode 100644 index 0000000000..b6cd4c7395 --- /dev/null +++ b/ai_evals/fixtures/frontend/script/expected/test1_greet_user.json @@ -0,0 +1,8 @@ +{ + "path": "f/evals/greet_user.ts", + "lang": "bun", + "args": { + "name": "Alice" + }, + "code": "export async function main(name: string) {\n\treturn `Hello, ${name}!`\n}\n" +} diff --git a/ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json 
b/ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json new file mode 100644 index 0000000000..f1e1e90df7 --- /dev/null +++ b/ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json @@ -0,0 +1,8 @@ +{ + "path": "f/evals/greet_user.ts", + "lang": "bun", + "args": { + "name": "Alice" + }, + "code": "export async function main(name: string) {\n\treturn ''\n}\n" +} diff --git a/ai_evals/history/app.jsonl b/ai_evals/history/app.jsonl new file mode 100644 index 0000000000..3f174ec671 --- /dev/null +++ b/ai_evals/history/app.jsonl @@ -0,0 +1,3 @@ +{"createdAt":"2026-04-10T14:24:42.248Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"app","runs":1,"runModel":"anthropic:claude-haiku-4-5-20251001","judgeModel":"claude-sonnet-4-6","caseCount":9,"attemptCount":9,"passedAttempts":7,"passRate":0.7777777777777778,"averageDurationMs":25680.777777777777,"averageJudgeScore":76.55555555555556,"averageTokenUsagePerAttempt":{"prompt":53989.22222222222,"completion":2629.222222222222,"total":56618.444444444445},"failedCaseIds":["app-test8-inventory-tracker-create","app-test9-recipe-book-create"],"cases":[{"id":"app-test1-counter-create","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":11071,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":17912,"completion":1079,"total":18991}},{"id":"app-test2-counter-reset","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":12121,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":19088,"completion":833,"total":19921}},{"id":"app-test3-shopping-cart-quantity","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":25852,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":58834,"completion":2446,"total":61280}},{"id":"app-test4-shopping-cart-discount","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":42350,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":90882,"completion":4984,"total":95866}},{"id":"app-te
st5-file-manager-search","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29129,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":81980,"completion":2817,"total":84797}},{"id":"app-test6-file-manager-inline-rename","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":51576,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":110023,"completion":6328,"total":116351}},{"id":"app-test7-file-manager-select-all","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":39256,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":72006,"completion":4188,"total":76194}},{"id":"app-test8-inventory-tracker-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":10514,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":17600,"completion":511,"total":18111}},{"id":"app-test9-recipe-book-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":9258,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":17578,"completion":477,"total":18055}}]} 
+{"createdAt":"2026-04-10T14:27:49.271Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"app","runs":1,"runModel":"anthropic:claude-opus-4-6","judgeModel":"claude-sonnet-4-6","caseCount":9,"attemptCount":9,"passedAttempts":6,"passRate":0.6666666666666666,"averageDurationMs":57285.666666666664,"averageJudgeScore":82.55555555555556,"averageTokenUsagePerAttempt":{"prompt":54435.77777777778,"completion":3668.6666666666665,"total":58104.444444444445},"failedCaseIds":["app-test7-file-manager-select-all","app-test8-inventory-tracker-create","app-test9-recipe-book-create"],"cases":[{"id":"app-test1-counter-create","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":17930,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":17620,"completion":743,"total":18363}},{"id":"app-test2-counter-reset","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":17852,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":18887,"completion":701,"total":19588}},{"id":"app-test3-shopping-cart-quantity","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":43501,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":38855,"completion":2692,"total":41547}},{"id":"app-test4-shopping-cart-discount","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":60820,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":61707,"completion":3420,"total":65127}},{"id":"app-test5-file-manager-search","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":45253,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":67244,"completion":3031,"total":70275}},{"id":"app-test6-file-manager-inline-rename","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":104837,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":116979,"completion":6834,"total":123813}},{"id":"app-test7-file-manager-select-all","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurati
onMs":73325,"averageJudgeScore":78,"averageTokenUsagePerAttempt":{"prompt":76351,"completion":5239,"total":81590}},{"id":"app-test8-inventory-tracker-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":133705,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":68546,"completion":9881,"total":78427}},{"id":"app-test9-recipe-book-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":18348,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":23733,"completion":477,"total":24210}}]} +{"createdAt":"2026-04-10T14:29:28.396Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"app","runs":1,"runModel":"openai:gpt-4o","judgeModel":"claude-sonnet-4-6","caseCount":9,"attemptCount":9,"passedAttempts":5,"passRate":0.5555555555555556,"averageDurationMs":31682.555555555555,"averageJudgeScore":73.11111111111111,"averageTokenUsagePerAttempt":{"prompt":27221.222222222223,"completion":1564.6666666666667,"total":28785.88888888889},"failedCaseIds":["app-test6-file-manager-inline-rename","app-test7-file-manager-select-all","app-test8-inventory-tracker-create","app-test9-recipe-book-create"],"cases":[{"id":"app-test1-counter-create","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":9911,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":8116,"completion":525,"total":8641}},{"id":"app-test2-counter-reset","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15146,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":13096,"completion":576,"total":13672}},{"id":"app-test3-shopping-cart-quantity","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":31146,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":42424,"completion":1691,"total":44115}},{"id":"app-test4-shopping-cart-discount","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":49382,"averageJudgeScore":92,"averageTokenUsagePerAttempt":{"prompt":35785,"comple
tion":3345,"total":39130}},{"id":"app-test5-file-manager-search","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":62963,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":46902,"completion":3590,"total":50492}},{"id":"app-test6-file-manager-inline-rename","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":24203,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":33121,"completion":498,"total":33619}},{"id":"app-test7-file-manager-select-all","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":74058,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":46026,"completion":3591,"total":49617}},{"id":"app-test8-inventory-tracker-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":6757,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":7770,"completion":165,"total":7935}},{"id":"app-test9-recipe-book-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":11577,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":11751,"completion":101,"total":11852}}]} diff --git a/ai_evals/history/cli.jsonl b/ai_evals/history/cli.jsonl new file mode 100644 index 0000000000..45be5669fa --- /dev/null +++ b/ai_evals/history/cli.jsonl @@ -0,0 +1,2 @@ 
+{"createdAt":"2026-04-10T14:25:39.106Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"cli","runs":1,"runModel":"anthropic:haiku","judgeModel":"claude-sonnet-4-6","caseCount":6,"attemptCount":6,"passedAttempts":6,"passRate":1,"averageDurationMs":21746,"averageJudgeScore":99.16666666666667,"averageTokenUsagePerAttempt":null,"failedCaseIds":[],"cases":[{"id":"bun-hello-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16588,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-flow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":27642,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"python-add-numbers-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":23640,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-script-uppercase","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":19379,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-flow-punctuation","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21993,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"flow-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21234,"averageJudgeScore":95,"averageTokenUsagePerAttempt":null}]} 
+{"createdAt":"2026-04-10T14:28:09.045Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"cli","runs":1,"runModel":"anthropic:opus","judgeModel":"claude-sonnet-4-6","caseCount":6,"attemptCount":6,"passedAttempts":6,"passRate":1,"averageDurationMs":24988.833333333332,"averageJudgeScore":99.66666666666667,"averageTokenUsagePerAttempt":null,"failedCaseIds":[],"cases":[{"id":"bun-hello-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":22034,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-flow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":28030,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"python-add-numbers-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16668,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-script-uppercase","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21269,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-flow-punctuation","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":30126,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"flow-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":31806,"averageJudgeScore":98,"averageTokenUsagePerAttempt":null}]} diff --git a/ai_evals/history/flow.jsonl b/ai_evals/history/flow.jsonl new file mode 100644 index 0000000000..9bdb9a6c77 --- /dev/null +++ b/ai_evals/history/flow.jsonl @@ -0,0 +1,3 @@ 
+{"createdAt":"2026-04-10T14:25:16.664Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"flow","runs":1,"runModel":"anthropic:claude-haiku-4-5-20251001","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":8,"passRate":0.6153846153846154,"averageDurationMs":33424.692307692305,"averageJudgeScore":82.61538461538461,"averageTokenUsagePerAttempt":{"prompt":131901,"completion":3121.230769230769,"total":135022.23076923078},"failedCaseIds":["flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16943,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":126615,"completion":839,"total":127454}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15220,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":75614,"completion":805,"total":76419}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15699,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":76182,"completion":887,"total":77069}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21605,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":62230,"completion":1509,"total":63739}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":47228,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":143511,"completion":5443,"total":148954}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":81870,"averageJudgeScore":92,"averageTokenUsagePerAttempt":{"prompt":194542,"completion":12409,"total":206951}},{"id":"flow-test6-ai-agent-
tools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":51878,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":142071,"completion":5720,"total":147791}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":49113,"averageJudgeScore":42,"averageTokenUsagePerAttempt":{"prompt":318525,"completion":2702,"total":321227}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":18244,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":78441,"completion":979,"total":79420}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":49485,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":135237,"completion":5467,"total":140704}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":21210,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":127844,"completion":1179,"total":129023}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":25142,"averageJudgeScore":42,"averageTokenUsagePerAttempt":{"prompt":128648,"completion":1337,"total":129985}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":20884,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":105253,"completion":1300,"total":106553}}]} 
+{"createdAt":"2026-04-10T14:57:17.513Z","gitSha":"2a58402cfc5c320748839e92b51a1291b937bf26","mode":"flow","runs":1,"runModel":"anthropic:claude-opus-4-6","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":8,"passRate":0.6153846153846154,"averageDurationMs":58074.53846153846,"averageJudgeScore":87.53846153846153,"averageTokenUsagePerAttempt":{"prompt":125452.76923076923,"completion":2957.769230769231,"total":128410.53846153847},"failedCaseIds":["flow-test4-order-processing-loop","flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":26967,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":103796,"completion":634,"total":104430}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29009,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":75507,"completion":743,"total":76250}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":26828,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":76172,"completion":807,"total":76979}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":44418,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":130440,"completion":1787,"total":132227}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":82185,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":116133,"completion":4905,"total":121038}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":110344,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":122092,"completion":6980,"total":129072}},{"id":"flow-test6-a
i-agent-tools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":119901,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":154916,"completion":8908,"total":163824}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":44333,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":109935,"completion":1536,"total":111471}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":54247,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":136872,"completion":2638,"total":139510}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":63274,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":137794,"completion":3686,"total":141480}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":38813,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":105075,"completion":1157,"total":106232}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":77267,"averageJudgeScore":52,"averageTokenUsagePerAttempt":{"prompt":256547,"completion":3398,"total":259945}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":37383,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":105607,"completion":1272,"total":106879}}]} 
+{"createdAt":"2026-04-10T14:29:52.249Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"flow","runs":1,"runModel":"openai:gpt-4o","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":6,"passRate":0.46153846153846156,"averageDurationMs":29841.53846153846,"averageJudgeScore":68.46153846153847,"averageTokenUsagePerAttempt":{"prompt":72815.92307692308,"completion":770.7692307692307,"total":73586.69230769231},"failedCaseIds":["flow-test5-parallel-data-pipeline","flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler","flow-test12-approval-step"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":20059,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":64091,"completion":265,"total":64356}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":20728,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":46594,"completion":270,"total":46864}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21533,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":46859,"completion":232,"total":47091}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29004,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":64593,"completion":568,"total":65161}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":36250,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":66346,"completion":1259,"total":67605}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":46151,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":104676,"completion":169
8,"total":106374}},{"id":"flow-test6-ai-agent-tools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":72403,"averageJudgeScore":62,"averageTokenUsagePerAttempt":{"prompt":105280,"completion":2216,"total":107496}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":41599,"averageJudgeScore":20,"averageTokenUsagePerAttempt":{"prompt":103053,"completion":707,"total":103760}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":23352,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":97955,"completion":468,"total":98423}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":19341,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":12254,"completion":1057,"total":13311}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":16143,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":64480,"completion":445,"total":64925}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":24231,"averageJudgeScore":52,"averageTokenUsagePerAttempt":{"prompt":106068,"completion":472,"total":106540}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":17146,"averageJudgeScore":30,"averageTokenUsagePerAttempt":{"prompt":64358,"completion":363,"total":64721}}]} diff --git a/ai_evals/history/script.jsonl b/ai_evals/history/script.jsonl new file mode 100644 index 0000000000..779ddeda4c --- /dev/null +++ b/ai_evals/history/script.jsonl @@ -0,0 +1,3 @@ 
+{"createdAt":"2026-04-10T14:23:51.580Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"script","runs":1,"runModel":"anthropic:claude-haiku-4-5-20251001","judgeModel":"claude-sonnet-4-6","caseCount":1,"attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":12112,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":48134,"completion":452,"total":48586},"failedCaseIds":[],"cases":[{"id":"script-test1-greet-user","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":12112,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":48134,"completion":452,"total":48586}}]} +{"createdAt":"2026-04-10T14:24:18.129Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"script","runs":1,"runModel":"anthropic:claude-opus-4-6","judgeModel":"claude-sonnet-4-6","caseCount":1,"attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16595,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":38264,"completion":254,"total":38518},"failedCaseIds":[],"cases":[{"id":"script-test1-greet-user","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16595,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":38264,"completion":254,"total":38518}}]} +{"createdAt":"2026-04-10T14:24:41.534Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"script","runs":1,"runModel":"openai:gpt-4o","judgeModel":"claude-sonnet-4-6","caseCount":1,"attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":13643,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":28961,"completion":137,"total":29098},"failedCaseIds":[],"cases":[{"id":"script-test1-greet-user","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":13643,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":28961,"completion":137,"total":29098}}]} diff --git a/ai_evals/modes/app.ts b/ai_evals/modes/app.ts new file mode 100644 index 0000000000..66bf139c44 --- /dev/null +++ 
b/ai_evals/modes/app.ts @@ -0,0 +1,79 @@ +import { loadAppFixture } from "../adapters/frontend/core/app/appFixtureLoader"; +import type { AppFiles } from "../../frontend/src/lib/components/copilot/chat/app/core"; +import type { FrontendEvalModelConfig } from "../core/models"; +import { validateAppState, type AppFilesState } from "../core/validators"; +import type { BenchmarkArtifactFile, ModeRunner } from "../core/types"; +import { runAppEval } from "../adapters/frontend/core/app/appEvalRunner"; +import { DEFAULT_FRONTEND_EVAL_MODEL, getFrontendApiKey } from "./frontendCommon"; + +export function createAppModeRunner( + modelConfig: FrontendEvalModelConfig = DEFAULT_FRONTEND_EVAL_MODEL +): ModeRunner { + return { + mode: "app", + concurrency: 5, + judgeThreshold: 80, + async loadInitial(path) { + return path ? (await loadAppFixture(path)) : undefined; + }, + async loadExpected(path) { + return path ? (await loadAppFixture(path)) : undefined; + }, + async run(prompt, initial, context) { + const result = await runAppEval(prompt, getFrontendApiKey(modelConfig.provider), { + initialFrontend: initial?.frontend, + initialBackend: initial?.backend as AppFiles["backend"] | undefined, + provider: modelConfig.provider, + model: modelConfig.model, + runContext: context, + }); + + return { + success: result.success, + actual: result.files as AppFilesState, + error: result.error, + assistantMessageCount: result.assistantMessageCount, + toolCallCount: result.toolCallCount, + toolsUsed: result.toolsUsed, + skillsInvoked: [], + tokenUsage: result.tokenUsage, + }; + }, + validate({ actual, initial, expected }) { + return validateAppState({ actual, initial, expected }); + }, + buildArtifacts(actual): BenchmarkArtifactFile[] { + const artifacts: BenchmarkArtifactFile[] = [ + { + path: "app.json", + content: JSON.stringify(actual, null, 2) + "\n", + }, + ]; + + for (const [filePath, content] of Object.entries(actual.frontend)) { + artifacts.push({ + path: 
`frontend${filePath.startsWith("/") ? filePath : `/${filePath}`}`, + content, + }); + } + + for (const [key, runnable] of Object.entries(actual.backend)) { + artifacts.push({ + path: `backend/${key}/meta.json`, + content: JSON.stringify(runnable, null, 2) + "\n", + }); + + const inlineContent = runnable.inlineScript?.content; + if (inlineContent) { + const extension = runnable.inlineScript?.language === "python3" ? "py" : "ts"; + artifacts.push({ + path: `backend/${key}/main.${extension}`, + content: inlineContent, + }); + } + } + + return artifacts; + }, + }; +} diff --git a/ai_evals/modes/cli.ts b/ai_evals/modes/cli.ts new file mode 100644 index 0000000000..718983f2c3 --- /dev/null +++ b/ai_evals/modes/cli.ts @@ -0,0 +1,162 @@ +import { mkdtemp, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import path from "node:path"; +import { join } from "node:path"; +import { readFile } from "node:fs/promises"; +import { writeAiGuidanceFiles } from "../../cli/src/guidance/writer.ts"; +import type { CliEvalModelConfig } from "../core/models"; +import { + DEFAULT_CLI_EVAL_MODEL, + formatCliRunModelLabel, + getGeneratedSkillsSource, + runPromptAndCapture, +} from "../adapters/cli/runtime"; +import { copyDirectory, readDirectoryFiles } from "../core/files"; +import { validateCliWorkspace } from "../core/validators"; +import type { BenchmarkArtifactFile, ModeRunner } from "../core/types"; + +const IGNORE_WORKSPACE_FILES = new Set([".claude", "AGENTS.md", "CLAUDE.md", "rt.d.ts"]); + +interface CliWorkspaceFixture { + sourceDir: string; + files: Record; +} + +interface CliRunActual { + assistantOutput: string; + workspaceFiles: Record; +} + +const CLAUDE_PROJECT_PREAMBLE = [ + "Follow the project instructions from AGENTS.md exactly.", + "Before creating or modifying any Windmill entity, you MUST invoke the relevant Skill tool and follow it.", + "Use the skill guidance for file layout, implementation details, and the exact next commands to tell the 
user.", + "Do not skip the Skill step.", + "You are running inside an automated benchmark harness, not an interactive user session.", + "Act autonomously and complete the requested file changes directly in the workspace.", + "Do not ask for confirmation, do not ask the user to save or create files manually, and do not wait for approval.", + "Do not respond with a plan when you can make the change directly.", + "Only describe what was done after you have written the files.", +].join(" "); + +export function createCliModeRunner( + modelConfig: CliEvalModelConfig = DEFAULT_CLI_EVAL_MODEL +): ModeRunner { + return { + mode: "cli", + concurrency: 1, + judgeThreshold: 80, + async loadInitial(path) { + return path + ? { + sourceDir: path, + files: await readDirectoryFiles(path), + } + : undefined; + }, + async loadExpected(path) { + return path + ? { + sourceDir: path, + files: await readDirectoryFiles(path), + } + : undefined; + }, + async run(prompt, initial, _context) { + const workspaceDir = await mkdtemp(join(tmpdir(), "wmill-cli-benchmark-")); + + try { + if (initial) { + await copyDirectory(initial.sourceDir, workspaceDir); + } + await writeAiGuidanceFiles({ + targetDir: workspaceDir, + nonDottedPaths: true, + overwriteProjectGuidance: true, + skillsSourcePath: getGeneratedSkillsSource(), + }); + await writeFile(join(workspaceDir, "rt.d.ts"), "export namespace RT {}\n", "utf8"); + + const renderedPrompt = await renderPrompt(prompt, workspaceDir); + const run = await runPromptAndCapture(renderedPrompt, workspaceDir, 6, modelConfig); + const workspaceFiles = await readDirectoryFiles(workspaceDir, { ignore: IGNORE_WORKSPACE_FILES }); + + return { + success: true, + actual: { + assistantOutput: run.output, + workspaceFiles, + }, + assistantMessageCount: run.assistantMessageCount, + toolCallCount: run.toolsUsed.length, + toolsUsed: run.toolsUsed.map((entry) => entry.tool), + skillsInvoked: run.skillsInvoked, + tokenUsage: run.tokenUsage ?? 
null, + }; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + return { + success: false, + actual: { + assistantOutput: "", + workspaceFiles: {}, + }, + error: message, + assistantMessageCount: 0, + toolCallCount: 0, + toolsUsed: [], + skillsInvoked: [], + tokenUsage: null, + }; + } finally { + await rm(workspaceDir, { recursive: true, force: true }); + } + }, + validate({ actual, initial, expected }) { + return validateCliWorkspace({ + actualFiles: actual.workspaceFiles, + expectedFiles: expected?.files, + initialFiles: initial?.files, + }); + }, + buildArtifacts(actual): BenchmarkArtifactFile[] { + const artifacts: BenchmarkArtifactFile[] = [ + { + path: "assistant-output.txt", + content: `${actual.assistantOutput}\n`, + }, + ]; + + for (const [filePath, content] of Object.entries(actual.workspaceFiles)) { + artifacts.push({ + path: filePath, + content, + }); + } + + return artifacts; + }, + }; +} + +export function getCliRunModelLabel( + modelConfig: CliEvalModelConfig = DEFAULT_CLI_EVAL_MODEL +): string { + return formatCliRunModelLabel(modelConfig); +} + +async function renderPrompt(prompt: string, workspaceDir: string): Promise { + const renderedUserPrompt = prompt.replaceAll("{{workspace_root}}", workspaceDir); + const agentsInstructions = await readFile(path.join(workspaceDir, "AGENTS.md"), "utf8"); + + return [ + "# Project Instructions", + agentsInstructions.trim(), + "", + "# Benchmark Harness", + CLAUDE_PROJECT_PREAMBLE, + "", + "# User Request", + renderedUserPrompt, + ].join("\n"); +} diff --git a/ai_evals/modes/flow.ts b/ai_evals/modes/flow.ts new file mode 100644 index 0000000000..36dee80658 --- /dev/null +++ b/ai_evals/modes/flow.ts @@ -0,0 +1,104 @@ +import { readJsonFile } from "../core/files"; +import type { FrontendEvalModelConfig } from "../core/models"; +import { validateFlowState, type FlowState } from "../core/validators"; +import type { BenchmarkArtifactFile, ModeRunner } from "../core/types"; 
+import { + runFlowEval, + type FlowFixture, +} from "../adapters/frontend/core/flow/flowEvalRunner"; +import type { FlowWorkspaceFixtures } from "../adapters/frontend/core/flow/fileHelpers"; +import { DEFAULT_FRONTEND_EVAL_MODEL, getFrontendApiKey } from "./frontendCommon"; + +interface FlowInitialFixture { + flow?: FlowFixture; + workspace?: FlowWorkspaceFixtures; +} + +export function createFlowModeRunner( + modelConfig: FrontendEvalModelConfig = DEFAULT_FRONTEND_EVAL_MODEL +): ModeRunner { + return { + mode: "flow", + concurrency: 5, + judgeThreshold: 80, + async loadInitial(path) { + if (!path) { + return undefined; + } + return normalizeFlowInitialFixture(await readJsonFile(path)); + }, + async loadExpected(path) { + if (!path) { + return undefined; + } + return normalizeFlowStateFixture(await readJsonFile(path)); + }, + async run(prompt, initial, context) { + const result = await runFlowEval(prompt, getFrontendApiKey(modelConfig.provider), { + initialFlow: initial?.flow, + workspaceFixtures: initial?.workspace, + provider: modelConfig.provider, + model: modelConfig.model, + runContext: context, + }); + + return { + success: result.success, + actual: normalizeFlowStateFixture(result.flow), + error: result.error, + assistantMessageCount: result.assistantMessageCount, + toolCallCount: result.toolCallCount, + toolsUsed: result.toolsUsed, + skillsInvoked: [], + tokenUsage: result.tokenUsage, + }; + }, + validate({ evalCase, actual, initial, expected }) { + return validateFlowState({ + actual, + initial: initial?.flow, + expected, + validate: evalCase.validate, + }); + }, + buildArtifacts(actual): BenchmarkArtifactFile[] { + return [ + { + path: "flow.json", + content: JSON.stringify(actual, null, 2) + "\n", + }, + ]; + }, + }; +} + +function normalizeFlowInitialFixture(value: unknown): FlowInitialFixture { + if (isObject(value) && ("flow" in value || "workspace" in value)) { + const fixture = value as { + flow?: FlowFixture; + workspace?: FlowWorkspaceFixtures; + 
}; + return { + flow: fixture.flow, + workspace: fixture.workspace, + }; + } + + return { + flow: normalizeFlowStateFixture(value), + }; +} + +function normalizeFlowStateFixture(value: unknown): FlowState { + if (!isObject(value)) { + return {}; + } + if ("flow" in value && isObject((value as { flow?: unknown }).flow)) { + return (value as { flow: FlowState }).flow; + } + return value as FlowState; +} + +function isObject(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} diff --git a/ai_evals/modes/frontendCommon.test.ts b/ai_evals/modes/frontendCommon.test.ts new file mode 100644 index 0000000000..cac10ffcab --- /dev/null +++ b/ai_evals/modes/frontendCommon.test.ts @@ -0,0 +1,28 @@ +import { afterEach, describe, expect, it } from "bun:test"; +import { getFrontendApiKey } from "./frontendCommon"; + +const ORIGINAL_ENV = { + ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY, + OPENAI_API_KEY: process.env.OPENAI_API_KEY, + GEMINI_API_KEY: process.env.GEMINI_API_KEY, +}; + +afterEach(() => { + process.env.ANTHROPIC_API_KEY = ORIGINAL_ENV.ANTHROPIC_API_KEY; + process.env.OPENAI_API_KEY = ORIGINAL_ENV.OPENAI_API_KEY; + process.env.GEMINI_API_KEY = ORIGINAL_ENV.GEMINI_API_KEY; +}); + +describe("getFrontendApiKey", () => { + it("reads the Gemini API key for googleai models", () => { + process.env.GEMINI_API_KEY = "gemini-test-key"; + expect(getFrontendApiKey("googleai")).toBe("gemini-test-key"); + }); + + it("throws a provider-specific error when the key is missing", () => { + delete process.env.GEMINI_API_KEY; + expect(() => getFrontendApiKey("googleai")).toThrow( + "GEMINI_API_KEY is required for frontend evals" + ); + }); +}); diff --git a/ai_evals/modes/frontendCommon.ts b/ai_evals/modes/frontendCommon.ts new file mode 100644 index 0000000000..2619d21821 --- /dev/null +++ b/ai_evals/modes/frontendCommon.ts @@ -0,0 +1,23 @@ +import { + getFrontendEvalModel, + resolveEvalModel, + type 
FrontendEvalModelConfig, +} from "../core/models"; + +export const DEFAULT_FRONTEND_EVAL_MODEL: FrontendEvalModelConfig = getFrontendEvalModel( + resolveEvalModel("flow") +); + +export function getFrontendApiKey(provider: FrontendEvalModelConfig["provider"]): string { + const envName = + provider === "anthropic" + ? "ANTHROPIC_API_KEY" + : provider === "googleai" + ? "GEMINI_API_KEY" + : "OPENAI_API_KEY"; + const apiKey = process.env[envName]; + if (!apiKey) { + throw new Error(`${envName} is required for frontend evals`); + } + return apiKey; +} diff --git a/ai_evals/modes/script.ts b/ai_evals/modes/script.ts new file mode 100644 index 0000000000..f3ab232cc3 --- /dev/null +++ b/ai_evals/modes/script.ts @@ -0,0 +1,61 @@ +import { readJsonFile } from "../core/files"; +import type { FrontendEvalModelConfig } from "../core/models"; +import { validateScriptState } from "../core/validators"; +import type { BenchmarkArtifactFile, ModeRunner } from "../core/types"; +import { runScriptEval } from "../adapters/frontend/core/script/scriptEvalRunner"; +import type { ScriptEvalState } from "../adapters/frontend/core/script/fileHelpers"; +import { DEFAULT_FRONTEND_EVAL_MODEL, getFrontendApiKey } from "./frontendCommon"; + +export function createScriptModeRunner( + modelConfig: FrontendEvalModelConfig = DEFAULT_FRONTEND_EVAL_MODEL +): ModeRunner { + return { + mode: "script", + concurrency: 5, + judgeThreshold: 80, + async loadInitial(path) { + return path ? await readJsonFile(path) : undefined; + }, + async loadExpected(path) { + return path ? 
await readJsonFile(path) : undefined; + }, + async run(prompt, initial, context) { + if (!initial) { + throw new Error("Script evals require an initial script fixture"); + } + + const result = await runScriptEval(prompt, getFrontendApiKey(modelConfig.provider), { + initialScript: initial, + provider: modelConfig.provider, + model: modelConfig.model, + runContext: context, + }); + + return { + success: result.success, + actual: result.script, + error: result.error, + assistantMessageCount: result.assistantMessageCount, + toolCallCount: result.toolCallCount, + toolsUsed: result.toolsUsed, + skillsInvoked: [], + tokenUsage: result.tokenUsage, + }; + }, + validate({ actual, initial, expected }) { + return validateScriptState({ actual, initial, expected }); + }, + buildArtifacts(actual): BenchmarkArtifactFile[] { + return [ + { + path: "script.json", + content: JSON.stringify(actual, null, 2) + "\n", + }, + { + path: actual.path, + content: actual.code, + }, + ]; + }, + }; +} diff --git a/ai_evals/package.json b/ai_evals/package.json new file mode 100644 index 0000000000..569562ad3c --- /dev/null +++ b/ai_evals/package.json @@ -0,0 +1,19 @@ +{ + "name": "windmill-ai-evals", + "private": true, + "type": "module", + "scripts": { + "cli": "bun cli/index.ts" + }, + "dependencies": { + "@anthropic-ai/claude-agent-sdk": "^0.2.25", + "@anthropic-ai/sdk": "^0.39.0", + "commander": "^14.0.3", + "openai": "^6.9.1", + "yaml": "^2.8.3" + }, + "devDependencies": { + "@types/bun": "latest", + "typescript": "^5.0.0" + } +} diff --git a/cli/README.md b/cli/README.md index ee3b68c35f..478aa2a936 100644 --- a/cli/README.md +++ b/cli/README.md @@ -110,6 +110,28 @@ source <(wmill completions zsh) ## Development +### AI Guidance Variants + +`wmill init` can now materialize alternate AI guidance bundles without changing +the generated defaults in the repo, but this is exposed as internal env-var +overrides rather than public CLI flags. 
+ +Examples: + +```bash +WMILL_INIT_AI_SKILLS_SOURCE=/path/to/custom/skills wmill init --use-default +WMILL_INIT_AI_SKILLS_SOURCE=/path/to/custom/skills WMILL_INIT_AI_AGENTS_SOURCE=/path/to/AGENTS.md wmill init --use-default +WMILL_INIT_AI_SKILLS_SOURCE=/path/to/custom/skills WMILL_INIT_AI_CLAUDE_SOURCE=/path/to/CLAUDE.md wmill init --use-default +``` + +This is the same guidance-writing path used by the benchmark CLI under +`ai_evals/`, so the benchmark harness and `wmill init` now generate the same +project guidance shape: + +- `AGENTS.md` +- `CLAUDE.md` +- `.claude/skills/*` + ### Testing with a local `windmill-yaml-validator` To test local changes to the validator before publishing, use `npm link`: diff --git a/cli/TESTING.md b/cli/TESTING.md index 542baab368..235a95c4b1 100644 --- a/cli/TESTING.md +++ b/cli/TESTING.md @@ -29,6 +29,20 @@ binary and starts a shared backend instance. Examples: `sync_pull_push`, `dev_server`, `standalone_commands` +## AI Benchmark Caveats + +The repo-level benchmark CLI lives under `ai_evals/`, but it currently depends on +mocked frontend flow execution in a few places. Treat `flow` benchmark passes as +artifact-shape signal, not full runtime correctness, when either of these apply: + +- deterministic flow validation does not currently reject syntactically invalid + `rawscript` module bodies +- frontend benchmark calls to `test_run_flow` and `test_run_step` return mocked + completed jobs for `mock-job-id-*` workspaces instead of executing the flow + +If a prompt change depends on flow wiring or script runtime behavior, verify it +with additional validation or a real run before trusting the benchmark result. 
+ ## Environment Variables | Variable | Purpose | Default | diff --git a/cli/src/commands/init/init.ts b/cli/src/commands/init/init.ts index 345eda8e77..519f572394 100644 --- a/cli/src/commands/init/init.ts +++ b/cli/src/commands/init/init.ts @@ -1,4 +1,4 @@ -import { stat, writeFile, rm, mkdir } from "node:fs/promises"; +import { stat, writeFile, rm } from "node:fs/promises"; import { colors } from "@cliffy/ansi/colors"; import { Command } from "@cliffy/command"; import { Confirm } from "@cliffy/prompt/confirm"; @@ -16,23 +16,14 @@ import { type Workspace, } from "../workspace/workspace.ts"; import { generateRTNamespace } from "../resource-type/resource-type.ts"; -import { SKILLS, SKILL_CONTENT, SCHEMAS, SCHEMA_MAPPINGS } from "../../guidance/skills.ts"; -import { generateAgentsMdContent } from "../../guidance/core.ts"; +import { + WMILL_INIT_AI_AGENTS_SOURCE_ENV, + WMILL_INIT_AI_CLAUDE_SOURCE_ENV, + WMILL_INIT_AI_SKILLS_SOURCE_ENV, + writeAiGuidanceFiles, +} from "../../guidance/writer.ts"; import { generateCommentedTemplate } from "./template.ts"; -/** - * Format a YAML schema for inclusion in skill markdown files. 
- */ -function formatSchemaForMarkdown(schemaYaml: string, schemaName: string, filePattern: string): string { - return `## ${schemaName} (\`${filePattern}\`) - -Must be a YAML file that adheres to the following schema: - -\`\`\`yaml -${schemaYaml.trim()} -\`\`\``; -} - export interface InitOptions { useDefault?: boolean; useBackend?: boolean; @@ -255,88 +246,24 @@ async function initAction(opts: InitOptions) { // Create guidance files (AGENTS.md, CLAUDE.md, and Claude skills) try { - // Generate skills reference section for AGENTS.md - const skills_base_dir = ".claude/skills"; - const skillsReference = SKILLS.map( - (s) => `- \`${skills_base_dir}/${s.name}/SKILL.md\` - ${s.description}` - ).join("\n"); + const guidanceResult = await writeAiGuidanceFiles({ + targetDir: ".", + nonDottedPaths, + overwriteProjectGuidance: false, + skillsSourcePath: process.env[WMILL_INIT_AI_SKILLS_SOURCE_ENV], + agentsSourcePath: process.env[WMILL_INIT_AI_AGENTS_SOURCE_ENV], + claudeSourcePath: process.env[WMILL_INIT_AI_CLAUDE_SOURCE_ENV], + }); - // Create AGENTS.md file with minimal instructions - if (!(await stat("AGENTS.md").catch(() => null))) { - await writeFile( - "AGENTS.md", - generateAgentsMdContent(skillsReference), "utf-8" - ); + if (guidanceResult.agentsWritten) { log.info(colors.green("Created AGENTS.md")); } - - // Create CLAUDE.md file, referencing AGENTS.md - if (!(await stat("CLAUDE.md").catch(() => null))) { - await writeFile( - "CLAUDE.md", - `Instructions are in @AGENTS.md -`, "utf-8" - ); + if (guidanceResult.claudeWritten) { log.info(colors.green("Created CLAUDE.md")); } - - // Create .claude/skills/ directory and skill files - try { - await mkdir(".claude/skills", { recursive: true }); - - await Promise.all( - SKILLS.map(async (skill) => { - const skillDir = `.claude/skills/${skill.name}`; - await mkdir(skillDir, { recursive: true }); - - let skillContent = SKILL_CONTENT[skill.name]; - if (skillContent) { - // Replace placeholders with actual suffixes based on 
nonDottedPaths - if (nonDottedPaths) { - skillContent = skillContent - .replaceAll("{{FLOW_SUFFIX}}", "__flow") - .replaceAll("{{APP_SUFFIX}}", "__app") - .replaceAll("{{RAW_APP_SUFFIX}}", "__raw_app") - .replaceAll("{{INLINE_SCRIPT_NAMING}}", "Inline script files should NOT include `.inline_script.` in their names (e.g. use `a.ts`, not `a.inline_script.ts`)."); - } else { - skillContent = skillContent - .replaceAll("{{FLOW_SUFFIX}}", ".flow") - .replaceAll("{{APP_SUFFIX}}", ".app") - .replaceAll("{{RAW_APP_SUFFIX}}", ".raw_app") - .replaceAll("{{INLINE_SCRIPT_NAMING}}", "Inline script files use the `.inline_script.` naming convention (e.g. `a.inline_script.ts`)."); - } - // Check if this skill has schemas that need to be appended - const schemaMappings = SCHEMA_MAPPINGS[skill.name]; - if (schemaMappings && schemaMappings.length > 0) { - // Combine base content with schemas - const schemaDocs = schemaMappings - .map((mapping) => { - const schemaYaml = SCHEMAS[mapping.schemaKey]; - if (schemaYaml) { - return formatSchemaForMarkdown(schemaYaml, mapping.name, mapping.filePattern); - } - return null; - }) - .filter((doc): doc is string => doc !== null); - - if (schemaDocs.length > 0) { - skillContent = skillContent + "\n\n" + schemaDocs.join("\n\n"); - } - } - - await writeFile(`${skillDir}/SKILL.md`, skillContent, "utf-8"); - } - }) - ); - - log.info(colors.green(`Created .claude/skills/ with ${SKILLS.length} skills`)); - } catch (skillError) { - if (skillError instanceof Error) { - log.warn(`Could not create skills: ${skillError.message}`); - } else { - log.warn(`Could not create skills: ${skillError}`); - } - } + log.info( + colors.green(`Created .claude/skills/ with ${guidanceResult.skillCount} skills`) + ); } catch (error) { if (error instanceof Error) { log.warn(`Could not create guidance files: ${error.message}`); diff --git a/cli/src/guidance/writer.ts b/cli/src/guidance/writer.ts new file mode 100644 index 0000000000..0acab52784 --- /dev/null +++ 
b/cli/src/guidance/writer.ts @@ -0,0 +1,269 @@ +import { cp, mkdir, readdir, readFile, stat, writeFile } from "node:fs/promises"; +import { join } from "node:path"; +import { generateAgentsMdContent } from "./core.ts"; +import { + SCHEMAS, + SCHEMA_MAPPINGS, + SKILLS, + SKILL_CONTENT, + type SkillMetadata, +} from "./skills.ts"; + +type ResolvedSkillMetadata = SkillMetadata & { + directoryName: string; +}; + +export interface WriteAiGuidanceOptions { + targetDir: string; + nonDottedPaths?: boolean; + overwriteProjectGuidance?: boolean; + skillsSourcePath?: string; + agentsSourcePath?: string; + claudeSourcePath?: string; +} + +export interface WriteAiGuidanceResult { + agentsWritten: boolean; + claudeWritten: boolean; + skillCount: number; +} + +export const WMILL_INIT_AI_SKILLS_SOURCE_ENV = "WMILL_INIT_AI_SKILLS_SOURCE"; +export const WMILL_INIT_AI_AGENTS_SOURCE_ENV = "WMILL_INIT_AI_AGENTS_SOURCE"; +export const WMILL_INIT_AI_CLAUDE_SOURCE_ENV = "WMILL_INIT_AI_CLAUDE_SOURCE"; + +const CLAUDE_MD_DEFAULT = "Instructions are in @AGENTS.md\n"; + +export async function writeAiGuidanceFiles( + options: WriteAiGuidanceOptions +): Promise { + const nonDottedPaths = options.nonDottedPaths ?? true; + const skillMetadata = options.skillsSourcePath + ? await readSkillMetadataFromDirectory(options.skillsSourcePath) + : getGeneratedSkillMetadata(); + + const agentsWritten = await writeProjectGuidanceFile({ + targetPath: join(options.targetDir, "AGENTS.md"), + overwrite: options.overwriteProjectGuidance ?? false, + content: + options.agentsSourcePath != null + ? await readFile(options.agentsSourcePath, "utf8") + : generateAgentsMdContent(buildSkillsReference(skillMetadata)), + }); + + const claudeWritten = await writeProjectGuidanceFile({ + targetPath: join(options.targetDir, "CLAUDE.md"), + overwrite: options.overwriteProjectGuidance ?? false, + content: + options.claudeSourcePath != null + ? 
await readFile(options.claudeSourcePath, "utf8") + : CLAUDE_MD_DEFAULT, + }); + + if (options.skillsSourcePath) { + await copySkillsFromSource(options.targetDir, options.skillsSourcePath); + } else { + await writeGeneratedSkills(options.targetDir, nonDottedPaths); + } + + return { + agentsWritten, + claudeWritten, + skillCount: skillMetadata.length, + }; +} + +function buildSkillsReference( + skills: Pick[] +): string { + return skills + .map((skill) => `- \`.claude/skills/${skill.directoryName}/SKILL.md\` - ${skill.description}`) + .join("\n"); +} + +async function copySkillsFromSource( + targetDir: string, + skillsSourcePath: string +): Promise { + const skillsDir = await ensureSkillsDirectory(targetDir); + await copyDirectoryContents(skillsSourcePath, skillsDir); + return await readSkillMetadataFromDirectory(skillsDir); +} + +async function writeGeneratedSkills( + targetDir: string, + nonDottedPaths: boolean +): Promise { + const skillsDir = await ensureSkillsDirectory(targetDir); + + await Promise.all( + SKILLS.map(async (skill) => { + const skillDir = join(skillsDir, skill.name); + await mkdir(skillDir, { recursive: true }); + await writeFile( + join(skillDir, "SKILL.md"), + renderGeneratedSkillContent(skill.name, nonDottedPaths), + "utf8" + ); + }) + ); + + return SKILLS.map((skill) => ({ + ...skill, + directoryName: skill.name, + })); +} + +function getGeneratedSkillMetadata(): ResolvedSkillMetadata[] { + return SKILLS.map((skill) => ({ + ...skill, + directoryName: skill.name, + })); +} + +async function ensureSkillsDirectory(targetDir: string): Promise { + const skillsDir = join(targetDir, ".claude", "skills"); + await mkdir(skillsDir, { recursive: true }); + return skillsDir; +} + +async function copyDirectoryContents(sourceDir: string, targetDir: string): Promise { + const entries = await readdir(sourceDir, { withFileTypes: true }); + + await Promise.all( + entries.map(async (entry) => { + await cp(join(sourceDir, entry.name), join(targetDir, entry.name), 
{ + recursive: true, + force: true, + }); + }) + ); +} + +function renderGeneratedSkillContent(skillName: string, nonDottedPaths: boolean): string { + let skillContent = SKILL_CONTENT[skillName]; + if (!skillContent) { + throw new Error(`Missing generated skill content for ${skillName}`); + } + + if (nonDottedPaths) { + skillContent = skillContent + .replaceAll("{{FLOW_SUFFIX}}", "__flow") + .replaceAll("{{APP_SUFFIX}}", "__app") + .replaceAll("{{RAW_APP_SUFFIX}}", "__raw_app") + .replaceAll( + "{{INLINE_SCRIPT_NAMING}}", + "Inline script files should NOT include `.inline_script.` in their names (e.g. use `a.ts`, not `a.inline_script.ts`)." + ); + } else { + skillContent = skillContent + .replaceAll("{{FLOW_SUFFIX}}", ".flow") + .replaceAll("{{APP_SUFFIX}}", ".app") + .replaceAll("{{RAW_APP_SUFFIX}}", ".raw_app") + .replaceAll( + "{{INLINE_SCRIPT_NAMING}}", + "Inline script files use the `.inline_script.` naming convention (e.g. `a.inline_script.ts`)." + ); + } + + const schemaMappings = SCHEMA_MAPPINGS[skillName]; + if (!schemaMappings || schemaMappings.length === 0) { + return skillContent; + } + + const schemaDocs = schemaMappings + .map((mapping) => { + const schemaYaml = SCHEMAS[mapping.schemaKey]; + if (!schemaYaml) { + return null; + } + return formatSchemaForMarkdown(schemaYaml, mapping.name, mapping.filePattern); + }) + .filter((entry): entry is string => entry !== null); + + if (schemaDocs.length === 0) { + return skillContent; + } + + return `${skillContent}\n\n${schemaDocs.join("\n\n")}`; +} + +async function readSkillMetadataFromDirectory(skillsDir: string): Promise { + const entries = await readdir(skillsDir, { withFileTypes: true }); + const skills: ResolvedSkillMetadata[] = []; + + for (const entry of entries.sort((left, right) => left.name.localeCompare(right.name))) { + if (!entry.isDirectory()) { + continue; + } + + const skillPath = join(skillsDir, entry.name, "SKILL.md"); + if (!(await stat(skillPath).catch(() => null))) { + continue; + } + + 
const content = await readFile(skillPath, "utf8"); + skills.push(parseSkillMetadata(content, entry.name)); + } + + return skills; +} + +function parseSkillMetadata(content: string, fallbackName: string): ResolvedSkillMetadata { + const frontMatterMatch = content.match(/^---\s*\n([\s\S]*?)\n---/); + if (!frontMatterMatch) { + return { + name: fallbackName, + description: `Skill loaded from ${fallbackName}`, + directoryName: fallbackName, + }; + } + + let name = fallbackName; + let description = `Skill loaded from ${fallbackName}`; + + for (const line of frontMatterMatch[1].split("\n")) { + const separatorIndex = line.indexOf(":"); + if (separatorIndex === -1) { + continue; + } + + const key = line.slice(0, separatorIndex).trim(); + const value = line.slice(separatorIndex + 1).trim(); + + if (key === "name" && value) { + name = value; + } else if (key === "description" && value) { + description = value; + } + } + + return { name, description, directoryName: fallbackName }; +} + +async function writeProjectGuidanceFile(options: { + targetPath: string; + content: string; + overwrite: boolean; +}): Promise { + if (!options.overwrite && (await stat(options.targetPath).catch(() => null))) { + return false; + } + + await writeFile(options.targetPath, options.content, "utf8"); + return true; +} + +function formatSchemaForMarkdown( + schemaYaml: string, + schemaName: string, + filePattern: string +): string { + return `## ${schemaName} (\`${filePattern}\`) + +Must be a YAML file that adheres to the following schema: + +\`\`\`yaml +${schemaYaml.trim()} +\`\`\``; +} diff --git a/cli/test-skills/README.md b/cli/test-skills/README.md deleted file mode 100644 index 7ca6599215..0000000000 --- a/cli/test-skills/README.md +++ /dev/null @@ -1,103 +0,0 @@ -# Windmill Skill Invocation Tests - -Test suite for verifying that Claude Code correctly invokes Windmill auto-generated skills based on user prompts. 
- -## Overview - -This framework tests skill invocation behavior by sending prompts through the Claude Agent SDK and verifying that the expected skills are invoked. Users must provide their own `.claude/skills` folder containing auto-generated Windmill skills. - -## Prerequisites - -- [Bun](https://bun.sh/) installed -- `ANTHROPIC_API_KEY` environment variable set -- Auto-generated Windmill skills placed in `.claude/skills/` - -## User Setup - -1. Create a `test-folder` directory inside `cli/test-skills/` and copy your auto-generated Windmill skills into it: - -``` -cli/test-skills/ -└── test-folder/ - └── .claude/ - └── skills/ - ├── write-flow/ - │ └── SKILL.md - ├── write-script-python3/ - │ └── SKILL.md - ├── write-script-bun/ - │ └── SKILL.md - ├── schedules/ - │ └── SKILL.md - └── triggers/ - └── SKILL.md -``` - -2. Set your API key: -```bash -export ANTHROPIC_API_KEY=your-key-here -``` - -3. Install dependencies and run tests: -```bash -cd cli/test-skills -bun install -bun test -``` - -## Expected Skills - -The tests expect the following auto-generated skills to be present: - -| Skill Name | Purpose | -|------------|---------| -| `write-flow` | Creating Windmill flows/workflows | -| `write-script-python3` | Creating Python scripts | -| `write-script-bun` | Creating TypeScript/Bun scripts | -| `schedules` | Configuring schedules and cron jobs | -| `triggers` | Setting up triggers (webhook, Kafka, etc.) 
| - -## Test Matrix - -| Prompt | Expected Skill | -|--------|----------------| -| "Create a flow to process user data" | `write-flow` | -| "Build a workflow that fetches and transforms data" | `write-flow` | -| "Write a Python script to fetch API data" | `write-script-python3` | -| "Create a Python function to process CSV files" | `write-script-python3` | -| "Write a TypeScript script using Bun" | `write-script-bun` | -| "Create a Bun script to handle webhooks" | `write-script-bun` | -| "Set up a schedule to run this daily at midnight" | `schedules` | -| "Configure a cron job to run every hour" | `schedules` | -| "Set up a webhook trigger for this flow" | `triggers` | -| "Configure a Kafka trigger" | `triggers` | - -## Running Tests - -Run all tests: -```bash -bun test -``` - -Run only skill invocation tests: -```bash -bun test:skills -``` - -## Test Utilities - -The `src/test-utils.ts` module provides: - -- `runPromptAndCapture(prompt, cwd?, maxTurns)` - Runs a prompt and captures tool invocations -- `wasToolUsed(result, toolName)` - Checks if a specific tool was used -- `wasSkillInvoked(result, skillName)` - Checks if a specific skill was invoked -- `getToolInputs(result, toolName)` - Gets all inputs for a specific tool -- `getTestSkillsDir()` - Returns the test-skills directory path - -## Notes - -- Tests have extended timeouts (120 seconds) due to API latency -- Tests run against the actual Claude API, so they consume API credits -- Tests verify skill invocation, not skill execution -- The working directory for tests is `test-folder/` (where `.claude/skills` should be placed) -- Tests will fail with a clear error if `test-folder/` or `test-folder/.claude/skills/` don't exist diff --git a/cli/test-skills/bun.lock b/cli/test-skills/bun.lock deleted file mode 100644 index 9e4443cca8..0000000000 --- a/cli/test-skills/bun.lock +++ /dev/null @@ -1,61 +0,0 @@ -{ - "lockfileVersion": 1, - "configVersion": 1, - "workspaces": { - "": { - "name": 
"claude-code-skill-tests", - "dependencies": { - "@anthropic-ai/claude-agent-sdk": "^0.2.25", - }, - "devDependencies": { - "@types/bun": "latest", - "typescript": "^5.0.0", - }, - }, - }, - "packages": { - "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.25", "", { "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.33.5", "@img/sharp-darwin-x64": "^0.33.5", "@img/sharp-linux-arm": "^0.33.5", "@img/sharp-linux-arm64": "^0.33.5", "@img/sharp-linux-x64": "^0.33.5", "@img/sharp-linuxmusl-arm64": "^0.33.5", "@img/sharp-linuxmusl-x64": "^0.33.5", "@img/sharp-win32-x64": "^0.33.5" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-YIP3I40+XSkC3zE1Z8KRQY02VA7UfofFamF1cFrLe7FbtCnjpslyDl9coGBh2DAi9xj2yQcKZZf751jEWpB+dQ=="], - - "@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.0.4" }, "os": "darwin", "cpu": "arm64" }, "sha512-UT4p+iz/2H4twwAoLCqfA9UH5pI6DggwKEGuaPy7nCVQ8ZsiY5PIcrRvD1DzuY3qYL07NtIQcWnBSY/heikIFQ=="], - - "@img/sharp-darwin-x64": ["@img/sharp-darwin-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-x64": "1.0.4" }, "os": "darwin", "cpu": "x64" }, "sha512-fyHac4jIc1ANYGRDxtiqelIbdWkIuQaI84Mv45KvGRRxSAa7o7d1ZKAOBaYbnepLC1WqxfpimdeWfvqqSGwR2Q=="], - - "@img/sharp-libvips-darwin-arm64": ["@img/sharp-libvips-darwin-arm64@1.0.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-XblONe153h0O2zuFfTAbQYAX2JhYmDHeWikp1LM9Hul9gVPjFY427k6dFEcOL72O01QxQsWi761svJ/ev9xEDg=="], - - "@img/sharp-libvips-darwin-x64": ["@img/sharp-libvips-darwin-x64@1.0.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-xnGR8YuZYfJGmWPvmlunFaWJsb9T/AO2ykoP3Fz/0X5XV2aoYBPkX6xqCQvUTKKiLddarLaxpzNe+b1hjeWHAQ=="], - - "@img/sharp-libvips-linux-arm": ["@img/sharp-libvips-linux-arm@1.0.5", "", { "os": "linux", "cpu": "arm" }, "sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g=="], - - "@img/sharp-libvips-linux-arm64": 
["@img/sharp-libvips-linux-arm64@1.0.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA=="], - - "@img/sharp-libvips-linux-x64": ["@img/sharp-libvips-linux-x64@1.0.4", "", { "os": "linux", "cpu": "x64" }, "sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw=="], - - "@img/sharp-libvips-linuxmusl-arm64": ["@img/sharp-libvips-linuxmusl-arm64@1.0.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA=="], - - "@img/sharp-libvips-linuxmusl-x64": ["@img/sharp-libvips-linuxmusl-x64@1.0.4", "", { "os": "linux", "cpu": "x64" }, "sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw=="], - - "@img/sharp-linux-arm": ["@img/sharp-linux-arm@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm": "1.0.5" }, "os": "linux", "cpu": "arm" }, "sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ=="], - - "@img/sharp-linux-arm64": ["@img/sharp-linux-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm64": "1.0.4" }, "os": "linux", "cpu": "arm64" }, "sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA=="], - - "@img/sharp-linux-x64": ["@img/sharp-linux-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-x64": "1.0.4" }, "os": "linux", "cpu": "x64" }, "sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA=="], - - "@img/sharp-linuxmusl-arm64": ["@img/sharp-linuxmusl-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-arm64": "1.0.4" }, "os": "linux", "cpu": "arm64" }, "sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g=="], - - "@img/sharp-linuxmusl-x64": ["@img/sharp-linuxmusl-x64@0.33.5", 
"", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-x64": "1.0.4" }, "os": "linux", "cpu": "x64" }, "sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw=="], - - "@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.33.5", "", { "os": "win32", "cpu": "x64" }, "sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg=="], - - "@types/bun": ["@types/bun@1.3.8", "", { "dependencies": { "bun-types": "1.3.8" } }, "sha512-3LvWJ2q5GerAXYxO2mffLTqOzEu5qnhEAlh48Vnu8WQfnmSwbgagjGZV6BoHKJztENYEDn6QmVd949W4uESRJA=="], - - "@types/node": ["@types/node@25.1.0", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-t7frlewr6+cbx+9Ohpl0NOTKXZNV9xHRmNOvql47BFJKcEG1CxtxlPEEe+gR9uhVWM4DwhnvTF110mIL4yP9RA=="], - - "bun-types": ["bun-types@1.3.8", "", { "dependencies": { "@types/node": "*" } }, "sha512-fL99nxdOWvV4LqjmC+8Q9kW3M4QTtTR1eePs94v5ctGqU8OeceWrSUaRw3JYb7tU3FkMIAjkueehrHPPPGKi5Q=="], - - "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], - - "undici-types": ["undici-types@7.16.0", "", {}, "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="], - - "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], - } -} diff --git a/cli/test-skills/package.json b/cli/test-skills/package.json deleted file mode 100644 index 1d839ab23c..0000000000 --- a/cli/test-skills/package.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "name": "claude-code-skill-tests", - "version": "1.0.0", - "type": "module", - "scripts": { - "test": "bun test", - "test:skills": "bun test src/skill-invocation.test.ts" - }, - "dependencies": { - "@anthropic-ai/claude-agent-sdk": "^0.2.25" - }, - "devDependencies": { - "@types/bun": "latest", - "typescript": "^5.0.0" - } -} 
diff --git a/cli/test-skills/src/skill-invocation.test.ts b/cli/test-skills/src/skill-invocation.test.ts deleted file mode 100644 index 9633eb2ef5..0000000000 --- a/cli/test-skills/src/skill-invocation.test.ts +++ /dev/null @@ -1,91 +0,0 @@ -import { describe, test, expect, beforeAll } from "bun:test"; -import { runPromptAndCapture, wasSkillInvoked, wasToolUsed, validateTestFolder } from "./test-utils"; - -describe("Windmill Skill Invocation", () => { - beforeAll(() => { - if (!process.env.ANTHROPIC_API_KEY) { - throw new Error("ANTHROPIC_API_KEY environment variable is required"); - } - validateTestFolder(); - }); - - describe("Flow Creation", () => { - test("'Create a Windmill flow' should invoke write-flow skill", async () => { - const result = await runPromptAndCapture( - "Create a Windmill flow that fetches data from an API and transforms it. Use placeholder URLs.", - undefined, - 3 - ); - - console.log("Tools used:", result.toolsUsed.map(t => t.tool)); - console.log("Skills invoked:", result.skillsInvoked); - - expect(wasToolUsed(result, "Skill")).toBe(true); - expect(wasSkillInvoked(result, "write-flow")).toBe(true); - }, { timeout: 120000 }); - }); - - describe("Python Script Creation", () => { - test("'Write a Windmill Python script' should invoke write-script-python3 skill", async () => { - const result = await runPromptAndCapture( - "Write a Windmill Python script that fetches data from https://api.example.com/users", - undefined, - 3 - ); - - console.log("Tools used:", result.toolsUsed.map(t => t.tool)); - console.log("Skills invoked:", result.skillsInvoked); - - expect(wasToolUsed(result, "Skill")).toBe(true); - expect(wasSkillInvoked(result, "write-script-python3")).toBe(true); - }, { timeout: 120000 }); - }); - - describe("Bun Script Creation", () => { - test("'Write a Windmill Bun/TypeScript script' should invoke write-script-bun skill", async () => { - const result = await runPromptAndCapture( - "Write a Windmill Bun script that processes JSON 
data", - undefined, - 3 - ); - - console.log("Tools used:", result.toolsUsed.map(t => t.tool)); - console.log("Skills invoked:", result.skillsInvoked); - - expect(wasToolUsed(result, "Skill")).toBe(true); - expect(wasSkillInvoked(result, "write-script-bun")).toBe(true); - }, { timeout: 120000 }); - }); - - describe("Schedule Configuration", () => { - test("'Create a Windmill schedule' should invoke schedules skill", async () => { - const result = await runPromptAndCapture( - "Create a Windmill schedule that runs a script daily at midnight", - undefined, - 3 - ); - - console.log("Tools used:", result.toolsUsed.map(t => t.tool)); - console.log("Skills invoked:", result.skillsInvoked); - - expect(wasToolUsed(result, "Skill")).toBe(true); - expect(wasSkillInvoked(result, "schedules")).toBe(true); - }, { timeout: 120000 }); - }); - - describe("Trigger Configuration", () => { - test("'Set up a Windmill webhook trigger' should invoke triggers skill", async () => { - const result = await runPromptAndCapture( - "Set up a Windmill HTTP trigger for a flow at /api/webhook", - undefined, - 3 - ); - - console.log("Tools used:", result.toolsUsed.map(t => t.tool)); - console.log("Skills invoked:", result.skillsInvoked); - - expect(wasToolUsed(result, "Skill")).toBe(true); - expect(wasSkillInvoked(result, "triggers")).toBe(true); - }, { timeout: 120000 }); - }); -}); diff --git a/cli/test-skills/src/test-utils.ts b/cli/test-skills/src/test-utils.ts deleted file mode 100644 index dbf2ea4be9..0000000000 --- a/cli/test-skills/src/test-utils.ts +++ /dev/null @@ -1,137 +0,0 @@ -import { query, type Options } from "@anthropic-ai/claude-agent-sdk"; -import { existsSync } from "fs"; -import { join } from "path"; - -export interface ToolInvocation { - tool: string; - input: Record; - timestamp: number; -} - -export interface TestResult { - toolsUsed: ToolInvocation[]; - skillsInvoked: string[]; - output: string; -} - -/** - * Get the test-skills directory path - */ -export function 
getTestSkillsDir(): string { - return new URL("..", import.meta.url).pathname; -} - -/** - * Get the test-folder directory path (where user places .claude/skills) - */ -export function getTestFolder(): string { - return join(getTestSkillsDir(), "test-folder"); -} - -/** - * Validate that test-folder exists and has .claude/skills - * Throws an error if validation fails - */ -export function validateTestFolder(): void { - const testFolder = getTestFolder(); - const skillsFolder = join(testFolder, ".claude", "skills"); - - if (!existsSync(testFolder)) { - throw new Error( - `test-folder does not exist at: ${testFolder}\n` + - `Please create it and add your .claude/skills directory inside.` - ); - } - - if (!existsSync(skillsFolder)) { - throw new Error( - `.claude/skills directory not found in test-folder at: ${skillsFolder}\n` + - `Please add your auto-generated Windmill skills to test-folder/.claude/skills/` - ); - } -} - -/** - * Runs a prompt through the Claude Agent SDK and captures tool invocations - * Uses test-folder as cwd where user-provided skills are located - */ -export async function runPromptAndCapture( - prompt: string, - cwd?: string, - maxTurns: number = 3 -): Promise { - const workingDir = cwd ?? 
getTestFolder(); - const toolsUsed: ToolInvocation[] = []; - const skillsInvoked: string[] = []; - let output = ""; - - const options: Options = { - cwd: workingDir, - model: "haiku", - maxTurns, - settingSources: ["project"], // Required to load Skills from filesystem - allowedTools: ["Skill", "Read", "Glob", "Grep", "Bash", "Write", "Edit"], - }; - - for await (const message of query({ prompt, options })) { - if (message.type === "assistant") { - // The assistant message has a BetaMessage which contains content blocks - const content = message.message?.content; - if (Array.isArray(content)) { - for (const block of content) { - if (block.type === "tool_use") { - const toolInvocation: ToolInvocation = { - tool: block.name, - input: block.input as Record, - timestamp: Date.now(), - }; - toolsUsed.push(toolInvocation); - - // Check if this is a Skill tool invocation - if (block.name === "Skill" && typeof block.input === "object" && block.input !== null) { - const skillInput = block.input as { skill?: string }; - if (skillInput.skill) { - skillsInvoked.push(skillInput.skill); - } - } - } else if (block.type === "text") { - output += block.text; - } - } - } - } else if (message.type === "result") { - // Capture final result if available - const resultMessage = message as { result?: string }; - if (typeof resultMessage.result === "string") { - output += resultMessage.result; - } - } - } - - return { - toolsUsed, - skillsInvoked, - output, - }; -} - -/** - * Helper to check if a specific tool was used - */ -export function wasToolUsed(result: TestResult, toolName: string): boolean { - return result.toolsUsed.some((t) => t.tool === toolName); -} - -/** - * Helper to check if a specific skill was invoked - */ -export function wasSkillInvoked(result: TestResult, skillName: string): boolean { - return result.skillsInvoked.some((s) => s === skillName || s.includes(skillName)); -} - -/** - * Helper to get all tool inputs for a specific tool - */ -export function 
getToolInputs(result: TestResult, toolName: string): Record[] { - return result.toolsUsed.filter((t) => t.tool === toolName).map((t) => t.input); -} diff --git a/cli/test-skills/tsconfig.json b/cli/test-skills/tsconfig.json deleted file mode 100644 index 45f0069307..0000000000 --- a/cli/test-skills/tsconfig.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "compilerOptions": { - "target": "ES2022", - "module": "ESNext", - "moduleResolution": "bundler", - "strict": true, - "esModuleInterop": true, - "skipLibCheck": true, - "forceConsistentCasingInFileNames": true, - "outDir": "./dist", - "rootDir": "./src", - "declaration": true, - "types": ["bun-types"] - }, - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"] -} diff --git a/cli/test/guidance_writer_unit.test.ts b/cli/test/guidance_writer_unit.test.ts new file mode 100644 index 0000000000..dbe541e65a --- /dev/null +++ b/cli/test/guidance_writer_unit.test.ts @@ -0,0 +1,148 @@ +import { describe, expect, test } from "bun:test"; +import { mkdtemp, mkdir, readFile, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { writeAiGuidanceFiles } from "../src/guidance/writer.ts"; + +async function withTempDir(fn: (tempDir: string) => Promise): Promise { + const tempDir = await mkdtemp(join(tmpdir(), "wmill_guidance_writer_")); + try { + await fn(tempDir); + } finally { + await rm(tempDir, { recursive: true, force: true }); + } +} + +async function writeSkill( + rootDir: string, + skillName: string, + content: string +): Promise { + const skillPath = join(rootDir, skillName, "SKILL.md"); + await mkdir(join(rootDir, skillName), { recursive: true }); + await writeFile(skillPath, content, "utf8"); + return skillPath; +} + +describe("writeAiGuidanceFiles", () => { + test("preserves custom skills when refreshing generated guidance", async () => { + await withTempDir(async (tempDir) => { + const skillsDir = join(tempDir, ".claude", "skills"); + const 
customSkillContent = `--- +name: custom-skill +description: Custom skill +--- + +Preserve me. +`; + const staleGeneratedContent = "stale generated skill"; + + const customSkillPath = await writeSkill(skillsDir, "custom-skill", customSkillContent); + const generatedSkillPath = await writeSkill(skillsDir, "write-flow", staleGeneratedContent); + + await writeAiGuidanceFiles({ + targetDir: tempDir, + overwriteProjectGuidance: false, + }); + + expect(await readFile(customSkillPath, "utf8")).toBe(customSkillContent); + + const generatedSkillContent = await readFile(generatedSkillPath, "utf8"); + expect(generatedSkillContent).not.toBe(staleGeneratedContent); + expect(generatedSkillContent).toContain("name: write-flow"); + }); + }); + + test("preserves custom skills when copying a skill bundle from source", async () => { + await withTempDir(async (tempDir) => { + const skillsDir = join(tempDir, ".claude", "skills"); + const customSkillContent = `--- +name: custom-skill +description: Custom skill +--- + +Keep this local skill. +`; + const sourceSkillContent = `--- +name: write-flow +description: Replacement flow skill +--- + +Copied from source. +`; + const bundleOnlySkillContent = `--- +name: bundle-only +description: Bundle only skill +--- + +Copied from source bundle. 
+`; + + const customSkillPath = await writeSkill(skillsDir, "custom-skill", customSkillContent); + const existingGeneratedSkillPath = await writeSkill(skillsDir, "write-flow", "old content"); + const sourceSkillsDir = join(tempDir, "source-skills"); + + await writeSkill(sourceSkillsDir, "write-flow", sourceSkillContent); + const bundleOnlySkillPath = await writeSkill( + sourceSkillsDir, + "bundle-only", + bundleOnlySkillContent + ); + + await writeAiGuidanceFiles({ + targetDir: tempDir, + overwriteProjectGuidance: false, + skillsSourcePath: sourceSkillsDir, + }); + + expect(await readFile(customSkillPath, "utf8")).toBe(customSkillContent); + expect(await readFile(existingGeneratedSkillPath, "utf8")).toBe(sourceSkillContent); + expect(await readFile(bundleOnlySkillPath.replace(sourceSkillsDir, skillsDir), "utf8")).toBe( + bundleOnlySkillContent + ); + }); + }); + + test("builds AGENTS skill references from copied directory names", async () => { + await withTempDir(async (tempDir) => { + const sourceSkillsDir = join(tempDir, "source-skills"); + await writeSkill( + sourceSkillsDir, + "custom-folder", + `--- +name: write-flow +description: Custom bundle skill +--- + +Copied from source bundle. 
+` + ); + + await writeAiGuidanceFiles({ + targetDir: tempDir, + overwriteProjectGuidance: false, + skillsSourcePath: sourceSkillsDir, + }); + + const agentsMd = await readFile(join(tempDir, "AGENTS.md"), "utf8"); + expect(agentsMd).toContain(".claude/skills/custom-folder/SKILL.md"); + expect(agentsMd).not.toContain(".claude/skills/write-flow/SKILL.md"); + }); + }); + + test("writes AGENTS.md and CLAUDE.md even if skills creation fails", async () => { + await withTempDir(async (tempDir) => { + await writeFile(join(tempDir, ".claude"), "not a directory\n", "utf8"); + + await expect( + writeAiGuidanceFiles({ + targetDir: tempDir, + overwriteProjectGuidance: false, + }) + ).rejects.toThrow(); + + expect(await readFile(join(tempDir, "AGENTS.md"), "utf8")).toContain(".claude/skills/"); + expect(await readFile(join(tempDir, "CLAUDE.md"), "utf8")).toContain("@AGENTS.md"); + }); + }); +}); diff --git a/docs/failing-tests.md b/docs/failing-tests.md new file mode 100644 index 0000000000..d0ae44f109 --- /dev/null +++ b/docs/failing-tests.md @@ -0,0 +1,33 @@ +# Failing Tests + +This file tracks benchmark cases that still fail or need follow-up validation. 
+ +## Flow + +- `flow-test6-ai-agent-tools` + Latest failing run: `ai_evals/results/2026-04-09T11-25-24.107Z__flow` + Issues: + final output does not include the actions or tool-result details the prompt asks for + `open_support_ticket` contains a syntax bug + +- `flow-test7-simple-modification` + Latest failing run: `ai_evals/results/2026-04-09T11-25-24.107Z__flow` + Issues: + `validate_data` was added, but the failure behavior still does not match the requested contract + `save_results` throws instead of returning a graceful structured result + +- `flow-test11-preprocessor-and-failure-handler` + Latest failing run: `ai_evals/results/2026-04-09T11-25-24.107Z__flow` + Issues: + the model creates regular `preprocessor` and `failure` modules + it does not use Windmill's special top-level `preprocessor_module` and `failure_module` + +## Needs Reconfirmation + +- `flow-test4-order-processing-loop` + Full-suite failing run: `ai_evals/results/2026-04-09T11-25-24.107Z__flow` + Follow-up passing run after prompt improvement: `ai_evals/results/2026-04-09T13-29-15.877Z__flow` + Note: + this case failed on invalid `branchone` downstream result access + it passed after adding explicit branch-output guidance to the flow prompt + rerun the full flow suite to confirm the fix holds in the broader benchmark diff --git a/docs/system-prompt-testing-plan.md b/docs/system-prompt-testing-plan.md new file mode 100644 index 0000000000..9b12f1c5e0 --- /dev/null +++ b/docs/system-prompt-testing-plan.md @@ -0,0 +1,1000 @@ +# System Prompt And Skill Output Testing Plan + +Historical note: + +- This file is a planning document and no longer matches the current benchmark CLI in every detail. +- The current source of truth is [ai_evals/README.md](../ai_evals/README.md) and [system-prompt-testing-status.md](system-prompt-testing-status.md). 
+- In particular, the current tool no longer has the old variants, compare, or history workflow described below. + +## Goal + +Build a single testing strategy that answers one question reliably: + +> Given a user task, how good is the artifact produced by our AI system? + +This plan is intentionally focused on **black-box output evaluation**, not on unit testing frontend or CLI internals. + +The intended end state is a **new repo-level benchmark CLI** that runs a shared +eval suite across multiple surfaces. + +That benchmark CLI should be the main entrypoint for: + +- running one case +- running a benchmark set +- comparing baseline vs candidate variants +- writing benchmark history snapshots + +Frontend and Windmill CLI are not meant to become separate testing products. +They should be implemented as adapters behind this shared benchmark CLI. + +The system under test is: + +- Frontend AI Chat in `script`, `flow`, and `app` modes +- CLI local development experience driven by generated guidance and skills + +The artifact under test is: + +- Script code +- Flow JSON / module structure +- Raw app files and backend runnables +- Files and project artifacts produced in a local CLI workspace + +## Non-Goals + +This plan does **not** treat the following as the main testing target: + +- Unit testing helper functions, stores, or tool wrapper internals +- UI rendering behavior, DOM interactions, or component-level correctness +- `wmill init` correctness as a standalone product area +- Backend route correctness except where it affects prompt delivery or AI configuration + +Those may still need lightweight tests, but they are not the core of prompt reliability evaluation. + +## Core Principles + +### 1. Black-box evaluation only + +The runner should provide an input task to the real system setup, let it run, collect the final artifact, and score the result. 
+ +In practice, this runner should be exposed through the new repo-level benchmark +CLI rather than through separate ad hoc test commands for each surface. + +### 2. Headless execution + +Frontend evaluation must be fully decoupled from the browser UI. It should exercise prompt assembly, tool selection, and tool execution logic without mounting Svelte components or clicking through the app. + +### 3. Real prompt environment + +All evals must use the same prompt-building path, tool definitions, and skill content that production uses, or a clearly defined variant of them. + +### 4. Artifact-first scoring + +The main score is based on the produced artifact, not on intermediate transcripts. + +### 5. Reliability over one-off success + +A prompt is not "good" because it passed once. Reliability means pass rate across repeated runs and across a representative case set. + +### 6. Track benchmark history over time + +The suite must not only evaluate the current output. It must also produce a +git-tracked benchmark history so the team can see whether the system is +improving over time. + +This history should focus on official benchmark snapshots, not on every local +experiment. + +### 7. Shared corpus, separate adapters + +Frontend and CLI should share the same evaluation corpus format when possible, but each surface should have its own execution adapter. + +### 8. CLI first, UI last + +The CLI should be the first surface brought to a high-confidence benchmark +state. + +It is the cleanest foundation for the suite because it produces direct files in +an isolated workspace, has less ambiguity than the frontend, and is easier to +score deterministically. + +Frontend should reuse the benchmark model proven on the CLI rather than define +a parallel testing philosophy. + +### 9. UI comes last + +The testing suite must exist and be trustworthy before building a studio UI on top of it. 
+ +## Current State + +## Shared Prompt Source Of Truth + +The repo already has the right content split: + +- `system_prompts/` is the shared source of truth for core Windmill prompt content +- frontend adds chat-specific tool instructions on top +- CLI materializes guidance and skill content from generated outputs + +This is a strong foundation for a shared eval suite. + +## Execution Priority + +Even though the repo already has useful frontend eval scaffolding, the +implementation priority should be: + +1. build the repo-level benchmark CLI and use the Windmill CLI adapter as the + first implementation behind it +2. make the CLI artifact-evaluation path excellent +3. stabilize shared scoring, reporting, and benchmark history around that path +4. bring frontend onto the same benchmark model through the same benchmark CLI +5. build the UI only after the underlying suite is trustworthy + +This keeps the hardest product question focused on artifact quality rather than +on UI workflow. + +## Benchmark CLI As The Main Product + +The testing suite should have one primary interface: + +- a new repo-level benchmark CLI + +The benchmark CLI should be able to run: + +- Windmill CLI evals +- frontend evals +- shared reporting and comparison commands + +Illustrative command shape: + +```bash +ai-evals run --surface cli --case bun-hello-script +ai-evals run --surface frontend-flow --case support-flow +ai-evals compare --surface cli --variant baseline --variant candidate-a +ai-evals history latest +``` + +The exact binary name can change, but the architecture should not: + +- one benchmark CLI +- shared case loader +- shared scoring +- shared history writer +- separate surface adapters underneath + +## Temporary Bootstrap Code + +This bootstrap phase is now complete for frontend `flow`, `app`, and `script`. 
+ +Frontend AI benchmark ownership has moved into `ai_evals/`, and the frontend +source tree no longer owns a separate AI benchmark suite under +`frontend/.../__tests__/...`. + +Benchmark authors should only need the repo-level benchmark CLI to run the +long-term suite. + +The only temporary frontend-specific piece that remains is a thin Vitest/Vite +loader bridge so the benchmark runner can import the production chat modules in +the same module/runtime environment they already expect. + +## Frontend: What Exists Today + +The current frontend benchmark path is **decoupled from the UI** and now owned +by `ai_evals`. + +They currently: + +- run through the shared headless chat loop +- use production prompt builders +- use production tool definitions +- use benchmark-owned helper adapters that write to temp workspaces on disk +- execute through the frontend module/runtime environment only as a loader bridge + +This means the current frontend evals are now a proper benchmark adapter, +not a frontend test suite. + +That is the correct direction. + +### Frontend Architecture Notes + +There are three categories of code involved: + +- shared production logic: + - production system prompt builders + - production tool definitions + - production `runChatLoop` +- benchmark-only infrastructure: + - case loading + - variant loading + - judge scoring + - benchmark result shaping + - history/reporting integration +- alternate helper adapters: + - production helpers mutate UI/editor state + - benchmark helpers mutate temp-workspace files + +This is important because the benchmark suite is **not** meant to duplicate the +frontend chat logic. It is meant to reuse the production chat loop and tool +definitions while swapping the execution backend from UI state to filesystem +state. + +## Frontend: What Is Missing + +### Coverage gaps + +- `script` is now exposed through the shared benchmark CLI, but it only has initial case coverage. 
+- Existing frontend coverage is still too small relative to the target benchmark corpus. + +### Reliability gaps + +- Frontend flow and app can already run with pass/fail results and repeated runs through the shared benchmark CLI. +- The remaining gap is turning that into stronger routine reliability gating with better deterministic validators and broader routine case coverage. +- Frontend reliability reporting is still less mature than the intended end state for official CI tiers and richer failure triage. + +### Prompt-iteration gaps + +- Frontend prompt variants are file-backed now, but the repo only ships baseline manifests by default. +- Creating and curating meaningful frontend candidate variants is still a mostly manual workflow compared with the CLI snapshot flow. +- Frontend prompt comparison exists through the shared `compare` command, but it still needs broader routine use and better variant coverage. + +### Artifact-validation gaps + +- The current flow and app helpers are file-backed now, but several effects are still lightweight and should become more realistic over time. +- Linting and runnable validation are currently too lightweight in the eval path. +- Datatable interactions are mocked rather than validated as output constraints. +- The suite does not yet enforce a strong deterministic validator layer before using an LLM judge. + +### Corpus gaps + +- Frontend surfaces already use shared case manifests under `ai_evals/cases/frontend/`. +- The remaining gap is breadth and representativeness, not the absence of a shared corpus. +- Cases still need richer metadata, stronger deterministic constraints, and a larger regression library built from real failures. + +### Reporting gaps + +- Frontend runs already emit the shared benchmark result shape and can write official history snapshots through the shared benchmark CLI. +- There is still no rich leaderboard or trend-oriented debugging workflow for frontend surfaces specifically. 
+- There is still no strong "worst failures first" report for debugging regressions. + +## Frontend: Perfect Testing Logic + +The perfect frontend testing logic is: + +Frontend should not be the place where the benchmark philosophy is invented. + +It should consume the shared case format, validator model, reporting format, +and history format already proven through the CLI path. + +### 1. Stay fully headless + +Do not mount the chat UI. + +Do not click through the frontend. + +Do not use Playwright for prompt evaluation. + +The runner should directly invoke: + +- the production system message builder +- the production user message builder +- the production tool list +- the production chat loop + +It is acceptable for the benchmark adapter to use the frontend Vitest/Vite +runtime as a thin loader bridge when production chat modules still depend on +that environment, as long as: + +- the benchmark entrypoint remains the shared benchmark CLI +- the benchmark logic and fixtures live under `ai_evals` +- the frontend source tree does not own a separate benchmark suite + +This keeps the suite decorrelated from the frontend UI while still testing the real AI logic. + +### 2. 
Test the three frontend AI surfaces separately + +#### Script mode + +Input: + +- user prompt +- optional initial script +- optional context such as selected workspace runnables or DB references + +Output: + +- final script code + +Scoring: + +- deterministic validators first +- LLM judge second + +Deterministic validators should include: + +- expected entrypoint present +- syntax / parse validity +- language-appropriate compile or lint check where feasible +- required behaviors or structures present +- forbidden patterns absent + +#### Flow mode + +Input: + +- user prompt +- optional initial flow +- optional schema +- optional workspace context + +Output: + +- final flow definition + +Scoring: + +- flow JSON is structurally valid +- expected module types exist +- expected branches / loops / tools exist +- schema shape matches required inputs +- required data flow connections are present +- LLM judge scores completeness and overall quality + +#### App mode + +Input: + +- user prompt +- optional initial app +- optional workspace context + +Output: + +- final frontend files +- final backend runnables + +Scoring: + +- expected files and runnables exist +- file structure is coherent +- app bundle / lint checks pass where feasible in headless mode +- required UI/backend behaviors are represented in the artifact +- LLM judge scores completeness and product quality + +### 3. Use repeated runs, not single runs + +Each case should run more than once. + +Recommended starting point: + +- PR smoke run: 2 runs per case on a small curated subset +- nightly reliability run: 5 to 10 runs per case on the full benchmark set + +Primary metric: + +- pass rate + +Secondary metrics: + +- average deterministic score +- average judge score +- worst-case judge score +- latency +- total tool calls + +### 4. Keep tool traces as diagnostics only + +Tool usage matters for debugging, but it should not be the primary score. 
+ +The suite should record: + +- tool names +- tool arguments +- iteration count +- model/provider + +But the main question remains: + +> Was the final artifact good? + +### 5. Make prompt variants easy to test + +Prompt candidates should not require editing test code. + +The suite should support a file-based prompt variant workflow. + +Example direction: + +- `ai_evals/variants/frontend/script/baseline.md` +- `ai_evals/variants/frontend/script/candidate-a.md` +- `ai_evals/variants/frontend/flow/baseline.md` +- `ai_evals/variants/frontend/app/baseline.md` + +Each variant should be runnable side by side against the same case set. + +### 6. Separate benchmark cases from test code + +Benchmark cases should live in data files, not inline in test files. + +Each case should define: + +- surface +- user prompt +- initial artifact if any +- required constraints +- forbidden constraints +- judge rubric +- tags + +This makes the benchmark editable by prompt authors without changing runner logic. + +## CLI: What Exists Today + +The current CLI tests prove only one narrow property: + +> Given a prompt, does the model invoke the expected skill? + +That is useful as a smoke signal, but it is far from sufficient for output evaluation. + +The current CLI setup also depends on manual preparation of a `.claude/skills` folder, which makes repeated benchmarking and prompt iteration much harder than necessary. + +## CLI: What Is Missing + +### Output-evaluation gap + +- The current suite does not score the artifact produced by the CLI workflow. +- It only checks whether a skill was invoked. +- It does not verify that the resulting files are good. + +### Automation gap + +- The current setup requires manual copying of generated skills into a test folder. +- That makes the suite too fragile and too manual for rapid prompt iteration. + +### Reliability gap + +- There is no repeated-run measurement. +- There is no pass-rate metric. +- There is no baseline vs candidate comparison workflow. 
+ +### Prompt-variant gap + +- There is no first-class way to test alternate skill bundles or alternate generated guidance. +- There is no clean candidate flow for "I changed skill content, show me whether reliability improved." + +### Corpus gap + +- CLI cases are not aligned with frontend benchmark cases. +- There is no shared benchmark language describing the task, initial state, and expected artifact. + +### Reporting gap + +- There is no stable output report for artifact comparison. +- There is no failure clustering by skill bundle, task family, or model. + +## CLI: Perfect Testing Logic + +The perfect CLI testing logic is: + +This should be the reference implementation for the suite. + +### 1. Evaluate the final artifact, not the skill invocation + +Skill invocation should be kept as diagnostic metadata only. + +The primary output should be the files produced in a temporary workspace. + +Example CLI artifacts: + +- generated script files +- generated flow files +- raw app project files +- schedule / trigger config files +- AGENTS / guidance files only when they are directly relevant to the task + +### 2. Create the workspace automatically + +The runner should create a fresh temporary project for every case. + +It should seed that workspace with: + +- initial files for the benchmark case +- the current generated CLI guidance and skills +- any fixture data required by the task + +It should never depend on a manually maintained test folder. + +### 3. Materialize the exact skill bundle under test + +The runner should be able to test: + +- the current production skill bundle +- a candidate skill bundle built from prompt changes + +For CLI, a "prompt variant" is effectively a skill-bundle variant. + +That means the suite should support alternate generated skill content without requiring ad hoc manual copies. + +### 4. 
Score the final workspace + +The scoring approach should match the frontend philosophy: + +- deterministic validators first +- LLM judge second + +Deterministic validators for CLI should include: + +- expected files created +- expected file names and locations +- required content patterns present +- expected artifact type produced +- optional parse / lint / compile validation where feasible + +### 5. Run repeated benchmarks + +The CLI should use the same reliability logic as frontend: + +- benchmark set +- repeated runs +- pass rate +- baseline vs candidate comparison + +### 6. Keep skill traces as diagnostics + +Record: + +- invoked skills +- order of invocation +- turns +- file changes + +But do not let that replace artifact evaluation. + +## Perfect Shared Benchmark Model + +The frontend and CLI should share the same benchmark concept. + +Each evaluation case should define: + +- `id` +- `surface` +- `user_prompt` +- `initial_state` +- `workspace_context` +- `artifact_checks` +- `judge_rubric` +- `tags` + +The same task should be runnable on multiple surfaces when it makes sense. + +This gives direct comparability between: + +- frontend script vs CLI script +- frontend flow vs CLI flow +- frontend app vs CLI app + +## Recommended Benchmark Categories + +The first benchmark set should be broad, but not huge. + +Recommended initial size: + +- 20 to 30 core cases + +Recommended categories: + +- from-scratch script creation +- script modification +- from-scratch flow creation +- flow modification +- from-scratch raw app creation +- raw app modification +- reuse of workspace assets +- tasks requiring datatable awareness +- tasks requiring constraints or edge-case handling +- known regressions from real failures + +Every category should contain both: + +- "easy success" cases +- "high ambiguity" cases + +This is essential for measuring reliability rather than only measuring best-case demos. + +## Scoring Model + +The suite should use three layers. 
+ +## Layer 1: Deterministic Validators + +This is the hard gate. + +Examples: + +- parse succeeds +- artifact shape is valid +- required entrypoint exists +- expected files exist +- required module types exist +- expected inputs / schema fields exist +- forbidden patterns are absent + +If layer 1 fails, the run is a failure. + +## Layer 2: Task-Specific Validators + +These are stronger artifact checks derived from the benchmark case. + +Examples: + +- flow contains a loop and a conditional branch +- app includes a reset button path and backend wiring +- script performs the requested transformation + +These should still be deterministic whenever possible. + +## Layer 3: LLM Judge + +Use an LLM judge only after deterministic validation. + +The judge should answer: + +- Did the artifact satisfy the request? +- Is it complete? +- Is it coherent for Windmill? +- How close is it to the intended solution? + +The judge score is valuable, but it should not be the only oracle. + +## Benchmark History + +The suite should persist official benchmark summaries in a git-tracked history +layer so improvements and regressions can be reviewed over time. + +## What Should Be Git-Tracked + +Only official benchmark outputs should be committed: + +- post-merge benchmark snapshots on `main` +- scheduled nightly benchmark snapshots +- manually promoted benchmark snapshots when the team wants to record a result + +Each official snapshot should produce: + +- one detailed run JSON +- one entry in an append-only summary file +- regenerated rollups for trend views + +## What Should Not Be Git-Tracked + +The following should remain local or external by default: + +- raw transcripts +- full model messages +- large generated artifact bundles +- ad hoc local experiments +- temporary comparison runs + +This keeps git history focused on stable benchmark signals instead of noisy +debug output. 
+ +## Reliability Metrics + +Every prompt or skill candidate should be reported with: + +- total cases +- passes +- pass rate +- average judge score +- median judge score +- worst-case judge score +- average latency +- average turns + +Per-case results should also be retained. + +This is the minimum needed to compare: + +- baseline vs candidate +- provider vs provider +- frontend vs CLI + +## Benchmark Metrics + +The history layer should track metrics in four groups. + +## Quality Metrics + +- `pass_rate` +- `deterministic_pass_rate` +- `judge_score_mean` +- `judge_score_median` +- `judge_score_p10` +- `category_pass_rate` + +## Reliability Metrics + +- `runs_per_case` +- `flake_rate` +- `path_consistency` + +## Efficiency Metrics + +- `latency_ms_mean` +- `latency_ms_median` +- `tokens_prompt_mean` +- `tokens_completion_mean` +- `tokens_total_mean` +- `tool_calls_mean` +- `iterations_mean` +- `estimated_cost_mean` +- `cost_per_success` +- `latency_per_success` + +## Provenance Metrics + +- `timestamp` +- `git_sha` +- `suite_version` +- `scoring_version` +- `surface` +- `variant_name` +- `provider` +- `model` +- `judge_model` + +The provenance metrics are essential. Without them, a trend line can mix prompt +changes with upstream model drift and become hard to interpret. + +## Efficiency Score + +The suite should not collapse everything into one number. + +It should track at least three top-level composite scores: + +- `quality_score` +- `efficiency_score` +- `value_score` + +Recommended interpretation: + +- `quality_score`: how good the artifact is +- `efficiency_score`: how fast and cheap the system is relative to peers +- `value_score`: quality-adjusted efficiency + +These composite scores should sit on top of the raw metrics, not replace them. + +## Proposed Suite Architecture + +The suite should be built in six layers. 
+ +## Layer 1: Benchmark Data + +Purpose: + +- define the cases once + +Contents: + +- case files +- reusable initial fixtures +- evaluation metadata + +## Layer 2: Benchmark CLI + +Purpose: + +- provide one shared entrypoint for the suite + +Responsibilities: + +- load cases and variants +- select a surface adapter +- run one case or a benchmark set +- invoke shared scoring and history writing +- expose comparison and history commands + +## Layer 3: Surface Adapters + +Purpose: + +- run a case against one surface + +Adapters: + +- frontend-script adapter +- frontend-flow adapter +- frontend-app adapter +- CLI adapter + +Responsibilities: + +- prepare the correct prompt environment +- prepare the initial artifact state +- run the real model loop +- return the final artifact plus diagnostics + +## Layer 4: Scoring And Reporting + +Purpose: + +- evaluate the final artifact +- aggregate repeated runs +- compare variants + +Responsibilities: + +- deterministic validation +- LLM judging +- pass/fail computation +- result serialization +- comparison reports + +## Layer 5: Benchmark History + +Purpose: + +- preserve official benchmark summaries over time +- support trend analysis and regression review + +Responsibilities: + +- store official run snapshots +- append benchmark summary entries +- generate rollups for charts and dashboards +- keep provenance metadata for every tracked run + +## Layer 6: UI Studio + +Purpose: + +- provide a user interface for the exact same benchmark CLI and runner stack + +Important rule: + +The UI must not define its own execution semantics. + +It must only be a frontend over the same suite used in CI and local benchmarking. 
+ +## Proposed Development Order + +### Phase 1: Stabilize the benchmark model + +Deliverables: + +- shared case schema +- shared result schema +- initial core benchmark set + +### Phase 2: Build the benchmark CLI shell + +Deliverables: + +- repo-level benchmark CLI entrypoint +- `run`, `compare`, and `history` command skeletons +- adapter selection layer +- temporary wiring to the first CLI adapter + +### Phase 3: Replace the CLI smoke suite with real artifact evaluation + +Deliverables: + +- temp-workspace runner +- automatic skill-bundle materialization +- artifact scoring +- repeated-run support +- baseline vs candidate skill-bundle comparison + +### Phase 4: Add shared reporting and benchmark history around the CLI path + +Deliverables: + +- baseline vs candidate reports +- pass-rate summaries +- worst-failure reports +- official run schema +- git-tracked benchmark summary file +- history snapshot writer +- rollup generation for trend charts + +### Phase 5: Finish the frontend black-box harness on top of the shared model + +Deliverables: + +- convert current flow and app evals into proper scored reliability tests +- add script eval support +- add repeated-run support +- add prompt-variant loading from files +- align frontend outputs with the shared result and history format +- expose frontend runs through the same benchmark CLI + +### Phase 6: Add CI tiers + +Deliverables: + +- fast PR smoke benchmark +- fuller nightly benchmark +- official history updates on `main` and scheduled runs +- manual benchmark mode for prompt authors + +### Phase 7: Build the UI studio + +Deliverables: + +- run selector +- variant selector +- per-case comparison view +- artifact diff view +- reliability dashboard +- trend dashboard backed by git-tracked benchmark history + +This phase comes last because the UI is only valuable once the underlying suite is stable and trusted. + +## Proposed Prompt Variant Workflow + +The suite should make it cheap to test new prompt candidates. 
+ +Recommended workflow: + +1. Edit or add a candidate prompt file. +2. Run the benchmark against baseline and candidate. +3. Compare pass rate and score. +4. Inspect worst regressions first. +5. Promote only if the candidate improves the benchmark materially. + +For CLI, the same workflow applies, but the tested unit is the generated skill bundle rather than a single chat system prompt. + +## Suggested Repository Direction + +This plan does not require the UI studio to exist first. + +A reasonable repo structure would be: + +```text +ai_evals/ + cli/ + cases/ + fixtures/ + history/ + runs/ + rollups/ + variants/ + frontend/ + script/ + flow/ + app/ + cli/ + results/ # gitignored + scripts/ + adapters/ + scoring/ + reports/ +``` + +The exact folder names can change, but the architectural split should remain. + +## What "Done" Looks Like + +This project is successful when all of the following are true: + +- one repo-level benchmark CLI is the primary way to run prompt evals +- frontend prompt behavior is tested headlessly and independently from the UI +- CLI local-dev behavior is tested by evaluating the final files it produces +- benchmark cases are shared where possible between frontend and CLI +- prompt and skill candidates can be tested without editing test code +- reliability is reported as pass rate over repeated runs +- baseline vs candidate comparisons are easy to run and inspect +- the UI studio is only a thin interface over the same trusted runner + +## Final Recommendation + +The current frontend evals should be treated as a useful starting point, not the finished solution. + +They already prove that the repo can test AI behavior without coupling to the browser UI. 
+ +The main work now is: + +- build the repo-level benchmark CLI as the durable entrypoint +- replace CLI invocation checks with artifact evaluation +- make the CLI path the reference benchmark implementation +- unify frontend under that same benchmark model +- make frontend evals complete and reliability-oriented only after the shared + scoring model is stable +- build the UI only after the suite is strong enough to stand on its own diff --git a/docs/system-prompt-testing-status.md b/docs/system-prompt-testing-status.md new file mode 100644 index 0000000000..ef9be36161 --- /dev/null +++ b/docs/system-prompt-testing-status.md @@ -0,0 +1,129 @@ +# System Prompt Testing Status + +This document describes the benchmark tool that exists today. It is the current +truth for `ai_evals/`. + +The longer planning document in +[system-prompt-testing-plan.md](./system-prompt-testing-plan.md) +still contains useful background, but parts of its workflow are now historical +because the old variants/history system was removed. + +## Current Tool + +There is one repo-level benchmark CLI under `ai_evals/` with three commands: + +- `bun run cli -- models` +- `bun run cli -- cases [mode]` +- `bun run cli -- run [caseIds...]` + +Supported modes: + +- `cli` +- `flow` +- `script` +- `app` + +Public `run` options: + +- `--runs <n>` +- `--output <path>` +- `--model <alias>` +- `--verbose` +- `--record` + +There is no variant workflow and no compare command in the current tool. +Tracked history is intentionally minimal: `run --record` appends one compact +summary line to `ai_evals/history/<mode>.jsonl`. This is only allowed for +full-suite runs, not selected case ids. History lines include average token +usage when the benchmark mode reports it, plus average judge score and per-case +duration/judge/token usage summaries. + +## How It Works + +Each attempt runs: + +1. the current production prompts, tools, and guidance from this checkout +2. 
deterministic validation +3. LLM judging + +Results are written locally under `ai_evals/results/` as: + +- a summary JSON file +- a sibling artifacts directory containing the generated flow/script/app/workspace + +If `--record` is used, the CLI also appends a compact JSONL summary line to the +tracked file for that mode under `ai_evals/history/`. + +## Current Architecture + +- `ai_evals/cases/`: one YAML manifest per mode +- `ai_evals/fixtures/`: initial and expected fixtures +- `ai_evals/core/`: shared case loading, model resolution, validation, judging, and result writing +- `ai_evals/history/`: optional tracked pass-rate history written by `run --record`, one JSONL file per mode +- `ai_evals/modes/`: one runner per mode + +Execution model: + +- `flow`, `script`, and `app` reuse the production frontend chat loop and production tool definitions through the frontend Vitest bridge +- `cli` creates a temp workspace, writes the current checkout guidance into it, and runs the Anthropic agent SDK against that workspace + +## Case Model + +Each case is intentionally small: + +- `prompt` +- optional `initial` +- optional `expected` +- optional `validate` + +`validate` is mainly used for stronger deterministic checks where exact fixture +matching would be too strict, especially for `flow` creation cases. + +Examples of current deterministic checks: + +- schema contains one of several accepted input shapes +- `results.*` references resolve +- required code/input characteristics exist in some module +- expected workspace files are created in `cli` mode + +## Model Selection + +Model aliases are resolved through a shared registry in `ai_evals/core/models.ts`. 
+ +Current aliases: + +- `haiku` +- `sonnet` +- `opus` +- `4o` + +Notes: + +- the `models` command also shows accepted alias spellings such as `gpt-4o` and `claude-opus-4.6` +- frontend modes can use Anthropic and OpenAI-backed aliases +- `cli` mode is Anthropic-only because it runs through the Anthropic agent SDK +- the judge model is separate and currently defaults to `claude-sonnet-4-6` + +## What Is Working Well + +- one simple local benchmark CLI +- real production execution paths instead of synthetic prompt variants +- local result and artifact persistence by default +- live frontend progress output +- reusable flow/script/app/cli runners under one tool +- deterministic validation can now catch real runtime-invalid flow wiring + +## What Still Needs Work + +- broader case coverage across all four modes +- stronger deterministic validators for more cases, especially app/script semantics +- clearer per-case validation metadata as the corpus grows +- CI automation for smoke and nightly runs + +## Recommended Next Focus + +The next high-value work is: + +1. add more realistic benchmark cases +2. keep simplifying deterministic validators so they check correctness, not one exact implementation +3. 
add CI only after the local benchmark signal is trustworthy diff --git a/frontend/src/lib/components/copilot/chat/AIChatManager.svelte.ts b/frontend/src/lib/components/copilot/chat/AIChatManager.svelte.ts index bb2ad5686b..faf5da8c2b 100644 --- a/frontend/src/lib/components/copilot/chat/AIChatManager.svelte.ts +++ b/frontend/src/lib/components/copilot/chat/AIChatManager.svelte.ts @@ -450,7 +450,9 @@ class AIChatManager { } else if (this.mode === AIMode.FLOW) { return prepareFlowUserMessage( pendingPrompt, - this.flowAiChatHelpers!.getFlowAndSelectedId() + this.flowAiChatHelpers!.getFlowAndSelectedId(), + [], + this.flowAiChatHelpers!.inlineScriptSession ) } else if (this.mode === AIMode.NAVIGATOR) { return prepareNavigatorUserMessage(pendingPrompt) @@ -648,7 +650,8 @@ class AIChatManager { userMessage = prepareFlowUserMessage( oldInstructions, this.flowAiChatHelpers!.getFlowAndSelectedId(), - oldSelectedContext + oldSelectedContext, + this.flowAiChatHelpers!.inlineScriptSession ) break case AIMode.NAVIGATOR: diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appChat.eval.test.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/appChat.eval.test.ts deleted file mode 100644 index a42ee1f099..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/app/appChat.eval.test.ts +++ /dev/null @@ -1,303 +0,0 @@ -import { describe, expect, it } from 'vitest' -import { runVariantComparison, writeAppComparisonResults } from './appEvalRunner' -import { BASELINE_VARIANT, STREAMLINED_VARIANT } from './variants' -import { loadAppFixtureForEval } from './appFixtureLoader' -// @ts-ignore - Node.js path -import { dirname, join } from 'path' -// @ts-ignore - Node.js url -import { fileURLToPath } from 'url' -import type { AIProvider } from '$lib/gen/types.gen' - -// Get API keys from environment - tests will be skipped if none are set -// @ts-ignore -const OPENAI_API_KEY = process.env.OPENAI_API_KEY -// @ts-ignore -const ANTHROPIC_API_KEY = 
process.env.ANTHROPIC_API_KEY - -const hasAnyKey = OPENAI_API_KEY || ANTHROPIC_API_KEY -const describeWithApiKey = hasAnyKey ? describe : describe.skip - -// Get __dirname equivalent for ES modules -const __filename = fileURLToPath(import.meta.url) -const __dirname = dirname(__filename) - -// Build model variants based on available keys -interface ModelVariant { - model: string - provider: AIProvider - apiKey: string -} - -const MODEL_VARIANTS: ModelVariant[] = [ - ...(OPENAI_API_KEY - ? [{ model: 'gpt-4o', provider: 'openai' as AIProvider, apiKey: OPENAI_API_KEY }] - : []), - ...(ANTHROPIC_API_KEY - ? [ - { - model: 'claude-haiku-4-5-20241022', - provider: 'anthropic' as AIProvider, - apiKey: ANTHROPIC_API_KEY - } - ] - : []) -] - -const VARIANTS = [ - ...MODEL_VARIANTS.map((mv) => ({ - ...BASELINE_VARIANT, - model: mv.model, - name: `baseline-${mv.provider}-${mv.model}`, - _provider: mv.provider, - _apiKey: mv.apiKey - })), - ...MODEL_VARIANTS.map((mv) => ({ - ...STREAMLINED_VARIANT, - model: mv.model, - name: `streamlined-${mv.provider}-${mv.model}`, - _provider: mv.provider, - _apiKey: mv.apiKey - })) -] - -describeWithApiKey('App Chat LLM Evaluation', () => { - const TEST_TIMEOUT = 120_000 - if (!hasAnyKey) { - console.warn('No API keys set (OPENAI_API_KEY or ANTHROPIC_API_KEY), skipping tests') - } - - it( - 'test1: creates a simple counter app', - async () => { - const USER_PROMPT = `Create a counter app with increment/decrement buttons` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - undefined, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`App files: ${appPaths.join(', ')}`) - - expect(true).toBe(true) - }, - TEST_TIMEOUT - ) - - it( - 'test2: modifies existing counter app to add reset button', - async () => { - const { 
initialFrontend, initialBackend } = await loadAppFixtureForEval( - join(__dirname, 'initial', 'test1_counter_app') - ) - - const USER_PROMPT = `Add a reset button that sets the counter back to 0` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - initialFrontend, - initialBackend - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`App files: ${appPaths.join(', ')}`) - - expect(true).toBe(true) - }, - TEST_TIMEOUT - ) - - // ==================== Shopping Cart Tests ==================== - - it( - 'test3: shopping cart - add quantity selector', - async () => { - const { initialFrontend, initialBackend } = await loadAppFixtureForEval( - join(__dirname, 'initial', 'shopping_cart') - ) - - const USER_PROMPT = `Add a quantity selector (+ and - buttons) to each cart item so users can adjust quantities without removing and re-adding items` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - initialFrontend, - initialBackend - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`App files: ${appPaths.join(', ')}`) - - expect(true).toBe(true) - }, - TEST_TIMEOUT - ) - - it( - 'test4: shopping cart - add discount code', - async () => { - const { initialFrontend, initialBackend } = await loadAppFixtureForEval( - join(__dirname, 'initial', 'shopping_cart') - ) - - const USER_PROMPT = `Add a discount code input field in the cart. 
When the code "SAVE10" is entered, apply a 10% discount to the total` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - initialFrontend, - initialBackend - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`App files: ${appPaths.join(', ')}`) - - expect(true).toBe(true) - }, - TEST_TIMEOUT - ) - - // ==================== File Manager Tests ==================== - - it( - 'test5: file manager - add search bar', - async () => { - const { initialFrontend, initialBackend } = await loadAppFixtureForEval( - join(__dirname, 'initial', 'file_manager') - ) - - const USER_PROMPT = `Add a search bar in the toolbar that filters files and folders by name as the user types` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - initialFrontend, - initialBackend - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`App files: ${appPaths.join(', ')}`) - - expect(true).toBe(true) - }, - TEST_TIMEOUT - ) - - it( - 'test6: file manager - show file details', - async () => { - const { initialFrontend, initialBackend } = await loadAppFixtureForEval( - join(__dirname, 'initial', 'file_manager') - ) - - const USER_PROMPT = `Show file size (formatted as KB/MB) and modified date in the file list for each item` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - initialFrontend, - initialBackend - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results) - 
console.log(`\nResults written to: ${summaryPath}`) - console.log(`App files: ${appPaths.join(', ')}`) - - expect(true).toBe(true) - }, - TEST_TIMEOUT - ) - - it( - 'test7: file manager - add select all checkbox', - async () => { - const { initialFrontend, initialBackend } = await loadAppFixtureForEval( - join(__dirname, 'initial', 'file_manager') - ) - - const USER_PROMPT = `Add a "Select All" checkbox in the file list header and individual checkboxes for each file. Add a "Delete Selected" button that appears when items are selected` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - initialFrontend, - initialBackend - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`App files: ${appPaths.join(', ')}`) - - expect(true).toBe(true) - }, - TEST_TIMEOUT - ) - - // ==================== From-Scratch Creation Tests ==================== - - it( - 'test8: create quiz app from scratch', - async () => { - const USER_PROMPT = `Create a multiple choice quiz app with 5 questions about general knowledge. Show one question at a time with 4 answer options. Track the score and show results at the end with percentage correct.` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - undefined, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`App files: ${appPaths.join(', ')}`) - - expect(true).toBe(true) - }, - TEST_TIMEOUT - ) - - it( - 'test9: create recipe book from scratch', - async () => { - const USER_PROMPT = `Create a recipe book app where users can add recipes with a name, ingredients list, and instructions. 
Include a search bar to filter recipes by name and the ability to delete recipes.` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - undefined, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`App files: ${appPaths.join(', ')}`) - - expect(true).toBe(true) - }, - TEST_TIMEOUT - ) -}) diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalComparison.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalComparison.ts deleted file mode 100644 index e6c795d445..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalComparison.ts +++ /dev/null @@ -1,171 +0,0 @@ -import Anthropic from '@anthropic-ai/sdk' -import type { AppFiles, BackendRunnable } from '../../app/core' -import { BASE_EVALUATOR_RESPONSE_FORMAT } from '../shared' -import type { EvaluationResult } from '../shared' - -/** - * Expected app structure for evaluation. - */ -export interface ExpectedApp { - frontend: Record - backend: Record -} - -/** - * Initial app state for evaluation context. - */ -export interface InitialApp { - frontend: Record - backend: Record -} - -/** - * System prompt for evaluating app generation without a reference expected app. - * Evaluates based on user request fulfillment and appropriate modifications to initial state. - */ -const APP_GENERATION_EVALUATOR_SYSTEM_PROMPT = `You are an expert evaluator for Windmill Raw App definitions. Your task is to evaluate a generated app based on: -1. The original user request/prompt -2. 
The initial app state (if any) - this is what the app looked like before the AI made changes - -## Windmill Raw App Context -- Raw Apps consist of frontend files and backend runnables -- Frontend files are TypeScript/JavaScript files bundled with esbuild (entrypoint: index.tsx) -- Backend runnables can be: inline scripts (TypeScript/Python), workspace scripts, workspace flows, or hub scripts -- Frontend calls backend using \`await backend.(args...)\` -- Each backend runnable has a key (identifier), name (description), type, and configuration - -## Backend Runnable Types -- **inline**: Custom code with \`inlineScript.language\` and \`inlineScript.content\` -- **script**: Workspace script reference with \`path\` -- **flow**: Workspace flow reference with \`path\` -- **hubscript**: Hub script reference with \`path\` - -## Evaluation Criteria -1. **User Request Fulfillment**: Does the generated app address ALL requirements from the user's original prompt? - - Are all requested features implemented? - - Does the frontend UI match the requirements? - - Are the correct backend runnables created? -2. **Appropriate Modifications** (if initial app was provided): - - Were the changes made relevant to the user's request? - - Was existing functionality preserved where appropriate? - - Were only necessary changes made (no unnecessary removals or additions)? -3. **Frontend Structure**: Are the frontend files correctly organized and implemented? - - Is the code valid TypeScript/JavaScript? - - Are components properly structured? - - Are backend calls correctly made? -4. **Backend Structure**: Are the backend runnables correctly configured? - - Do inline scripts have proper main functions? - - Are types and paths correct for non-inline runnables? -5. **Integration**: Does the frontend correctly call the backend? - - Are the runnable keys correctly referenced? - - Are arguments passed correctly? -6. **Code Quality**: Is the code functionally correct and well-structured? 
- -## Important Notes -- Focus on whether the user's request was fulfilled, not on stylistic preferences -- If an initial app was provided, evaluate the appropriateness of the changes made -- For new apps (no initial state), evaluate completeness and correctness -- Extra helper functions or slightly different approaches can still score high if they accomplish the goal - -${BASE_EVALUATOR_RESPONSE_FORMAT}` - -/** - * Evaluates how well a generated app fulfills the user's request, considering any initial app state. - * Uses Anthropic API directly. - */ -export async function evaluateAppGeneration( - userPrompt: string, - generatedApp: AppFiles, - initialApp?: InitialApp -): Promise { - // @ts-ignore - const apiKey = process.env.ANTHROPIC_API_KEY - if (!apiKey) { - return { - success: false, - resemblanceScore: 0, - statement: 'No API key available for evaluation', - error: 'ANTHROPIC_API_KEY not set' - } - } - - const client = new Anthropic({ apiKey }) - - let userMessage = `## User's Original Request -${userPrompt} - -` - - if (initialApp) { - userMessage += `## Initial App State (before AI modifications) -\`\`\`json -${JSON.stringify(initialApp, null, 2)} -\`\`\` - -` - } else { - userMessage += `## Initial App State -No initial app was provided - this is a new app created from scratch. - -` - } - - userMessage += `## Generated App -\`\`\`json -${JSON.stringify(generatedApp, null, 2)} -\`\`\` - -Please evaluate how well the generated app: -1. Fulfills ALL requirements from the user's original request -2. ${initialApp ? 
'Makes appropriate modifications to the initial app state' : 'Implements a complete and correct new app'}` - - try { - const response = await client.messages.create({ - model: 'claude-sonnet-4-5-20250514', - max_tokens: 2048, - system: APP_GENERATION_EVALUATOR_SYSTEM_PROMPT, - messages: [ - { role: 'user', content: userMessage } - ], - temperature: 0 - }) - - const textBlock = response.content.find((block) => block.type === 'text') - const content = textBlock?.text - if (!content) { - return { - success: false, - resemblanceScore: 0, - statement: 'No response from evaluator', - error: 'Empty response from LLM' - } - } - - // Parse JSON response - handle potential markdown code blocks - let jsonContent = content.trim() - if (jsonContent.startsWith('```')) { - jsonContent = jsonContent.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '') - } - - const parsed = JSON.parse(jsonContent) as { - resemblanceScore: number - statement: string - missingRequirements?: string[] - } - - return { - success: true, - resemblanceScore: Math.max(0, Math.min(100, Math.round(parsed.resemblanceScore))), - statement: parsed.statement, - missingRequirements: parsed.missingRequirements ?? [] - } - } catch (err) { - const errorMessage = err instanceof Error ? err.message : String(err) - return { - success: false, - resemblanceScore: 0, - statement: 'Evaluation failed', - error: errorMessage - } - } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalHelpers.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalHelpers.ts deleted file mode 100644 index 4dbc8d58a0..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalHelpers.ts +++ /dev/null @@ -1,147 +0,0 @@ -import type { - AppAIChatHelpers, - AppFiles, - BackendRunnable, - LintResult, - SelectedContext -} from '../../app/core' - -/** - * Creates an empty lint result (no errors or warnings). 
- */ -function createEmptyLintResult(): LintResult { - return { - errorCount: 0, - warningCount: 0, - errors: { frontend: {}, backend: {} }, - warnings: { frontend: {}, backend: {} } - } -} - -/** - * Creates mock AppAIChatHelpers for eval testing. - * Tracks app files state in memory and allows tool functions to modify it. - */ -export function createAppEvalHelpers( - initialFrontend: Record = {}, - initialBackend: Record = {} -) { - // In-memory state - let frontend: Record = { ...initialFrontend } - let backend: Record = { ...initialBackend } - let snapshotId = 0 - const snapshots: Map; backend: Record }> = new Map() - - const helpers: AppAIChatHelpers = { - // Frontend file operations - listFrontendFiles: () => Object.keys(frontend), - - getFrontendFile: (path: string) => frontend[path], - - getFrontendFiles: () => ({ ...frontend }), - - setFrontendFile: (path: string, content: string) => { - frontend[path] = content - // Return mock lint result - in real usage this would validate the file - return createEmptyLintResult() - }, - - deleteFrontendFile: (path: string) => { - delete frontend[path] - }, - - // Backend runnable operations - listBackendRunnables: () => { - return Object.entries(backend).map(([key, runnable]) => ({ - key, - name: runnable.name - })) - }, - - getBackendRunnable: (key: string) => backend[key], - - getBackendRunnables: () => ({ ...backend }), - - setBackendRunnable: async (key: string, runnable: BackendRunnable) => { - backend[key] = runnable - // Return mock lint result - in real usage this would validate the runnable - return createEmptyLintResult() - }, - - deleteBackendRunnable: (key: string) => { - delete backend[key] - }, - - // Combined view - getFiles: (): AppFiles => ({ - frontend: { ...frontend }, - backend: { ...backend } - }), - - getSelectedContext: (): SelectedContext => ({ - type: 'none' - }), - - // Snapshot management - snapshot: () => { - const id = ++snapshotId - snapshots.set(id, { - frontend: { ...frontend }, - 
backend: { ...backend } - }) - return id - }, - - revertToSnapshot: (id: number) => { - const snap = snapshots.get(id) - if (snap) { - frontend = { ...snap.frontend } - backend = { ...snap.backend } - } - }, - - // Linting - lint: () => { - // Return mock lint result - no actual linting in eval - return createEmptyLintResult() - }, - - // Data table operations (mock implementation for testing) - getDatatables: async () => { - // Return empty array for eval testing - no real datatables in test context - return [] - }, - - getAvailableDatatableNames: () => { - // Return empty array for eval testing - no real datatables in test context - return [] - }, - - execDatatableSql: async ( - _datatableName: string, - _sql: string, - _newTable?: { schema: string; name: string } - ) => { - // Return success with empty result for eval testing - return { success: true, result: [] } - }, - - addTableToWhitelist: ( - _datatableName: string, - _schemaName: string, - _tableName: string - ) => { - // No-op for eval testing - tables are not tracked in test context - } - } - - return { - helpers, - getFiles: (): AppFiles => ({ - frontend: { ...frontend }, - backend: { ...backend } - }), - getFrontend: () => ({ ...frontend }), - getBackend: () => ({ ...backend }) - } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalRunner.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalRunner.ts deleted file mode 100644 index 2e6a491bce..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalRunner.ts +++ /dev/null @@ -1,177 +0,0 @@ -import type { AppFiles, BackendRunnable, AppAIChatHelpers } from '../../app/core' -import { getAppTools, prepareAppSystemMessage, prepareAppUserMessage } from '../../app/core' -import { createAppEvalHelpers } from './appEvalHelpers' -import { evaluateAppGeneration, type InitialApp } from './appEvalComparison' -import { - runEval, - resolveSystemPrompt, - resolveTools, - resolveModel, - type 
VariantConfig, - type BaseEvalResult, - type EvaluationResult, - type Tool, - type VariantDefaults -} from '../shared' -import { writeAppComparisonResultsToFolders } from './appResultsWriter' -import type { AIProvider } from '$lib/gen/types.gen' - -// Re-export for convenience -export type { InitialApp } from './appEvalComparison' - -/** - * App-specific evaluation result. - */ -export interface AppEvalResult extends BaseEvalResult { - /** Alias for output to maintain API compatibility */ - files: AppFiles -} - -/** - * Options for running an app evaluation. - */ -export interface AppEvalOptions { - initialFrontend?: Record - initialBackend?: Record - model?: string - customSystemPrompt?: string - maxIterations?: number - variant?: VariantConfig - /** Whether to evaluate the generated app with LLM. Default: true. Set to false to skip evaluation. */ - evaluateWithLLM?: boolean - /** AI provider (inferred from model name if omitted) */ - provider?: AIProvider -} - -/** - * App-specific variant defaults. - */ -const appDefaults: VariantDefaults = { - prepareSystemMessage: prepareAppSystemMessage, - tools: getAppTools() as Tool[] -} - -/** - * Runs an app chat evaluation using the shared chat loop (same code path as production). - */ -export async function runAppEval( - userPrompt: string, - apiKey: string, - options?: AppEvalOptions -): Promise { - const { helpers, getFiles } = createAppEvalHelpers( - options?.initialFrontend ?? {}, - options?.initialBackend ?? {} - ) - - // Resolve variant configuration - const variantName = options?.variant?.name ?? 
'baseline' - const systemMessage = resolveSystemPrompt( - options?.variant, - appDefaults, - options?.customSystemPrompt - ) - const { tools } = resolveTools(options?.variant, appDefaults) - const model = resolveModel(options?.variant, options?.model) - - // Build user message - const userMessage = prepareAppUserMessage(userPrompt, helpers.getSelectedContext()) - - // Run the base evaluation - const rawResult = await runEval({ - userPrompt, - systemMessage, - userMessage, - tools, - helpers, - apiKey, - getOutput: getFiles, - options: { - maxIterations: options?.maxIterations, - model, - workspace: 'test-workspace', - provider: options?.provider - } - }) - - // Run LLM evaluation unless explicitly disabled - let evaluationResult: EvaluationResult | undefined - if (options?.evaluateWithLLM !== false) { - const generatedApp = getFiles() - const initialApp: InitialApp | undefined = - options?.initialFrontend || options?.initialBackend - ? { - frontend: options.initialFrontend ?? {}, - backend: options.initialBackend ?? {} - } - : undefined - evaluationResult = await evaluateAppGeneration(userPrompt, generatedApp, initialApp) - } - - return { - ...rawResult, - variantName, - files: rawResult.output, - evaluationResult - } -} - -/** - * Per-variant provider override. - */ -export interface VariantProviderOverride { - provider: AIProvider - apiKey: string -} - -/** - * Runs the same prompt against multiple variants sequentially for comparison. - * Accepts optional per-variant provider/apiKey overrides. - */ -export async function runVariantComparison( - userPrompt: string, - variants: VariantConfig[], - defaultApiKey: string, - baseOptions?: Omit, - providerOverrides?: VariantProviderOverride[] -): Promise { - const results: AppEvalResult[] = await Promise.all( - variants.map(async (variant, i) => { - const override = providerOverrides?.[i] - return await runAppEval(userPrompt, override?.apiKey ?? 
defaultApiKey, { - ...baseOptions, - variant, - provider: override?.provider ?? baseOptions?.provider - }) - }) - ) - return results -} - -/** - * Writes app comparison results to a folder-based structure. - * Each variant gets its own folder with frontend/, backend/, and details.json. - */ -export async function writeAppComparisonResults( - userPrompt: string, - results: AppEvalResult[], - outputDir?: string -): Promise<{ summaryPath: string; appPaths: string[] }> { - // @ts-ignore - const { dirname, join } = await import('path') - // @ts-ignore - const { fileURLToPath } = await import('url') - - const __filename = fileURLToPath(import.meta.url) - const __dirname = dirname(__filename) - - const resultsDir = outputDir ?? join(__dirname, 'results') - - const result = await writeAppComparisonResultsToFolders({ - userPrompt, - results, - outputDir: resultsDir - }) - - return { summaryPath: result.summaryPath, appPaths: result.variantPaths } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appResultsWriter.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/appResultsWriter.ts deleted file mode 100644 index a9b3475e96..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/app/appResultsWriter.ts +++ /dev/null @@ -1,247 +0,0 @@ -import type { AppFiles, BackendRunnable } from '../../app/core' -import type { AppEvalResult } from './appEvalRunner' -import { generateTimestamp } from '../shared' - -/** - * Writes frontend files to a folder, preserving directory structure. 
- * File paths like "/components/Button.tsx" become "frontend/components/Button.tsx" - */ -async function writeFrontendFiles( - frontend: Record, - frontendPath: string -): Promise { - // @ts-ignore - Node.js fs/promises - const { writeFile, mkdir } = await import('fs/promises') - // @ts-ignore - Node.js path - const { join, dirname } = await import('path') - - for (const [filePath, content] of Object.entries(frontend)) { - // Remove leading slash and join with frontend path - const relativePath = filePath.startsWith('/') ? filePath.slice(1) : filePath - const fullPath = join(frontendPath, relativePath) - - // Ensure parent directory exists - await mkdir(dirname(fullPath), { recursive: true }) - - await writeFile(fullPath, content) - } -} - -/** - * Writes backend runnables to a folder structure. - * Each runnable becomes a folder with main.ts/main.py and meta.json - */ -async function writeBackendRunnables( - backend: Record, - backendPath: string -): Promise { - // @ts-ignore - Node.js fs/promises - const { writeFile, mkdir } = await import('fs/promises') - // @ts-ignore - Node.js path - const { join } = await import('path') - - for (const [key, runnable] of Object.entries(backend)) { - const runnablePath = join(backendPath, key) - await mkdir(runnablePath, { recursive: true }) - - // Write meta.json - const meta: { name: string; language?: string; type?: string; path?: string } = { - name: runnable.name - } - - if (runnable.type === 'inline' && runnable.inlineScript) { - meta.language = runnable.inlineScript.language - - // Write main file - const extension = runnable.inlineScript.language === 'python3' ? 
'py' : 'ts' - const mainPath = join(runnablePath, `main.${extension}`) - await writeFile(mainPath, runnable.inlineScript.content) - } else { - // For non-inline runnables, store type and path in meta - meta.type = runnable.type - if (runnable.path) { - meta.path = runnable.path - } - } - - const metaPath = join(runnablePath, 'meta.json') - await writeFile(metaPath, JSON.stringify(meta, null, '\t')) - } -} - -/** - * Writes app files (frontend + backend) to a folder structure. - */ -async function writeAppToFolder(appFiles: AppFiles, folderPath: string): Promise { - // @ts-ignore - Node.js path - const { join } = await import('path') - - if (Object.keys(appFiles.frontend).length > 0) { - await writeFrontendFiles(appFiles.frontend, join(folderPath, 'frontend')) - } - - if (Object.keys(appFiles.backend).length > 0) { - await writeBackendRunnables(appFiles.backend, join(folderPath, 'backend')) - } -} - -/** - * Parameters for writing app comparison results. - */ -export interface WriteAppResultsParams { - userPrompt: string - results: AppEvalResult[] - outputDir: string -} - -/** - * Writes app comparison results to a folder-based structure. - * - * Creates: - * ``` - * results/{timestamp}/ - * ├── summary.md - * └── {variant_name}/ - * ├── details.json # Metadata (toolsCalled, evaluationResult, etc.) 
- * ├── frontend/ # Frontend files - * │ └── index.tsx - * └── backend/ # Backend runnables - * └── myFunction/ - * ├── main.ts - * └── meta.json - * ``` - */ -export async function writeAppComparisonResultsToFolders( - params: WriteAppResultsParams -): Promise<{ summaryPath: string; variantPaths: string[] }> { - // @ts-ignore - Node.js fs/promises - const { writeFile, mkdir } = await import('fs/promises') - // @ts-ignore - Node.js path - const { join } = await import('path') - - const { userPrompt, results, outputDir } = params - const timestamp = generateTimestamp() - - // Ensure results directory exists - await mkdir(outputDir, { recursive: true }) - const resultFolder = join(outputDir, timestamp) - await mkdir(resultFolder, { recursive: true }) - - // Check if any results have evaluation data - const hasEvaluation = results.some((r) => r.evaluationResult) - - // Build summary markdown - const summaryLines: string[] = [ - `# App Eval Results - ${timestamp}`, - '', - '## User Prompt', - '```', - userPrompt.trim(), - '```', - '', - '## Results', - '' - ] - - // Add results table header based on whether evaluation data exists - if (hasEvaluation) { - summaryLines.push( - '| Variant | Success | Total Tokens | Tool Calls | Iterations | Resemblance Score |' - ) - summaryLines.push( - '|---------|---------|--------------|------------|------------|-------------------|' - ) - } else { - summaryLines.push('| Variant | Success | Total Tokens | Tool Calls | Iterations |') - summaryLines.push('|---------|---------|--------------|------------|------------|') - } - - for (const result of results) { - const baseRow = `| ${result.variantName} | ${result.success} | ${result.tokenUsage.total} | ${result.toolsCalled.length} | ${result.iterations}` - if (hasEvaluation) { - const score = result.evaluationResult?.resemblanceScore ?? 
'N/A' - summaryLines.push(`${baseRow} | ${score} |`) - } else { - summaryLines.push(`${baseRow} |`) - } - } - - // Add evaluation details section if available - if (hasEvaluation) { - summaryLines.push('') - summaryLines.push('## Evaluation Details') - summaryLines.push('') - for (const result of results) { - if (result.evaluationResult) { - summaryLines.push(`### ${result.variantName}`) - summaryLines.push('') - summaryLines.push(`**Score:** ${result.evaluationResult.resemblanceScore}/100`) - summaryLines.push('') - summaryLines.push(`**Statement:** ${result.evaluationResult.statement}`) - summaryLines.push('') - if ( - result.evaluationResult.missingRequirements && - result.evaluationResult.missingRequirements.length > 0 - ) { - summaryLines.push('**Missing Requirements:**') - for (const req of result.evaluationResult.missingRequirements) { - summaryLines.push(`- ${req}`) - } - summaryLines.push('') - } - if (result.evaluationResult.error) { - summaryLines.push(`**Error:** ${result.evaluationResult.error}`) - summaryLines.push('') - } - } - } - } - - // Add errors section for failed variants - const failedResults = results.filter((r) => !r.success && r.error) - if (failedResults.length > 0) { - summaryLines.push('') - summaryLines.push('## Errors') - summaryLines.push('') - for (const result of failedResults) { - summaryLines.push(`### ${result.variantName}`) - summaryLines.push('') - summaryLines.push('```') - summaryLines.push(result.error!) - summaryLines.push('```') - summaryLines.push('') - } - } - - const variantPaths: string[] = [] - - // Write each variant to its own folder - for (const result of results) { - const variantFolder = join(resultFolder, result.variantName) - await mkdir(variantFolder, { recursive: true }) - variantPaths.push(variantFolder) - - // Write details.json (metadata without app files) - const details = { - variantName: result.variantName, - success: result.success, - error: result.error ?? 
null, - evaluationResult: result.evaluationResult ?? null, - toolsCalled: result.toolsCalled, - toolCallDetails: result.toolCallDetails, - tokenUsage: result.tokenUsage, - iterations: result.iterations, - messages: result.messages - } - await writeFile(join(variantFolder, 'details.json'), JSON.stringify(details, null, '\t')) - - // Write app files to frontend/ and backend/ folders - await writeAppToFolder(result.files, variantFolder) - } - - // Write summary markdown file - const summaryPath = join(resultFolder, 'summary.md') - await writeFile(summaryPath, summaryLines.join('\n')) - - return { summaryPath, variantPaths } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/baseline.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/variants/baseline.ts deleted file mode 100644 index 558424e972..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/baseline.ts +++ /dev/null @@ -1,12 +0,0 @@ -import type { VariantConfig } from '../../shared' - -/** - * Baseline variant - uses the production system prompt and all tools. - * This is the default configuration that matches the actual app chat implementation. 
- */ -export const BASELINE_VARIANT: VariantConfig = { - name: 'baseline', - description: 'Production configuration with default system prompt and all tools', - systemPrompt: { type: 'default' }, - tools: { type: 'default' } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/index.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/variants/index.ts deleted file mode 100644 index b49c56123d..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/index.ts +++ /dev/null @@ -1,6 +0,0 @@ -// Re-export all variant configurations -export { BASELINE_VARIANT } from './baseline' -export { STREAMLINED_VARIANT } from './streamlined' - -// Re-export types for convenience -export type { VariantConfig } from '../../shared' diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/streamlined.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/variants/streamlined.ts deleted file mode 100644 index 515db0a756..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/streamlined.ts +++ /dev/null @@ -1,144 +0,0 @@ -import type { VariantConfig } from '../../shared' -import type { Tool } from '../../shared' -import type { AppAIChatHelpers } from '../../../app/core' -import { getAppTools } from '../../../app/core' - -// Tool names to remove (batch-fetch tools) -const TOOLS_TO_REMOVE = ['get_files', 'get_frontend_files', 'get_backend_runnables'] - -/** - * Build the streamlined tools by filtering out batch-fetch tools. - */ -function buildStreamlinedTools(): Tool[] { - const defaultTools = getAppTools() - return defaultTools.filter((t) => !TOOLS_TO_REMOVE.includes(t.def.function.name)) -} - -/** - * Streamlined system prompt - simplified instructions focused on: - * 1. Reading relevant files first - * 2. Making changes with appropriate tools - * 3. 
Using lint at the end to fix errors - */ -const STREAMLINED_SYSTEM_PROMPT = `You are a helpful assistant that creates and edits apps on the Windmill platform. Apps are defined as a collection of files that contains both the frontend and the backend. - -## App Structure - -### Frontend -- The frontend is bundled using esbuild with entrypoint \`index.tsx\` -- Frontend files are managed separately from backend runnables -- The \`wmill.d.ts\` file is generated automatically from the backend runnables shape - -### Backend -Backend runnables can be of different types: -- **inline**: Custom code written directly in the app (TypeScript/Bun or Python) -- **script**: Reference to a workspace script by path -- **flow**: Reference to a workspace flow by path -- **hubscript**: Reference to a hub script by path - -Frontend calls backend using \`await backend.(args...)\`. - -For inline scripts, the code must have a \`main\` function as its entrypoint. - -## Available Tools - -### File Management -- \`list_frontend_files()\`: List all frontend file paths (use this first to see what exists) -- \`get_frontend_file(path)\`: Get content of a specific frontend file -- \`set_frontend_file(path, content)\`: Create or update a frontend file. Returns lint diagnostics. -- \`delete_frontend_file(path)\`: Delete a frontend file -- \`list_backend_runnables()\`: List all backend runnable keys and names (use this first to see what exists) -- \`get_backend_runnable(key)\`: Get full configuration of a specific backend runnable -- \`set_backend_runnable(key, name, type, ...)\`: Create or update a backend runnable. Returns lint diagnostics. -- \`delete_backend_runnable(key)\`: Delete a backend runnable - -### Linting -- \`lint()\`: Lint all files. Returns errors/warnings grouped by frontend/backend. 
- -### Discovery -- \`search_workspace(query, type)\`: Search workspace scripts and flows -- \`search_hub_scripts(query)\`: Search hub scripts - -## Backend Runnable Configuration - -When creating a backend runnable with \`set_backend_runnable\`: - -1. **For inline scripts** (type: "inline"): - \`\`\` - { - key: "myFunction", - name: "Does something useful", - type: "inline", - inlineScript: { - language: "bun", // or "python3" - content: "export async function main(arg1: string) { return result; }" - } - } - \`\`\` - -2. **For workspace scripts** (type: "script"): - \`\`\` - { - key: "sendEmail", - name: "Send email via SMTP", - type: "script", - path: "f/folder/send_email", - staticInputs: { smtp_server: "mail.example.com" } // optional pre-filled inputs - } - \`\`\` - -3. **For workspace flows** (type: "flow"): - \`\`\` - { - key: "processOrder", - name: "Process customer order", - type: "flow", - path: "f/folder/process_order_flow" - } - \`\`\` - -4. **For hub scripts** (type: "hubscript"): - \`\`\` - { - key: "slackMessage", - name: "Send Slack message", - type: "hubscript", - path: "hub/123/slack/send_message" - } - \`\`\` - -## Instructions - -1. Start by reading relevant files to understand the current state -2. Make changes using the appropriate tools -3. Use \`lint()\` at the end to check for and fix any errors - -Windmill expects all backend runnable calls to use an object parameter structure. For example for: -\`\`\`typescript -export async function main(arg1: string, arg2: string, arg3: number, arg4: { field1: string, field2: number }) { - ... 
-} -\`\`\` - -You would call it like this: -\`\`\`typescript -await backend.myFunction({ arg1: 'value1', arg2: 'value2', arg3: 3, arg4: { field1: 'value1', field2: 2 } }) -\`\`\` -If the runnable has no parameters, you can call it without an object: -\`\`\`typescript -await backend.myFunction() -\`\`\` - -When you are using the windmill-client, do not forget that as id for variables or resources, those are path that are of the form 'u//' or 'f//'. -` - -/** - * Streamlined variant - removes batch-fetch tools and uses simplified instructions. - * Forces the model to read individual files before making changes. - */ -export const STREAMLINED_VARIANT: VariantConfig = { - name: 'streamlined', - description: 'No batch tools - forces reading individual files before making changes', - systemPrompt: { type: 'custom', content: STREAMLINED_SYSTEM_PROMPT }, - tools: { type: 'custom', tools: buildStreamlinedTools() } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowChat.eval.test.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/flowChat.eval.test.ts deleted file mode 100644 index de9b8e5f43..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowChat.eval.test.ts +++ /dev/null @@ -1,449 +0,0 @@ -import { describe, it, expect } from 'vitest' -import { runVariantComparison, writeFlowComparisonResults, type ExpectedFlow } from './flowEvalRunner' -import { BASELINE_VARIANT, MINIMAL_SINGLE_TOOL_VARIANT } from './variants' -// @ts-ignore - JSON import -import expectedTest1 from './expected/test1.json' -// @ts-ignore - JSON import -import expectedTest2 from './expected/test2.json' -// @ts-ignore - JSON import -import expectedTest3 from './expected/test3.json' -// @ts-ignore - JSON import -import expectedTest4 from './expected/test4.json' -// @ts-ignore - JSON import -import expectedTest5 from './expected/test5_modify_simple.json' -// @ts-ignore - JSON import -import expectedTest6 from './expected/test6_modify_medium.json' 
-// @ts-ignore - JSON import -import expectedTest7 from './expected/test7_modify_complex.json' -// @ts-ignore - JSON import -import initialTest5 from './initial/test5_initial.json' -// @ts-ignore - JSON import -import initialTest6 from './initial/test6_initial.json' -// @ts-ignore - JSON import -import initialTest7 from './initial/test7_initial.json' -import type { FlowModule } from '$lib/gen' -import type { AIProvider } from '$lib/gen/types.gen' - -// Get API keys from environment - tests will be skipped if none are set -// @ts-ignore -const OPENAI_API_KEY = process.env.OPENAI_API_KEY -// @ts-ignore -const ANTHROPIC_API_KEY = process.env.ANTHROPIC_API_KEY - -const hasAnyKey = OPENAI_API_KEY || ANTHROPIC_API_KEY -const describeWithApiKey = hasAnyKey ? describe : describe.skip - -// Build model variants based on available keys -interface ModelVariant { - model: string - provider: AIProvider - apiKey: string -} - -const MODEL_VARIANTS: ModelVariant[] = [ - ...(OPENAI_API_KEY - ? [{ model: 'gpt-4o', provider: 'openai' as AIProvider, apiKey: OPENAI_API_KEY }] - : []), - ...(ANTHROPIC_API_KEY - ? 
[ - { - model: 'claude-haiku-4-5-20241022', - provider: 'anthropic' as AIProvider, - apiKey: ANTHROPIC_API_KEY - } - ] - : []) -] - -const VARIANTS = [ - ...MODEL_VARIANTS.map((mv) => ({ - ...BASELINE_VARIANT, - model: mv.model, - name: `baseline-${mv.provider}-${mv.model}`, - _provider: mv.provider, - _apiKey: mv.apiKey - })), - ...MODEL_VARIANTS.map((mv) => ({ - ...MINIMAL_SINGLE_TOOL_VARIANT, - model: mv.model, - name: `minimal-single-tool-${mv.provider}-${mv.model}`, - _provider: mv.provider, - _apiKey: mv.apiKey - })) -] - -describeWithApiKey('Flow Chat LLM Evaluation', () => { - const TEST_TIMEOUT = 120_000 - if (!hasAnyKey) { - console.warn('No API keys set (OPENAI_API_KEY or ANTHROPIC_API_KEY), skipping tests') - } - - it( - 'test1: user role-based actions with loop and branches', - async () => { - const USER_PROMPT = ` -THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES - -STEP 1: Fetch mock users from api -STEP 2: Filter only active users: -STEP 3: Loop on all users -STEP 4: Do branches based on user's role, do different action based on that. 
Roles are admin, user, moderator -STEP 5: Return action taken for each user -` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - expectedFlow: expectedTest1 as ExpectedFlow - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - // Write results to files - const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`Flow files: ${flowPaths.join(', ')}`) - - // Assert all variants succeeded - for (const result of results) { - expect(true).toBe(true) - - // Log evaluation results - if (result.evaluationResult) { - console.log( - `[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100` - ) - console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`) - if ( - result.evaluationResult.missingRequirements && - result.evaluationResult.missingRequirements.length > 0 - ) { - console.log( - `[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}` - ) - } - } - } - }, - TEST_TIMEOUT - ) - - it( - 'test2: e-commerce order processing with inventory check and branching', - async () => { - const USER_PROMPT = ` -THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES - -STEP 1: Receive order data from input (order has items array with name/price/quantity, customer_email, shipping_address) -STEP 2: Validate order - check all items have valid price > 0 and quantity > 0, return validation result -STEP 3: Calculate order total with 8% tax rate -STEP 4: Check inventory for each item (loop through items, return mock availability) -STEP 5: Branch based on inventory - if all items available, create shipment record; otherwise create backorder record -STEP 6: Send confirmation (mock email to customer_email) -STEP 7: Return final order summary with status -` - const 
results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - expectedFlow: expectedTest2 as ExpectedFlow - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`Flow files: ${flowPaths.join(', ')}`) - - for (const result of results) { - expect(true).toBe(true) - - if (result.evaluationResult) { - console.log( - `[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100` - ) - console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`) - if ( - result.evaluationResult.missingRequirements && - result.evaluationResult.missingRequirements.length > 0 - ) { - console.log( - `[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}` - ) - } - } - } - }, - TEST_TIMEOUT - ) - - it( - 'test3: data pipeline with parallel processing and quality-based routing', - async () => { - const USER_PROMPT = ` -THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES - -STEP 1: Fetch list of data sources from configuration (return mock array of 3 source objects with id and url) -STEP 2: For each data source in parallel: - - Fetch raw data from the source (mock fetch returning sample records) - - Transform/clean the data (filter out invalid entries) - - Validate the transformed data (return validation score 0-100) -STEP 3: Aggregate all validated data into single dataset with combined records -STEP 4: Calculate overall data quality score (average of all validation scores) -STEP 5: Branch based on quality score: - - If score >= 90: Store in primary database and return success - - If score >= 70 and < 90: Store in secondary database with warning flag - - If score < 70: Store in quarantine and send alert -STEP 6: Return processing report 
with statistics (total records, quality score, destination) -` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - expectedFlow: expectedTest3 as ExpectedFlow - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`Flow files: ${flowPaths.join(', ')}`) - - for (const result of results) { - expect(true).toBe(true) - - if (result.evaluationResult) { - console.log( - `[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100` - ) - console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`) - if ( - result.evaluationResult.missingRequirements && - result.evaluationResult.missingRequirements.length > 0 - ) { - console.log( - `[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}` - ) - } - } - } - }, - TEST_TIMEOUT - ) - - it( - 'test4: AI agent with tools for customer support', - async () => { - const USER_PROMPT = ` -THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES - -Create a customer support flow with an AI agent: - -STEP 1: Receive customer query from input (customer_id string, query_text string) -STEP 2: Fetch customer profile and order history (mock data based on customer_id) -STEP 3: Use an AI agent to handle the customer query. The agent should have access to these tools: - - lookup_order: Takes order_id, returns order details (mock data) - - check_refund_eligibility: Takes order_id, returns eligibility status and reason - - create_support_ticket: Takes description and priority (low/medium/high), returns ticket_id - - search_faq: Takes search_query, returns relevant FAQ answers - The agent should use the customer profile context and respond helpfully. 
-STEP 4: Log the interaction to audit trail (customer_id, query, response summary) -STEP 5: Return the agent's response and any actions taken -` - const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - expectedFlow: expectedTest4 as ExpectedFlow - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`Flow files: ${flowPaths.join(', ')}`) - - for (const result of results) { - expect(true).toBe(true) - - if (result.evaluationResult) { - console.log( - `[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100` - ) - console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`) - if ( - result.evaluationResult.missingRequirements && - result.evaluationResult.missingRequirements.length > 0 - ) { - console.log( - `[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}` - ) - } - } - } - }, - TEST_TIMEOUT - ) - - // ==================== MODIFICATION TESTS ==================== - // These tests evaluate the LLM's ability to modify existing flows - - it( - 'test5: simple modification - add validation step to existing flow', - async () => { - const USER_PROMPT = ` -THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES - -Modify this existing flow to add error handling: -- Add a new step after process_data called "validate_data" to validate the processed data -- The validation step should check if the data array is not empty -- If validation fails (empty array), it should return an error object with message "No data to save" -- If validation passes, return the data for the next step -- Update save_results to handle the validation result appropriately -` - const results = await runVariantComparison( - 
USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - initialModules: initialTest5.value.modules as FlowModule[], - initialSchema: initialTest5.schema, - expectedFlow: expectedTest5 as ExpectedFlow - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`Flow files: ${flowPaths.join(', ')}`) - - for (const result of results) { - expect(true).toBe(true) - - if (result.evaluationResult) { - console.log( - `[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100` - ) - console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`) - if ( - result.evaluationResult.missingRequirements && - result.evaluationResult.missingRequirements.length > 0 - ) { - console.log( - `[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}` - ) - } - } - } - }, - TEST_TIMEOUT - ) - - it( - 'test6: medium modification - add branching inside existing loop', - async () => { - const USER_PROMPT = ` -THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES - -Modify the order processing loop to handle different order types: -- Inside the loop_orders, replace the simple process_order step with branching based on order.type -- For type "express": add a step called handle_express that marks as priority and calculates express shipping cost ($15.99) -- For type "standard": add a step called handle_standard that calculates standard shipping cost ($5.99) -- For type "pickup": add a step called handle_pickup that marks as no shipping required (cost $0) -- Move the original process_order step to the default branch for unknown order types -- Each branch step should return the orderId, shipping cost, and shipping type -` - const results = await runVariantComparison( - USER_PROMPT, - 
VARIANTS, - VARIANTS[0]._apiKey, - { - initialModules: initialTest6.value.modules as FlowModule[], - initialSchema: initialTest6.schema, - expectedFlow: expectedTest6 as ExpectedFlow - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`Flow files: ${flowPaths.join(', ')}`) - - for (const result of results) { - expect(true).toBe(true) - - if (result.evaluationResult) { - console.log( - `[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100` - ) - console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`) - if ( - result.evaluationResult.missingRequirements && - result.evaluationResult.missingRequirements.length > 0 - ) { - console.log( - `[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}` - ) - } - } - } - }, - TEST_TIMEOUT - ) - - it( - 'test7: complex modification - refactor sequential to parallel execution', - async () => { - const USER_PROMPT = ` -THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES - -Refactor this flow for better performance by parallelizing the enrichment steps: -- The three enrichment steps (enrich_price, enrich_inventory, enrich_reviews) currently run sequentially -- Wrap them in a parallel branch (branchall) called "parallel_enrichment" so they run concurrently -- Each enrichment step should include basic error handling with try/catch that returns a fallback value if it fails -- Update the combine_data step to receive results from the parallel branch (results.parallel_enrichment returns an array of branch results) -- The combine_data step should check if any enrichment used a fallback value and set a hasFallbacks flag -- Keep get_item as the first step and return_result as the last step unchanged -` 
- const results = await runVariantComparison( - USER_PROMPT, - VARIANTS, - VARIANTS[0]._apiKey, - { - initialModules: initialTest7.value.modules as FlowModule[], - initialSchema: initialTest7.schema, - expectedFlow: expectedTest7 as ExpectedFlow - }, - VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })) - ) - - const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results) - console.log(`\nResults written to: ${summaryPath}`) - console.log(`Flow files: ${flowPaths.join(', ')}`) - - for (const result of results) { - expect(true).toBe(true) - - if (result.evaluationResult) { - console.log( - `[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100` - ) - console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`) - if ( - result.evaluationResult.missingRequirements && - result.evaluationResult.missingRequirements.length > 0 - ) { - console.log( - `[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}` - ) - } - } - } - }, - TEST_TIMEOUT - ) -}) diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalComparison.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalComparison.ts deleted file mode 100644 index 4c2b41d577..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalComparison.ts +++ /dev/null @@ -1,68 +0,0 @@ -import type { FlowModule } from '$lib/gen' -import { evaluateWithLLM, BASE_EVALUATOR_RESPONSE_FORMAT } from '../shared' -import type { EvaluationResult } from '../shared' - -/** - * Expected flow structure for evaluation. - */ -export interface ExpectedFlow { - summary?: string - value: { - modules: FlowModule[] - } - schema?: Record -} - -/** - * Flow-specific evaluator system prompt. - */ -const FLOW_EVALUATOR_SYSTEM_PROMPT = `You are an expert evaluator for Windmill flow definitions. Your task is to evaluate a generated flow against: -1. 
The original user request/prompt -2. An expected reference flow - -## Windmill Flow Context -- Flows consist of modules (steps) that execute sequentially -- Module types include: rawscript, forloopflow, branchone, branchall, script, flow, aiagent -- Each module has an id, value (containing type and config), and may have input_transforms -- input_transforms connect modules using expressions like "results.previous_step". Valid input_transforms are: static, javascript. Valid variables in javascript expressions are: results, flow_input, flow_input.iter.value (for forloopflow), flow_input.iter.index (for forloopflow). -- forloopflow contains nested modules that execute per iteration with access to flow_input.iter.value -- branchone executes first matching branch, branchall executes all matching branches -- Branches have conditional expressions (expr) that determine execution -- aiagent modules contain tools array with tool definitions - -## Evaluation Criteria -1. **User Request Fulfillment**: Does the generated flow address ALL requirements from the user's original prompt? - - Are all requested steps present? - - Are the requested features implemented (loops, branches, specific logic)? - - Does the schema match what the user requested for inputs? -2. **Structure**: Are the module types and nesting structure appropriate for the task? -3. **Logic**: Does the flow accomplish the intended logical task? -4. **Connections**: Are input_transforms connecting data correctly between steps? -5. **Completeness**: Are all required steps present with no major omissions? -6. **Code Quality**: Is the code functionally correct (exact syntax doesn't need to match)? 
- -## Important Notes -- Minor differences in variable names, code formatting, or exact wording are acceptable -- Focus on functional equivalence, not character-by-character matching -- The generated flow should achieve the same outcome as described in the user request -- Extra helper steps or slightly different approaches can still score high if they accomplish the goal -- If the user requested specific module types (like aiagent), verify they are used correctly - -${BASE_EVALUATOR_RESPONSE_FORMAT}` - -/** - * Evaluates how well a generated flow matches an expected flow and user request using an LLM. - * Returns a resemblance score (0-100), a qualitative statement, and any missing requirements. - */ -export async function evaluateFlowComparison( - generatedFlow: ExpectedFlow, - expectedFlow: ExpectedFlow, - userPrompt: string -): Promise { - return evaluateWithLLM({ - userPrompt, - generatedOutput: generatedFlow, - expectedOutput: expectedFlow, - evaluatorSystemPrompt: FLOW_EVALUATOR_SYSTEM_PROMPT - }) -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalHelpers.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalHelpers.ts deleted file mode 100644 index 39d6ebd5e1..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalHelpers.ts +++ /dev/null @@ -1,104 +0,0 @@ -import type { FlowAIChatHelpers } from '../../flow/core' -import type { FlowModule, InputTransform } from '$lib/gen' -import type { ExtendedOpenFlow } from '$lib/components/flows/types' -import { findModuleById } from '../../shared' -import { inlineScriptStore, restoreInlineScriptReferences } from '../../flow/inlineScriptsUtils' - -/** - * Creates mock FlowAIChatHelpers for eval testing. - * Tracks flow state in memory and allows tool functions to modify it. 
- */ -export function createFlowEvalHelpers( - initialModules: FlowModule[] = [], - initialSchema?: Record -) { - let flow: ExtendedOpenFlow = { - value: { modules: structuredClone(initialModules) }, - summary: '', - schema: initialSchema ?? { - $schema: 'https://json-schema.org/draft/2020-12/schema', - properties: {}, - required: [], - type: 'object' - } - } - - const helpers: FlowAIChatHelpers = { - getFlowAndSelectedId: () => ({ flow, selectedId: '' }), - - getModules: (id?: string) => { - if (!id) return flow.value.modules - const module = findModuleById(flow.value.modules, id) - return module ? [module] : [] - }, - - setSnapshot: () => { - // No-op for eval - we don't need snapshot tracking - }, - - revertToSnapshot: () => { - // No-op for eval - }, - - setCode: async (id: string, code: string) => { - const module = findModuleById(flow.value.modules, id) - if (module && module.value.type === 'rawscript') { - module.value.content = code - } - // Keep store coherent for subsequent set_flow_json calls with references - inlineScriptStore.set(id, code) - }, - - setFlowJson: async ( - modules: FlowModule[] | undefined, - schema: Record | undefined - ) => { - if (modules) { - // Restore inline script references back to full content - const restoredModules = restoreInlineScriptReferences(modules) - flow.value.modules = restoredModules - } - - // Update schema if provided - if (schema !== undefined) { - flow.schema = schema - } - }, - - getFlowInputsSchema: async () => flow.schema ?? 
{}, - - updateExprsToSet: (_id: string, _inputTransforms: Record) => { - // No-op for eval - UI-only functionality - }, - - acceptAllModuleActions: () => { - // No-op for eval - }, - - rejectAllModuleActions: () => { - // No-op for eval - }, - - hasPendingChanges: () => false, - - selectStep: (_id: string) => { - // No-op for eval - }, - - testFlow: async () => { - // Return mock job ID - we don't actually run flows in eval - return 'mock-job-id-' + Date.now() - }, - - getLintErrors: async () => { - // Return empty lint result for eval - return { errorCount: 0, warningCount: 0, errors: [], warnings: [] } - } - } - - return { - helpers, - getFlow: () => flow, - getModules: () => flow.value.modules - } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalRunner.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalRunner.ts deleted file mode 100644 index f3c976950d..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalRunner.ts +++ /dev/null @@ -1,186 +0,0 @@ -import type { FlowModule } from '$lib/gen' -import type { AIProvider } from '$lib/gen/types.gen' -import type { ExtendedOpenFlow } from '$lib/components/flows/types' -import { flowTools, prepareFlowSystemMessage, prepareFlowUserMessage, type FlowAIChatHelpers } from '../../flow/core' -import { createFlowEvalHelpers } from './flowEvalHelpers' -import { evaluateFlowComparison, type ExpectedFlow } from './flowEvalComparison' -import { - runEval, - resolveSystemPrompt, - resolveTools, - resolveModel, - writeComparisonResults, - type VariantConfig, - type BaseEvalResult, - type EvaluationResult, - type Tool, - type VariantDefaults -} from '../shared' - -// Re-export for convenience -export type { ExpectedFlow } from './flowEvalComparison' - -/** - * Flow-specific evaluation result. 
- */ -export interface FlowEvalResult extends BaseEvalResult { - /** Alias for output to maintain API compatibility */ - flow: ExtendedOpenFlow -} - -/** - * Options for running a flow evaluation. - */ -export interface FlowEvalOptions { - initialModules?: FlowModule[] - initialSchema?: Record - model?: string - customSystemPrompt?: string - maxIterations?: number - variant?: VariantConfig - expectedFlow?: ExpectedFlow - /** AI provider (inferred from model name if omitted) */ - provider?: AIProvider -} - -/** - * Flow-specific variant defaults. - */ -const flowDefaults: VariantDefaults = { - prepareSystemMessage: prepareFlowSystemMessage, - tools: flowTools as Tool[] -} - -/** - * Runs a flow chat evaluation using the shared chat loop (same code path as production). - */ -export async function runFlowEval( - userPrompt: string, - apiKey: string, - options?: FlowEvalOptions -): Promise { - const { helpers, getFlow } = createFlowEvalHelpers( - options?.initialModules ?? [], - options?.initialSchema - ) - - // Resolve variant configuration - const variantName = options?.variant?.name ?? 
'baseline' - const systemMessage = resolveSystemPrompt(options?.variant, flowDefaults, options?.customSystemPrompt) - const { tools } = resolveTools(options?.variant, flowDefaults) - const model = resolveModel(options?.variant, options?.model) - - // Build user message - const userMessage = prepareFlowUserMessage(userPrompt, helpers.getFlowAndSelectedId(), []) - - // Run the base evaluation - const rawResult = await runEval({ - userPrompt, - systemMessage, - userMessage, - tools, - helpers, - apiKey, - getOutput: getFlow, - options: { - maxIterations: options?.maxIterations, - model, - workspace: 'test-workspace', - provider: options?.provider - } - }) - - // Run evaluation if expected flow is provided - let evaluationResult: EvaluationResult | undefined - if (options?.expectedFlow) { - const generatedFlow = getFlow() - evaluationResult = await evaluateFlowComparison( - { - summary: generatedFlow.summary, - value: { modules: generatedFlow.value.modules }, - schema: generatedFlow.schema - }, - options.expectedFlow, - userPrompt - ) - } - - return { - ...rawResult, - variantName, - flow: rawResult.output, - evaluationResult - } -} - -/** - * Per-variant provider override. - */ -export interface VariantProviderOverride { - provider: AIProvider - apiKey: string -} - -/** - * Runs the same prompt against multiple variants sequentially for comparison. - * Accepts optional per-variant provider/apiKey overrides. - */ -export async function runVariantComparison( - userPrompt: string, - variants: VariantConfig[], - defaultApiKey: string, - baseOptions?: Omit, - providerOverrides?: VariantProviderOverride[] -): Promise { - const results: FlowEvalResult[] = await Promise.all( - variants.map(async (variant, i) => { - const override = providerOverrides?.[i] - return await runFlowEval(userPrompt, override?.apiKey ?? defaultApiKey, { - ...baseOptions, - variant, - provider: override?.provider ?? 
baseOptions?.provider - }) - }) - ) - return results -} - -/** - * Writes flow comparison results to files. - */ -export async function writeFlowComparisonResults( - userPrompt: string, - results: FlowEvalResult[], - outputDir?: string -): Promise<{ summaryPath: string; flowPaths: string[] }> { - // @ts-ignore - const { dirname, join } = await import('path') - // @ts-ignore - const { fileURLToPath } = await import('url') - - const __filename = fileURLToPath(import.meta.url) - const __dirname = dirname(__filename) - - const resultsDir = outputDir ?? join(__dirname, 'results') - - const result = await writeComparisonResults({ - userPrompt, - results, - outputDir: resultsDir, - formatOutput: (flow: ExtendedOpenFlow) => ({ - summary: flow.summary ?? '', - value: { - modules: flow.value.modules - }, - schema: flow.schema ?? { - $schema: 'https://json-schema.org/draft/2020-12/schema', - properties: {}, - required: [], - type: 'object' - } - }), - outputLabel: 'flow' - }) - - return { summaryPath: result.summaryPath, flowPaths: result.outputPaths } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/baseline.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/baseline.ts deleted file mode 100644 index bd20f4f8c2..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/baseline.ts +++ /dev/null @@ -1,12 +0,0 @@ -import type { VariantConfig } from '../../shared' - -/** - * Baseline variant - uses the production system prompt and all tools. - * This is the default configuration that matches the actual flow chat implementation. 
- */ -export const BASELINE_VARIANT: VariantConfig = { - name: 'baseline', - description: 'Production configuration with default system prompt and all tools', - systemPrompt: { type: 'default' }, - tools: { type: 'default' } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/index.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/index.ts deleted file mode 100644 index 914db4a398..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/index.ts +++ /dev/null @@ -1,6 +0,0 @@ -// Re-export all variant configurations -export { BASELINE_VARIANT } from './baseline' -export { MINIMAL_SINGLE_TOOL_VARIANT, setFlowJsonTool } from './minimal-single-tool' - -// Re-export types for convenience -export type { VariantConfig } from '../../shared' diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/minimal-single-tool.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/minimal-single-tool.ts deleted file mode 100644 index a07b24b691..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/minimal-single-tool.ts +++ /dev/null @@ -1,402 +0,0 @@ -import type { VariantConfig } from '../../shared' -import type { Tool } from '../../shared' -import type { FlowAIChatHelpers } from '../../../flow/core' -import { flowTools } from '../../../flow/core' -import openFlowSchema from '../../../flow/openFlow.json' - -/** - * IDs of the granular flow editing tools that should be replaced by set_flow_json. - */ -const FLOW_EDITING_TOOL_NAMES = [ - 'add_module', - 'remove_module', - 'remove_branch', - 'modify_module', - 'set_flow_schema' -] - -/** - * A single tool that sets the entire flow JSON at once. - * This replaces the granular flow editing tools (add_module, remove_module, modify_module, etc.) 
- */ -export const setFlowJsonTool: Tool = { - def: { - type: 'function', - function: { - name: 'set_flow_json', - description: - 'Set the entire flow by providing the complete flow object. This replaces all existing modules and schema.', - strict: false, - parameters: { - type: 'object', - properties: { - modules: { - type: 'array', - description: 'Array of flow modules', - items: { - type: 'object' - } - }, - schema: { - type: 'object', - description: - 'Flow input schema (JSON Schema format) defining parameters the flow accepts' - } - }, - required: ['modules'] - } - } - }, - fn: async ({ args, helpers }) => { - const { modules, schema } = args as { modules: any[]; schema?: Record } - await helpers.setFlowJson(modules, schema) - return `Flow updated with ${modules.length} module(s): [${modules.map((m: any) => m.id).join(', ')}]` - } -} - -/** - * Build the tools array for the minimal-single-tool variant. - * Keeps all utility tools (search, resource type, test run, db schema, code generation instructions) - * but replaces all flow editing tools with a single set_flow_json tool. - */ -function buildMinimalSingleToolTools(): Tool[] { - // Get all production tools except flow editing tools - const utilityTools = (flowTools as Tool[]).filter( - (t) => !FLOW_EDITING_TOOL_NAMES.includes(t.def.function.name) - ) - - return [...utilityTools, setFlowJsonTool] -} - -const MINIMAL_SINGLE_TOOL_SYSTEM_PROMPT = `You are a helpful assistant that creates and edits workflows on the Windmill platform. 
- -## IMPORTANT RULES - -**Reserved IDs - Do NOT use these module IDs:** -- \`failure\` - Reserved for failure handler module -- \`preprocessor\` - Reserved for preprocessor module -- \`Input\` - Reserved for flow input reference - -## Tool Selection Guide - -**Flow Modification:** -- **Create or modify the entire flow** → \`set_flow_json\` (provide complete modules array and optional schema) - -**Code & Scripts:** -- **Get language-specific coding instructions** → \`get_instructions_for_code_generation\` (call BEFORE writing code) -- **Find workspace scripts and flows** → \`search_workspace\` -- **Get details of a specific script or flow** → \`get_runnable_details\` -- **Find Windmill Hub scripts** → \`search_hub_scripts\` - -**Testing:** -- **Test entire flow** → \`test_run_flow\` -- **Test single step** → \`test_run_step\` - -**Resources & Schema:** -- **Search resource types** → \`resource_type\` -- **Get database schema** → \`get_db_schema\` - -## Common Mistakes to Avoid - -- **Don't forget \`input_transforms\`** - Rawscript parameters won't receive values without them -- **Don't use spaces in module IDs** - Use underscores (e.g., \`fetch_data\` not \`fetch data\`) -- **Don't reference future steps** - \`results.step_id\` only works for steps that execute before the current one -- **Don't create duplicate IDs** - Each module ID must be unique in the flow - -## Flow Modification with set_flow_json - -Use the \`set_flow_json\` tool to set the entire flow structure at once. Provide the complete modules array and optionally the flow input schema. 
- -**Parameters:** -- \`modules\`: Array of flow modules (required) -- \`schema\`: Flow input schema in JSON Schema format (optional) - -**Example - Simple flow:** -\`\`\`javascript -set_flow_json({ - modules: [ - { - id: "fetch_data", - summary: "Fetch user data from API", - value: { - type: "rawscript", - language: "bun", - content: "export async function main(userId: string) { return { id: userId, name: 'John' }; }", - input_transforms: { - userId: { type: "javascript", expr: "flow_input.user_id" } - } - } - }, - { - id: "process_data", - summary: "Process the fetched data", - value: { - type: "rawscript", - language: "bun", - content: "export async function main(data: any) { return { processed: true, ...data }; }", - input_transforms: { - data: { type: "javascript", expr: "results.fetch_data" } - } - } - } - ], - schema: { - type: "object", - properties: { - user_id: { type: "string", description: "User ID to fetch" } - }, - required: ["user_id"] - } -}) -\`\`\` - -**Example - Flow with for loop:** -\`\`\`javascript -set_flow_json({ - modules: [ - { - id: "get_items", - summary: "Get list of items", - value: { - type: "rawscript", - language: "bun", - content: "export async function main() { return [1, 2, 3]; }", - input_transforms: {} - } - }, - { - id: "loop_items", - summary: "Process each item", - value: { - type: "forloopflow", - iterator: { type: "javascript", expr: "results.get_items" }, - skip_failures: false, - parallel: true, - modules: [ - { - id: "process_item", - summary: "Process single item", - value: { - type: "rawscript", - language: "bun", - content: "export async function main(item: number) { return item * 2; }", - input_transforms: { - item: { type: "javascript", expr: "flow_input.iter.value" } - } - } - } - ] - } - } - ] -}) -\`\`\` - -**Example - Flow with branches (branchone):** -\`\`\`javascript -set_flow_json({ - modules: [ - { - id: "get_value", - summary: "Get a value to branch on", - value: { - type: "rawscript", - language: "bun", - 
content: "export async function main() { return 50; }", - input_transforms: {} - } - }, - { - id: "branch_on_value", - summary: "Branch based on value", - value: { - type: "branchone", - branches: [ - { - summary: "High value", - expr: "results.get_value > 75", - modules: [ - { - id: "high_handler", - value: { - type: "rawscript", - language: "bun", - content: "export async function main() { return 'high'; }", - input_transforms: {} - } - } - ] - }, - { - summary: "Medium value", - expr: "results.get_value > 25", - modules: [ - { - id: "medium_handler", - value: { - type: "rawscript", - language: "bun", - content: "export async function main() { return 'medium'; }", - input_transforms: {} - } - } - ] - } - ], - default: [ - { - id: "low_handler", - value: { - type: "rawscript", - language: "bun", - content: "export async function main() { return 'low'; }", - input_transforms: {} - } - } - ] - } - } - ] -}) -\`\`\` - -Follow the user instructions carefully. -At the end of your changes, explain precisely what you did and what the flow does now. -ALWAYS test your modifications using the \`test_run_flow\` tool. If the user cancels the test run, do not try again and wait for the next user instruction. -When testing steps that are sql scripts, the arguments to be passed are { database: $res: }. - -### Input Transforms for Rawscripts - -Rawscript modules use \`input_transforms\` to map function parameters to values. Each key in \`input_transforms\` corresponds to a parameter name in your script's \`main\` function. 
- -**Transform Types:** -- \`static\`: Fixed value passed directly -- \`javascript\`: Dynamic expression evaluated at runtime - -**Available Variables in JavaScript Expressions:** -- \`flow_input.{property}\` - Access flow input parameters -- \`results.{step_id}\` - Access output from a previous step -- \`flow_input.iter.value\` - Current item when inside a for-loop -- \`flow_input.iter.index\` - Current index when inside a for-loop - -**Example - Rawscript using flow input and previous step result:** -\`\`\`json -{ - "id": "step_b", - "value": { - "type": "rawscript", - "language": "bun", - "content": "export async function main(userId: string, data: any[]) { return 'Hello, world!'; }", - "input_transforms": { - "userId": { "type": "javascript", "expr": "flow_input.user_id" }, - "data": { "type": "javascript", "expr": "results.step_a" } - } - } -} -\`\`\` - -**Important:** The parameter names in \`input_transforms\` must match the function parameter names in your script. - -### Other Key Concepts -- **Resources**: For flow inputs, use type "object" with format "resource-". For step inputs, use "$res:path/to/resource" -- **Module IDs**: Must be unique and valid identifiers. Used to reference results via \`results.step_id\` -- **Module types**: Use 'bun' as default language for rawscript if unspecified - -### Writing Code for Modules - -**IMPORTANT: Before writing any code for a rawscript module, you MUST call the \`get_instructions_for_code_generation\` tool with the target language.** This tool provides essential language-specific instructions. - -Example: Before writing TypeScript/Bun code, call \`get_instructions_for_code_generation({ language: "bun" })\` - -### Creating Flows - -1. 
**Search for existing scripts first** (unless user explicitly asks to write from scratch): - - First: \`search_workspace\` to find workspace scripts and flows - - Use \`get_runnable_details\` to inspect a specific script or flow (inputs, description, code) - - Then: \`search_hub_scripts\` (only consider highly relevant results) - - Only create raw scripts if no suitable script is found - -2. **Build the complete flow using \`set_flow_json\`:** - - If using existing script: use \`type: "script"\` with \`path\` - - If creating rawscript: use \`type: "rawscript"\` with \`language\` and \`content\` - - **First call \`get_instructions_for_code_generation\` to get the correct code format** - - Always define \`input_transforms\` to connect parameters to flow inputs or previous step results - -### AI Agent Modules - -AI agents can use tools to accomplish tasks. When creating an AI agent module: - -\`\`\`javascript -{ - id: "support_agent", - summary: "AI agent for customer support", - value: { - type: "aiagent", - input_transforms: { - provider: { type: "static", value: "$res:f/ai_providers/openai" }, - output_type: { type: "static", value: "text" }, - user_message: { type: "javascript", expr: "flow_input.query" }, - system_prompt: { type: "static", value: "You are a helpful assistant." } - }, - tools: [ - { - id: "search_docs", - summary: "Search_documentation", - value: { - tool_type: "flowmodule", - type: "rawscript", - language: "bun", - content: "export async function main(query: string) { return ['doc1', 'doc2']; }", - input_transforms: { query: { type: "static", value: "" } } - } - } - ] - } -} -\`\`\` - -- **Tool IDs**: Cannot contain spaces - use underscores -- **Tool summaries**: Cannot contain spaces - use underscores -- **Tool types**: \`flowmodule\` for scripts/flows, \`mcp\` for MCP server tools - -## Resource Types -On Windmill, credentials and configuration are stored in resources. Resource types define the format of the resource. 
-- Use the \`resource_type\` tool to search for available resource types (e.g. stripe, google, postgresql, etc.) -- If the user needs a resource as flow input, set the property type in the schema to "object" and add a key called "format" set to "resource-nameofresourcetype" (e.g. "resource-stripe") -- If the user wants a specific resource as step input, set the step value to a static string in the format: "$res:path/to/resource" - -### OpenFlow Schema Reference -Below is the complete OpenAPI schema for OpenFlow. All field descriptions and behaviors are defined here. Refer to this as the authoritative reference when generating flow JSON: - -\`\`\`json -${JSON.stringify(openFlowSchema, null, 2)} -\`\`\` - -The schema includes detailed descriptions for: -- **FlowModuleValue types**: rawscript, script, flow, forloopflow, whileloopflow, branchone, branchall, identity, aiagent -- **Module configuration**: stop_after_if, skip_if, suspend, sleep, cache_ttl, retry, mock, timeout -- **InputTransform**: static vs javascript, available variables (results, flow_input, flow_input.iter) -- **Special modules**: preprocessor_module, failure_module -- **Loop options**: iterator, parallel, parallelism, skip_failures -- **Branch types**: BranchOne (first match), BranchAll (all execute) -` - -/** - * Minimal single-tool variant. - * Replaces granular flow editing tools (add_module, remove_module, modify_module, etc.) - * with a single set_flow_json tool, while keeping all other utility tools. - * Uses the default system prompt. 
- */ -export const MINIMAL_SINGLE_TOOL_VARIANT: VariantConfig = { - name: 'minimal-single-tool', - description: - 'Default prompt with set_flow_json instead of granular flow editing tools, keeps all utility tools', - systemPrompt: { - type: 'custom', - content: MINIMAL_SINGLE_TOOL_SYSTEM_PROMPT - }, - tools: { - type: 'custom', - tools: buildMinimalSingleToolTools() - } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseLLMEvaluator.ts b/frontend/src/lib/components/copilot/chat/__tests__/shared/baseLLMEvaluator.ts deleted file mode 100644 index bd7bd06d44..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseLLMEvaluator.ts +++ /dev/null @@ -1,135 +0,0 @@ -import Anthropic from '@anthropic-ai/sdk' -import type { EvaluationResult } from './types' - -/** - * Parameters for LLM-based evaluation. - */ -export interface EvaluateParams { - /** The user's original request/prompt */ - userPrompt: string - /** The generated output to evaluate */ - generatedOutput: unknown - /** The expected/reference output */ - expectedOutput: unknown - /** Domain-specific system prompt for the evaluator */ - evaluatorSystemPrompt: string - /** Anthropic API key for evaluation */ - apiKey?: string - /** Model to use for evaluation (default: 'claude-sonnet-4-5-20250514') */ - model?: string -} - -/** - * Base evaluator system prompt template. - * Domain-specific evaluators should build on this structure. 
- */ -export const BASE_EVALUATOR_RESPONSE_FORMAT = ` -## Response Format -You MUST respond with valid JSON only, no additional text: -{ - "resemblanceScore": <0-100 integer>, - "statement": "", - "missingRequirements": [""] -} - -Score guidelines: -- 90-100: Fully addresses user request, functionally equivalent to expected output -- 70-89: Addresses most user requirements, same overall structure with minor differences -- 50-69: Partially addresses user request, achieves similar goal but different approach -- 30-49: Missing significant requirements from user request -- 0-29: Does not address user request or significantly incorrect` - -/** - * Evaluates how well a generated output matches an expected output using an LLM. - * Uses Anthropic API directly instead of OpenRouter. - */ -export async function evaluateWithLLM(params: EvaluateParams): Promise { - const { - userPrompt, - generatedOutput, - expectedOutput, - evaluatorSystemPrompt, - apiKey, - model = 'claude-sonnet-4-5-20250514' - } = params - - // @ts-ignore - process.env - const anthropicKey = apiKey ?? process.env.ANTHROPIC_API_KEY - if (!anthropicKey) { - return { - success: false, - resemblanceScore: 0, - statement: 'No API key available for evaluation', - error: 'ANTHROPIC_API_KEY not set and no apiKey provided' - } - } - - const client = new Anthropic({ apiKey: anthropicKey }) - - const userMessage = `## User's Original Request -${userPrompt} - -## Expected Reference Output -\`\`\`json -${JSON.stringify(expectedOutput, null, 2)} -\`\`\` - -## Generated Output -\`\`\`json -${JSON.stringify(generatedOutput, null, 2)} -\`\`\` - -Please evaluate how well the generated output: -1. Fulfills ALL requirements from the user's original request -2. 
Matches the structure and logic of the expected reference output` - - try { - const response = await client.messages.create({ - model, - max_tokens: 2048, - system: evaluatorSystemPrompt, - messages: [ - { role: 'user', content: userMessage } - ], - temperature: 0 - }) - - const textBlock = response.content.find((block) => block.type === 'text') - const content = textBlock?.text - if (!content) { - return { - success: false, - resemblanceScore: 0, - statement: 'No response from evaluator', - error: 'Empty response from LLM' - } - } - - // Parse JSON response - handle potential markdown code blocks - let jsonContent = content.trim() - if (jsonContent.startsWith('```')) { - jsonContent = jsonContent.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '') - } - - const parsed = JSON.parse(jsonContent) as { - resemblanceScore: number - statement: string - missingRequirements?: string[] - } - - return { - success: true, - resemblanceScore: Math.max(0, Math.min(100, Math.round(parsed.resemblanceScore))), - statement: parsed.statement, - missingRequirements: parsed.missingRequirements ?? [] - } - } catch (err) { - const errorMessage = err instanceof Error ? err.message : String(err) - return { - success: false, - resemblanceScore: 0, - statement: 'Evaluation failed', - error: errorMessage - } - } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseResultsWriter.ts b/frontend/src/lib/components/copilot/chat/__tests__/shared/baseResultsWriter.ts deleted file mode 100644 index 0ecce615ba..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseResultsWriter.ts +++ /dev/null @@ -1,169 +0,0 @@ -// @ts-ignore -import { writeFile, mkdir } from 'fs/promises' -// @ts-ignore -import { join, dirname } from 'path' -// @ts-ignore -import { fileURLToPath } from 'url' -import type { BaseEvalResult } from './types' - -/** - * Generates a timestamp string suitable for filenames. 
- * Format: 2024-01-15T10-30-45-123Z (ISO but with dashes instead of colons) - */ -export function generateTimestamp(): string { - return new Date().toISOString().replace(/:/g, '-') -} - -/** - * Parameters for writing comparison results. - */ -export interface WriteResultsParams { - /** User prompt that was tested */ - userPrompt: string - /** Results from all variants */ - results: BaseEvalResult[] - /** Directory to write results to */ - outputDir: string - /** Function to format domain-specific output for JSON files */ - formatOutput: (output: TOutput) => unknown - /** Label for the output type (e.g., 'flow', 'app') */ - outputLabel?: string -} - -/** - * Writes comparison results to files in the results folder. - * Creates: - * - summary.md - Summary with prompt and results table - * - {variant_name}.json - Full result with metadata for each variant - * - {variant_name}_{outputLabel}.json - Clean output for each variant - */ -export async function writeComparisonResults( - params: WriteResultsParams -): Promise<{ summaryPath: string; outputPaths: string[] }> { - const { userPrompt, results, outputDir, formatOutput, outputLabel = 'output' } = params - const timestamp = generateTimestamp() - - // Ensure results directory exists - await mkdir(outputDir, { recursive: true }) - const resultFolder = join(outputDir, timestamp) - await mkdir(resultFolder, { recursive: true }) - - // Check if any results have evaluation data - const hasEvaluation = results.some((r) => r.evaluationResult) - - // Build summary markdown - const summaryLines: string[] = [ - `# Eval Results - ${timestamp}`, - '', - '## User Prompt', - '```', - userPrompt.trim(), - '```', - '', - '## Results', - '' - ] - - // Add results table header based on whether evaluation data exists - if (hasEvaluation) { - summaryLines.push( - '| Variant | Success | Total Tokens | Tool Calls | Iterations | Resemblance Score |' - ) - summaryLines.push( - 
'|---------|---------|--------------|------------|------------|-------------------|' - ) - } else { - summaryLines.push('| Variant | Success | Total Tokens | Tool Calls | Iterations |') - summaryLines.push('|---------|---------|--------------|------------|------------|') - } - - for (const result of results) { - const baseRow = `| ${result.variantName} | ${result.success} | ${result.tokenUsage.total} | ${result.toolsCalled.length} | ${result.iterations}` - if (hasEvaluation) { - const score = result.evaluationResult?.resemblanceScore ?? 'N/A' - summaryLines.push(`${baseRow} | ${score} |`) - } else { - summaryLines.push(`${baseRow} |`) - } - } - - // Add evaluation details section if available - if (hasEvaluation) { - summaryLines.push('') - summaryLines.push('## Evaluation Details') - summaryLines.push('') - for (const result of results) { - if (result.evaluationResult) { - summaryLines.push(`### ${result.variantName}`) - summaryLines.push('') - summaryLines.push(`**Score:** ${result.evaluationResult.resemblanceScore}/100`) - summaryLines.push('') - summaryLines.push(`**Statement:** ${result.evaluationResult.statement}`) - summaryLines.push('') - if ( - result.evaluationResult.missingRequirements && - result.evaluationResult.missingRequirements.length > 0 - ) { - summaryLines.push('**Missing Requirements:**') - for (const req of result.evaluationResult.missingRequirements) { - summaryLines.push(`- ${req}`) - } - summaryLines.push('') - } - if (result.evaluationResult.error) { - summaryLines.push(`**Error:** ${result.evaluationResult.error}`) - summaryLines.push('') - } - } - } - } - - // Add errors section for failed variants - const failedResults = results.filter((r) => !r.success && r.error) - if (failedResults.length > 0) { - summaryLines.push('') - summaryLines.push('## Errors') - summaryLines.push('') - for (const result of failedResults) { - summaryLines.push(`### ${result.variantName}`) - summaryLines.push('') - summaryLines.push('```') - 
summaryLines.push(result.error!) - summaryLines.push('```') - summaryLines.push('') - } - } - - const outputPaths: string[] = [] - - for (const result of results) { - const resultFilename = `${result.variantName}.json` - const resultPath = join(resultFolder, resultFilename) - outputPaths.push(resultPath) - - const outputFilename = `${result.variantName}_${outputLabel}.json` - const outputPath = join(resultFolder, outputFilename) - - // Write result JSON file (with metadata) - const resultData = { - variantName: result.variantName, - success: result.success, - error: result.error, - evaluationResult: result.evaluationResult, - toolsCalled: result.toolsCalled, - toolCallDetails: result.toolCallDetails, - messages: result.messages - } - await writeFile(resultPath, JSON.stringify(resultData, null, 2)) - - // Write clean output JSON file (domain-specific format) - const outputData = formatOutput(result.output) - await writeFile(outputPath, JSON.stringify(outputData, null, 2)) - } - - // Write summary markdown file - const summaryPath = join(resultFolder, `summary.md`) - await writeFile(summaryPath, summaryLines.join('\n')) - - return { summaryPath, outputPaths } -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseVariants.ts b/frontend/src/lib/components/copilot/chat/__tests__/shared/baseVariants.ts deleted file mode 100644 index 26d9bf57cc..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseVariants.ts +++ /dev/null @@ -1,108 +0,0 @@ -import type { - ChatCompletionFunctionTool, - ChatCompletionSystemMessageParam -} from 'openai/resources/chat/completions.mjs' -import type { ChatCompletionTool } from 'openai/resources/chat/completions.mjs' -import type { VariantConfig } from './types' - -/** - * Generic tool interface that matches the structure used across chat modules. 
- */ -export interface Tool { - def: ChatCompletionFunctionTool - fn: (params: { - args: Record - workspace: string - helpers: THelpers - toolCallbacks: { - setToolStatus: (...args: unknown[]) => void - removeToolStatus: (...args: unknown[]) => void - } - toolId: string - }) => Promise -} - -/** - * Domain-specific defaults for variant resolution. - */ -export interface VariantDefaults { - /** Function to prepare system message, optionally with custom prompt */ - prepareSystemMessage: (customPrompt?: string) => ChatCompletionSystemMessageParam - /** Available tools for the domain */ - tools: Tool[] -} - -/** - * Resolves system prompt from variant config. - * Returns the appropriate ChatCompletionSystemMessageParam based on config. - */ -export function resolveSystemPrompt( - variant: VariantConfig | undefined, - defaults: VariantDefaults, - fallbackCustomPrompt?: string -): ChatCompletionSystemMessageParam { - if (!variant?.systemPrompt || variant.systemPrompt.type === 'default') { - return defaults.prepareSystemMessage(fallbackCustomPrompt) - } - - if (variant.systemPrompt.type === 'default-with-custom') { - return defaults.prepareSystemMessage(variant.systemPrompt.custom) - } - - // type === 'custom' - return { - role: 'system', - content: variant.systemPrompt.content - } -} - -/** - * Resolves tools from variant config. - * Returns both the tool definitions (for API) and full tools (for execution). 
- */ -export function resolveTools( - variant: VariantConfig | undefined, - defaults: VariantDefaults -): { - toolDefs: ChatCompletionTool[] - tools: Tool[] -} { - if (!variant?.tools || variant.tools.type === 'default') { - return { - toolDefs: defaults.tools.map((t) => t.def), - tools: defaults.tools - } - } - - if (variant.tools.type === 'subset') { - const includeList = (variant.tools as { type: 'subset'; include: string[] }).include - const subset = defaults.tools.filter((t) => includeList.includes(t.def.function.name)) - return { - toolDefs: subset.map((t) => t.def), - tools: subset - } - } - - if (variant.tools.type === 'custom') { - // Custom tools are typed as unknown[] in base VariantConfig but domain-specific - // code should ensure they are the correct Tool type - const customTools = variant.tools.tools as Tool[] - return { - toolDefs: customTools.map((t) => t.def), - tools: customTools - } - } - - // Default fallback - return { - toolDefs: defaults.tools.map((t) => t.def), - tools: defaults.tools - } -} - -/** - * Resolves model from variant config with fallback. - */ -export function resolveModel(variant?: VariantConfig, fallback?: string): string { - return variant?.model ?? fallback ?? 
'gpt-4o' -} diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/index.ts b/frontend/src/lib/components/copilot/chat/__tests__/shared/index.ts deleted file mode 100644 index 0c1b3bc8cb..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/shared/index.ts +++ /dev/null @@ -1,28 +0,0 @@ -// Types -export type { - TokenUsage, - ToolCallDetail, - EvaluationResult, - BaseEvalResult, - VariantConfig, - EvalRunnerOptions, - ToolCallbacks -} from './types' - -export { createNoOpToolCallbacks } from './types' - -// Variant resolution -export type { Tool, VariantDefaults } from './baseVariants' -export { resolveSystemPrompt, resolveTools, resolveModel } from './baseVariants' - -// Eval runner -export type { RawEvalResult, RunEvalParams } from './baseEvalRunner' -export { runEval } from './baseEvalRunner' - -// LLM evaluator -export type { EvaluateParams } from './baseLLMEvaluator' -export { evaluateWithLLM, BASE_EVALUATOR_RESPONSE_FORMAT } from './baseLLMEvaluator' - -// Results writer -export type { WriteResultsParams } from './baseResultsWriter' -export { writeComparisonResults, generateTimestamp } from './baseResultsWriter' diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/types.ts b/frontend/src/lib/components/copilot/chat/__tests__/shared/types.ts deleted file mode 100644 index 61f7f1fd1f..0000000000 --- a/frontend/src/lib/components/copilot/chat/__tests__/shared/types.ts +++ /dev/null @@ -1,107 +0,0 @@ -import type { ChatCompletionMessageParam } from 'openai/resources/chat/completions.mjs' -import type { AIProvider } from '$lib/gen/types.gen' - -/** - * Token usage tracking for LLM calls. - */ -export interface TokenUsage { - prompt: number - completion: number - total: number -} - -/** - * Details of a single tool call made during evaluation. - */ -export interface ToolCallDetail { - name: string - arguments: Record -} - -/** - * Result of LLM-based comparison/evaluation. 
- */ -export interface EvaluationResult { - success: boolean - resemblanceScore: number - statement: string - missingRequirements?: string[] - error?: string -} - -/** - * Base evaluation result that can be extended for domain-specific outputs. - * @template TOutput The domain-specific output type (e.g., flow definition, app files) - */ -export interface BaseEvalResult { - success: boolean - output: TOutput - error?: string - tokenUsage: TokenUsage - toolCallsCount: number - toolsCalled: string[] - toolCallDetails: ToolCallDetail[] - iterations: number - variantName: string - evaluationResult?: EvaluationResult - messages: ChatCompletionMessageParam[] -} - -/** - * Base configuration for a variant in eval testing. - * Allows customizing system prompt, tools, and model for comparison. - * - * Note: Domain-specific variants may extend this with custom tool configurations. - * See flow/flowEvalVariants.ts for an example with custom tools. - */ -export interface VariantConfig { - name: string - description?: string - - /** System prompt configuration */ - systemPrompt?: - | { type: 'default' } - | { type: 'default-with-custom'; custom: string } - | { type: 'custom'; content: string } - - /** Tools configuration - basic types supported by shared code */ - tools?: - | { type: 'default' } - | { type: 'subset'; include: string[] } - | { type: 'custom'; tools: unknown[] } - - /** Model to use (default: 'gpt-4o') */ - model?: string -} - -/** - * Options for running an evaluation. - */ -export interface EvalRunnerOptions { - /** Maximum iterations for tool call loop (default: 20) */ - maxIterations?: number - /** Model to use for LLM calls */ - model?: string - /** Workspace ID for tool calls */ - workspace?: string - /** AI provider (inferred from model name if omitted) */ - provider?: AIProvider -} - -/** - * No-op tool callbacks for eval testing. 
- */ -export interface ToolCallbacks { - setToolStatus: (id: string, status: { content?: string; result?: string; error?: string }) => void - removeToolStatus: (id: string) => void -} - -/** - * Creates no-op tool callbacks for eval testing. - */ -export function createNoOpToolCallbacks(): ToolCallbacks { - return { - setToolStatus: () => {}, - removeToolStatus: () => {} - } -} diff --git a/frontend/src/lib/components/copilot/chat/anthropic.ts b/frontend/src/lib/components/copilot/chat/anthropic.ts index ac45c175a6..4bc581db88 100644 --- a/frontend/src/lib/components/copilot/chat/anthropic.ts +++ b/frontend/src/lib/components/copilot/chat/anthropic.ts @@ -17,6 +17,12 @@ import type { MessageStream } from '@anthropic-ai/sdk/lib/MessageStream' import type { AIProviderModel } from '$lib/gen' import { getProviderAndCompletionConfig, workspaceAIClients } from '../lib' import { processToolCall, type Tool, type ToolCallbacks } from './shared' +import { anthropicUsageToChatTokenUsage, type ChatTokenUsage } from './tokenUsage' + +interface ParsedCompletionResult { + shouldContinue: boolean + tokenUsage: ChatTokenUsage +} export async function getAnthropicCompletion( messages: ChatCompletionMessageParam[], @@ -70,7 +76,7 @@ export async function parseAnthropicCompletion( helpers: any, abortController?: AbortController, options?: { workspace?: string } -): Promise { +): Promise { let toolCallsToProcess: ChatCompletionMessageFunctionToolCall[] = [] let error = null @@ -205,6 +211,9 @@ export async function parseAnthropicCompletion( throw error } + const finalMessage = await completion.finalMessage() + const tokenUsage = anthropicUsageToChatTokenUsage(finalMessage.usage) + // Process tool calls if any if (toolCallsToProcess.length > 0) { const assistantWithTools = { @@ -226,10 +235,10 @@ export async function parseAnthropicCompletion( messages.push(messageToAdd) addedMessages.push(messageToAdd) } - return true // Continue the conversation loop + return { shouldContinue: true, 
tokenUsage } } - return false // End the conversation + return { shouldContinue: false, tokenUsage } } export function convertOpenAIToAnthropicMessages(messages: ChatCompletionMessageParam[]): { diff --git a/frontend/src/lib/components/copilot/chat/chatLoop.ts b/frontend/src/lib/components/copilot/chat/chatLoop.ts index 4b239e4a05..3c27317be3 100644 --- a/frontend/src/lib/components/copilot/chat/chatLoop.ts +++ b/frontend/src/lib/components/copilot/chat/chatLoop.ts @@ -13,6 +13,11 @@ import { parseOpenAIResponsesCompletion } from './openai-responses' import type { Tool, ToolCallbacks } from './shared' +import { + addChatTokenUsage, + emptyChatTokenUsage, + type ChatTokenUsage +} from './tokenUsage' export interface ChatClients { openai: OpenAI @@ -49,6 +54,7 @@ export interface ChatLoopConfig { export interface ChatLoopResult { addedMessages: ChatCompletionMessageParam[] + tokenUsage: ChatTokenUsage } export async function runChatLoop(config: ChatLoopConfig): Promise { @@ -66,6 +72,7 @@ export async function runChatLoop(config: ChatLoopConfig): Promise('FlowCopilotContext') ?? {} + const inlineScriptSession = createInlineScriptSession() // Get diffManager from the graph const diffManager = $derived(flowModuleSchemaMap?.getDiffManager()) @@ -62,6 +63,7 @@ } return flowStore.val.value.modules }, + inlineScriptSession, setSnapshot: (snapshot: ExtendedOpenFlow) => { diffManager?.setBeforeFlow(snapshot) }, @@ -103,6 +105,7 @@ // 2. 
Apply the code change module.value.content = code + inlineScriptSession.set(id, code) const { input_transforms, schema } = await loadSchemaFromModule(module) module.value.input_transforms = input_transforms refreshStateStore(flowStore) @@ -216,7 +219,13 @@ if (modules) { // Restore inline script references back to full content - const restoredModules = restoreInlineScriptReferences(modules) + const restoredModules = inlineScriptSession.restoreInlineScriptReferences(modules) + const unresolvedRefs = inlineScriptSession.findUnresolvedInlineScriptRefs(restoredModules) + if (unresolvedRefs.length > 0) { + throw new Error( + `Unresolved inline script references: ${unresolvedRefs.join(', ')}` + ) + } // Directly modify flowStore (immediate effect) flowStore.val.value.modules = restoredModules } diff --git a/frontend/src/lib/components/copilot/chat/flow/core.ts b/frontend/src/lib/components/copilot/chat/flow/core.ts index f695da0aa1..e8fd54020a 100644 --- a/frontend/src/lib/components/copilot/chat/flow/core.ts +++ b/frontend/src/lib/components/copilot/chat/flow/core.ts @@ -34,7 +34,7 @@ import { } from '../shared' import type { ContextElement } from '../context' import type { ExtendedOpenFlow } from '$lib/components/flows/types' -import { inlineScriptStore, extractAndReplaceInlineScripts } from './inlineScriptsUtils' +import type { InlineScriptSession } from './inlineScriptsUtils' import { flowModulesSchema } from './openFlowZod' import { collectAllModuleIdsFromArray } from './utils' import { getFlowPrompt } from '$system_prompts' @@ -247,6 +247,7 @@ export interface FlowAIChatHelpers { // flow context getFlowAndSelectedId: () => { flow: ExtendedOpenFlow; selectedId: string } getModules: (id?: string) => FlowModule[] + inlineScriptSession: InlineScriptSession // snapshot management (AI sets this when making changes) /** Set the before flow snapshot */ @@ -581,7 +582,7 @@ export const flowTools: Tool[] = [ }, { def: inspectInlineScriptToolDef, - fn: async ({ args, 
toolCallbacks, toolId }) => { + fn: async ({ args, helpers, toolCallbacks, toolId }) => { const parsedArgs = inspectInlineScriptSchema.parse(args) const moduleId = parsedArgs.moduleId @@ -589,7 +590,7 @@ export const flowTools: Tool[] = [ content: `Retrieving inline script content for module '${moduleId}'...` }) - const content = inlineScriptStore.get(moduleId) + const content = helpers.inlineScriptSession.get(moduleId) if (content === undefined) { toolCallbacks.setToolStatus(toolId, { @@ -623,9 +624,6 @@ export const flowTools: Tool[] = [ toolCallbacks.setToolStatus(toolId, { content: `Setting code for module '${moduleId}'...` }) - // Update store to keep it coherent (for subsequent set_flow_json calls with references) - inlineScriptStore.set(moduleId, code) - // Update the flow directly via helper await helpers.setCode(moduleId, code) @@ -1057,7 +1055,8 @@ You have access to the following contexts: export function prepareFlowUserMessage( instructions: string, flowAndSelectedId?: { flow: ExtendedOpenFlow; selectedId: string }, - selectedContext: ContextElement[] = [] + selectedContext: ContextElement[] = [], + inlineScriptSession?: InlineScriptSession ): ChatCompletionUserMessageParam { const flow = flowAndSelectedId?.flow const selectedId = flowAndSelectedId?.selectedId @@ -1075,10 +1074,13 @@ ${instructions}` } const codePieces = selectedContext.filter((c) => c.type === 'flow_module_code_piece') + const scriptSession = inlineScriptSession // Clear the inline script store and extract inline scripts for token optimization - inlineScriptStore.clear() - const optimizedModules = extractAndReplaceInlineScripts(flow.value.modules) + scriptSession?.clear() + const optimizedModules = scriptSession + ? 
scriptSession.extractAndReplaceInlineScripts(flow.value.modules) + : flow.value.modules // Apply code pieces to the optimized modules (returns YAML string) const flowModulesYaml = applyCodePiecesToFlowModules(codePieces, optimizedModules) @@ -1086,7 +1088,7 @@ ${instructions}` // Handle preprocessor and failure modules let optimizedPreprocessor = flow.value.preprocessor_module if (optimizedPreprocessor?.value?.type === 'rawscript' && optimizedPreprocessor.value.content) { - inlineScriptStore.set(optimizedPreprocessor.id, optimizedPreprocessor.value.content) + scriptSession?.set(optimizedPreprocessor.id, optimizedPreprocessor.value.content) optimizedPreprocessor = { ...optimizedPreprocessor, value: { @@ -1098,7 +1100,7 @@ ${instructions}` let optimizedFailure = flow.value.failure_module if (optimizedFailure?.value?.type === 'rawscript' && optimizedFailure.value.content) { - inlineScriptStore.set(optimizedFailure.id, optimizedFailure.value.content) + scriptSession?.set(optimizedFailure.id, optimizedFailure.value.content) optimizedFailure = { ...optimizedFailure, value: { diff --git a/frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils.ts b/frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils.ts index 9a124c44d7..4180b7e36b 100644 --- a/frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils.ts +++ b/frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils.ts @@ -1,10 +1,17 @@ import type { FlowModule } from '$lib/gen' -/** - * Storage for inline scripts extracted from flow modules. - * Maps module IDs to their rawscript content for token-efficient transmission to AI. 
- */ -class InlineScriptStore { +export interface InlineScriptSession { + clear(): void + set(moduleId: string, content: string): void + get(moduleId: string): string | undefined + has(moduleId: string): boolean + getAll(): Record + extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModule[] + restoreInlineScriptReferences(modules: FlowModule[]): FlowModule[] + findUnresolvedInlineScriptRefs(modules: FlowModule[]): string[] +} + +class DefaultInlineScriptSession implements InlineScriptSession { private scripts: Map = new Map() clear() { @@ -26,15 +33,28 @@ class InlineScriptStore { getAll(): Record { return Object.fromEntries(this.scripts.entries()) } + + extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModule[] { + return extractAndReplaceInlineScripts(modules, this) + } + + restoreInlineScriptReferences(modules: FlowModule[]): FlowModule[] { + return restoreInlineScriptReferences(modules, this) + } + + findUnresolvedInlineScriptRefs(modules: FlowModule[]): string[] { + return findUnresolvedInlineScriptRefs(modules) + } } -export const inlineScriptStore = new InlineScriptStore() +export function createInlineScriptSession(): InlineScriptSession { + return new DefaultInlineScriptSession() +} -/** - * Recursively extracts all rawscript content from flow modules and stores them. - * Replaces the content with references like "inline_script.{module_id}". 
- */ -export function extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModule[] { +function extractAndReplaceInlineScripts( + modules: FlowModule[], + session: Pick +): FlowModule[] { if (!modules || !Array.isArray(modules)) { return [] } @@ -43,52 +63,45 @@ export function extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModul const newModule = { ...module } if (newModule.value.type === 'rawscript' && newModule.value.content) { - // Store the original content - inlineScriptStore.set(module.id, newModule.value.content) - - // Replace with reference + session.set(module.id, newModule.value.content) newModule.value = { ...newModule.value, content: `inline_script.${module.id}` } } else if (newModule.value.type === 'forloopflow' || newModule.value.type === 'whileloopflow') { - // Recursively process nested modules in loops if (newModule.value.modules) { newModule.value = { ...newModule.value, - modules: extractAndReplaceInlineScripts(newModule.value.modules) + modules: extractAndReplaceInlineScripts(newModule.value.modules, session) } } } else if (newModule.value.type === 'branchone') { - // Process branches and default modules if (newModule.value.branches) { newModule.value = { ...newModule.value, branches: newModule.value.branches.map((branch) => ({ ...branch, - modules: branch.modules ? extractAndReplaceInlineScripts(branch.modules) : [] + modules: branch.modules ? extractAndReplaceInlineScripts(branch.modules, session) : [] })) } } if (newModule.value.default) { newModule.value = { ...newModule.value, - default: extractAndReplaceInlineScripts(newModule.value.default) + default: extractAndReplaceInlineScripts(newModule.value.default, session) } } } else if (newModule.value.type === 'branchall') { - // Process all branches if (newModule.value.branches) { newModule.value = { ...newModule.value, branches: newModule.value.branches.map((branch) => ({ ...branch, - modules: branch.modules ? 
extractAndReplaceInlineScripts(branch.modules) : [] + modules: branch.modules ? extractAndReplaceInlineScripts(branch.modules, session) : [] })) } } } else if (newModule.value.type === 'aiagent') { - // Process AI agent tools if (newModule.value.tools) { newModule.value = { ...newModule.value, @@ -102,7 +115,7 @@ export function extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModul 'content' in tool.value && tool.value.content ) { - inlineScriptStore.set(tool.id, tool.value.content as string) + session.set(tool.id, tool.value.content as string) return { ...tool, value: { @@ -121,70 +134,58 @@ export function extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModul }) } -/** - * Recursively restores inline script references back to their full content. - * If content matches pattern "inline_script.{id}", looks up and restores the original. - * If content doesn't match (new/modified script), keeps it as-is. - */ -export function restoreInlineScriptReferences(modules: FlowModule[]): FlowModule[] { +function restoreInlineScriptReferences( + modules: FlowModule[], + session: Pick +): FlowModule[] { return modules.map((module) => { const newModule = { ...module } if (newModule.value.type === 'rawscript' && newModule.value.content) { - const content = newModule.value.content - // Check if it's a reference - const match = content.match(/^inline_script\.(.+)$/) + const match = newModule.value.content.match(/^inline_script\.(.+)$/) if (match) { - const moduleId = match[1] - const storedContent = inlineScriptStore.get(moduleId) + const storedContent = session.get(match[1]) if (storedContent !== undefined) { - // Restore original content newModule.value = { ...newModule.value, content: storedContent } } - // If not found in store, keep the reference as-is (shouldn't happen normally) } - // If not a reference, it's new/modified content - keep as-is } else if (newModule.value.type === 'forloopflow' || newModule.value.type === 'whileloopflow') { - // Recursively 
process nested modules in loops if (newModule.value.modules) { newModule.value = { ...newModule.value, - modules: restoreInlineScriptReferences(newModule.value.modules) + modules: restoreInlineScriptReferences(newModule.value.modules, session) } } } else if (newModule.value.type === 'branchone') { - // Process branches and default modules if (newModule.value.branches) { newModule.value = { ...newModule.value, branches: newModule.value.branches.map((branch) => ({ ...branch, - modules: branch.modules ? restoreInlineScriptReferences(branch.modules) : [] + modules: branch.modules ? restoreInlineScriptReferences(branch.modules, session) : [] })) } } if (newModule.value.default) { newModule.value = { ...newModule.value, - default: restoreInlineScriptReferences(newModule.value.default) + default: restoreInlineScriptReferences(newModule.value.default, session) } } } else if (newModule.value.type === 'branchall') { - // Process all branches if (newModule.value.branches) { newModule.value = { ...newModule.value, branches: newModule.value.branches.map((branch) => ({ ...branch, - modules: branch.modules ? restoreInlineScriptReferences(branch.modules) : [] + modules: branch.modules ? 
restoreInlineScriptReferences(branch.modules, session) : [] })) } } } else if (newModule.value.type === 'aiagent') { - // Process AI agent tools if (newModule.value.tools) { newModule.value = { ...newModule.value, @@ -198,11 +199,9 @@ export function restoreInlineScriptReferences(modules: FlowModule[]): FlowModule 'content' in tool.value && tool.value.content ) { - const content = tool.value.content as string - const match = content.match(/^inline_script\.(.+)$/) + const match = (tool.value.content as string).match(/^inline_script\.(.+)$/) if (match) { - const toolId = match[1] - const storedContent = inlineScriptStore.get(toolId) + const storedContent = session.get(match[1]) if (storedContent !== undefined) { return { ...tool, @@ -224,11 +223,7 @@ export function restoreInlineScriptReferences(modules: FlowModule[]): FlowModule }) } -/** - * Recursively finds any unresolved inline script references in flow modules. - * Returns array of module IDs that still have `inline_script.{id}` patterns. 
- */ -export function findUnresolvedInlineScriptRefs(modules: FlowModule[]): string[] { +function findUnresolvedInlineScriptRefs(modules: FlowModule[]): string[] { const unresolvedRefs: string[] = [] function checkModule(module: FlowModule) { @@ -257,7 +252,6 @@ export function findUnresolvedInlineScriptRefs(modules: FlowModule[]): string[] }) } } else if (module.value.type === 'aiagent') { - // Check AI agent tools if (module.value.tools) { for (const tool of module.value.tools) { if ( diff --git a/frontend/src/lib/components/copilot/chat/openai-responses.ts b/frontend/src/lib/components/copilot/chat/openai-responses.ts index 56364e1401..c557132417 100644 --- a/frontend/src/lib/components/copilot/chat/openai-responses.ts +++ b/frontend/src/lib/components/copilot/chat/openai-responses.ts @@ -14,6 +14,15 @@ import { import { processToolCall, type Tool, type ToolCallbacks } from './shared' import type { ResponseStream } from 'openai/lib/responses/ResponseStream.mjs' import type { AIProviderModel } from '$lib/gen' +import { + openAIResponsesUsageToChatTokenUsage, + type ChatTokenUsage +} from './tokenUsage' + +interface ParsedCompletionResult { + shouldContinue: boolean + tokenUsage: ChatTokenUsage +} // Conversion utilities for Responses API function convertMessagesToResponsesInput(messages: ChatCompletionMessageParam[]): { @@ -219,7 +228,7 @@ export async function parseOpenAIResponsesCompletion( tools: Tool[], helpers: any, options?: { workspace?: string } -): Promise { +): Promise { let toolCallsToProcess: ChatCompletionMessageFunctionToolCall[] = [] let error: OpenAIError | ResponseErrorEvent | null = null let textContent = '' @@ -337,6 +346,9 @@ export async function parseOpenAIResponsesCompletion( throw error } + const finalResponse = await runner.finalResponse() + const tokenUsage = openAIResponsesUsageToChatTokenUsage(finalResponse.usage) + // Process tool calls if any if (toolCallsToProcess.length > 0) { const assistantWithTools = { @@ -358,10 +370,10 @@ 
export async function parseOpenAIResponsesCompletion( messages.push(messageToAdd) addedMessages.push(messageToAdd) } - return true // Continue the conversation loop + return { shouldContinue: true, tokenUsage } } - return false // End the conversation + return { shouldContinue: false, tokenUsage } } export async function getNonStreamingOpenAIResponsesCompletion( diff --git a/frontend/src/lib/components/copilot/chat/shared.ts b/frontend/src/lib/components/copilot/chat/shared.ts index 20e488d923..a1a8e3497f 100644 --- a/frontend/src/lib/components/copilot/chat/shared.ts +++ b/frontend/src/lib/components/copilot/chat/shared.ts @@ -920,8 +920,6 @@ export function formatScriptLintResult(lintResult: ScriptLintResult): string { return response } -// ============= Workspace Runnables Search ============= - export class WorkspaceRunnablesSearch { private uf: uFuzzy private scriptsWorkspace: string | undefined = undefined diff --git a/frontend/src/lib/components/copilot/chat/tokenUsage.ts b/frontend/src/lib/components/copilot/chat/tokenUsage.ts new file mode 100644 index 0000000000..d1b7a2e56b --- /dev/null +++ b/frontend/src/lib/components/copilot/chat/tokenUsage.ts @@ -0,0 +1,73 @@ +export interface ChatTokenUsage { + prompt: number + completion: number + total: number +} + +export function emptyChatTokenUsage(): ChatTokenUsage { + return { prompt: 0, completion: 0, total: 0 } +} + +export function addChatTokenUsage( + total: ChatTokenUsage, + usage: ChatTokenUsage | null | undefined +): ChatTokenUsage { + if (!usage) { + return total + } + + return { + prompt: total.prompt + usage.prompt, + completion: total.completion + usage.completion, + total: total.total + usage.total + } +} + +export function anthropicUsageToChatTokenUsage(usage: { + input_tokens?: number | null + output_tokens?: number | null + cache_creation_input_tokens?: number | null + cache_read_input_tokens?: number | null +} | null | undefined): ChatTokenUsage { + const prompt = + (usage?.input_tokens ?? 
0) + + (usage?.cache_creation_input_tokens ?? 0) + + (usage?.cache_read_input_tokens ?? 0) + const completion = usage?.output_tokens ?? 0 + + return { + prompt, + completion, + total: prompt + completion + } +} + +export function openAIResponsesUsageToChatTokenUsage(usage: { + input_tokens?: number | null + output_tokens?: number | null + total_tokens?: number | null +} | null | undefined): ChatTokenUsage { + const prompt = usage?.input_tokens ?? 0 + const completion = usage?.output_tokens ?? 0 + + return { + prompt, + completion, + total: usage?.total_tokens ?? prompt + completion + } +} + +export function openAICompletionsUsageToChatTokenUsage(usage: { + prompt_tokens?: number | null + completion_tokens?: number | null + total_tokens?: number | null +} | null | undefined): ChatTokenUsage { + const prompt = usage?.prompt_tokens ?? 0 + const completion = usage?.completion_tokens ?? 0 + + return { + prompt, + completion, + total: usage?.total_tokens ?? prompt + completion + } +} diff --git a/frontend/src/lib/components/copilot/lib.ts b/frontend/src/lib/components/copilot/lib.ts index 4b0c8c0e9e..2650f4d3de 100644 --- a/frontend/src/lib/components/copilot/lib.ts +++ b/frontend/src/lib/components/copilot/lib.ts @@ -25,6 +25,11 @@ import { convertOpenAIToAnthropicMessages } from './chat/anthropic' import type { Stream } from 'openai/core/streaming.mjs' import { generateRandomString } from '$lib/utils' import { copilotInfo, getCurrentModel } from '$lib/aiStore' +import { + emptyChatTokenUsage, + openAICompletionsUsageToChatTokenUsage, + type ChatTokenUsage +} from './chat/tokenUsage' export const SUPPORTED_LANGUAGES = new Set(Object.keys(GEN_CONFIG.prompts)) @@ -905,7 +910,18 @@ export async function getCompletion( // Use Completions API for other providers const client = options?.openaiClient ?? 
workspaceAIClients.getOpenaiClient() - const completion = client.chat.completions.create(config, { + const completionConfig = + (provider === 'openai' || provider === 'azure_openai' || provider === 'googleai') && + config.stream + ? { + ...config, + stream_options: { + ...(config.stream_options ?? {}), + include_usage: true + } + } + : config + const completion = client.chat.completions.create(completionConfig, { signal: abortController.signal, headers: { 'X-Provider': provider @@ -936,12 +952,16 @@ export async function parseOpenAICompletion( helpers: any, _abortController?: AbortController, // unused, for signature compatibility with parseAnthropicCompletion options?: { workspace?: string } -): Promise { +): Promise<{ shouldContinue: boolean; tokenUsage: ChatTokenUsage }> { const finalToolCalls: Record = {} let malformedFunctionCallError = false + let tokenUsage = emptyChatTokenUsage() let answer = '' for await (const chunk of completion) { + if ('usage' in chunk && chunk.usage) { + tokenUsage = openAICompletionsUsageToChatTokenUsage(chunk.usage) + } if (!('choices' in chunk && chunk.choices.length > 0 && 'delta' in chunk.choices[0])) { continue } @@ -1118,9 +1138,9 @@ export async function parseOpenAICompletion( messages.push(toolResponse) addedMessages.push(toolResponse) } else { - return false + return { shouldContinue: false, tokenUsage } } - return true + return { shouldContinue: true, tokenUsage } } export function getResponseFromEvent(part: OpenAI.Chat.Completions.ChatCompletionChunk): string {