From cdcc56461b77554964622f490ae901f170886595 Mon Sep 17 00:00:00 2001
From: centdix <40307056+centdix@users.noreply.github.com>
Date: Mon, 13 Apr 2026 14:05:46 +0200
Subject: [PATCH] feat: add black-box ai eval benchmarks (#8618)

---
 .gitignore                                    |    2 +
 ai_evals/.gitignore                           |    2 +
 ai_evals/AGENTS.md                            |  172 +++
 ai_evals/CLAUDE.md                            |    1 +
 ai_evals/README.md                            |  176 +++
 ai_evals/adapters/cli/runtime.test.ts         |   72 ++
 ai_evals/adapters/cli/runtime.ts              |  199 ++++
 ai_evals/adapters/frontend/benchmarkRunner.ts |   87 ++
 .../frontend/core/app/appEvalRunner.ts        |   92 ++
 .../frontend/core}/app/appFixtureLoader.ts    |    6 +-
 .../adapters/frontend/core/app/fileHelpers.ts |  255 +++++
 .../frontend/core/flow/fileHelpers.ts         |  161 +++
 .../frontend/core/flow/flowEvalRunner.ts      |  103 ++
 .../frontend/core/script/fileHelpers.ts       |   73 ++
 .../adapters/frontend/core/script/preview.ts  |   96 ++
 .../frontend/core/script/scriptEvalRunner.ts  |  109 ++
 .../frontend/core}/shared/baseEvalRunner.ts   |  100 +-
 .../adapters/frontend/core/shared/index.ts    |    3 +
 .../core/shared/providerConfig.test.ts        |   41 +
 .../frontend/core/shared/providerConfig.ts    |   71 ++
 .../adapters/frontend/core/shared/types.ts    |   32 +
 ai_evals/adapters/frontend/mockBackend.ts     |  270 +++++
 ai_evals/adapters/frontend/progress.ts        |  133 +++
 ai_evals/adapters/frontend/runtime.ts         |  216 ++++
 ai_evals/adapters/frontend/vitest.config.ts   |   28 +
 .../adapters/frontend/vitestAdapter.test.ts   |  165 +++
 ai_evals/bun.lock                             |  313 ++++++
 ai_evals/cases/app.yaml                       |   93 ++
 ai_evals/cases/cli.yaml                       |   66 ++
 ai_evals/cases/flow.yaml                      |  246 ++++
 ai_evals/cases/script.yaml                    |   11 +
 ai_evals/cli/index.ts                         |  295 +++++
 ai_evals/core/cases.ts                        |   71 ++
 ai_evals/core/files.ts                        |   67 ++
 ai_evals/core/judge.ts                        |  149 +++
 ai_evals/core/models.test.ts                  |   29 +
 ai_evals/core/models.ts                       |  185 +++
 ai_evals/core/results.ts                      |  296 +++++
 ai_evals/core/runSuite.ts                     |  264 +++++
 ai_evals/core/types.ts                        |  198 ++++
 ai_evals/core/validators.test.ts              |   36 +
 ai_evals/core/validators.ts                   |  997 ++++++++++++++++
 .../f/evals/hello__flow/flow.yaml             |    0
 .../f/evals/hello__flow/hello.ts              |    0
 .../f/evals/hello__flow/flow.yaml             |    0
 .../f/evals/hello__flow/hello.ts              |    2 +
 .../f/evals/hello.ts                          |    0
 .../bun-hello-script/f/evals/hello.ts         |    3 +
 .../f/evals/reuse_greeting__flow/flow.yaml    |    2 +
 .../f/lib/format_greeting.ts                  |    3 +
 .../f/evals/add_numbers.py                    |    2 +
 .../f/evals/hello__flow/flow.yaml             |   20 +
 .../f/evals/hello__flow/hello.ts              |    3 +
 .../f/evals/hello.ts                          |    3 +
 .../f/lib/format_greeting.ts                  |    3 +
 .../file_manager/backend/createFolder/main.ts |    0
 .../backend/createFolder/meta.json            |    0
 .../file_manager/backend/deleteItem/main.ts   |    0
 .../file_manager/backend/deleteItem/meta.json |    0
 .../file_manager/backend/listFiles/main.ts    |    0
 .../file_manager/backend/listFiles/meta.json  |    0
 .../file_manager/backend/listFolders/main.ts  |    0
 .../backend/listFolders/meta.json             |    0
 .../file_manager/backend/moveItem/main.ts     |    0
 .../file_manager/backend/moveItem/meta.json   |    0
 .../file_manager/backend/renameItem/main.ts   |    0
 .../file_manager/backend/renameItem/meta.json |    0
 .../frontend/components/Breadcrumb.tsx        |    0
 .../frontend/components/FileItem.tsx          |    0
 .../frontend/components/FileList.tsx          |    0
 .../frontend/components/FolderTree.tsx        |    0
 .../frontend/components/Toolbar.tsx           |    0
 .../initial/file_manager/frontend/index.tsx   |    0
 .../shopping_cart/backend/addToCart/main.ts   |    0
 .../shopping_cart/backend/addToCart/meta.json |    0
 .../backend/calculateTotal/main.ts            |    0
 .../backend/calculateTotal/meta.json          |    0
 .../shopping_cart/backend/getProducts/main.ts |    0
 .../backend/getProducts/meta.json             |    0
 .../backend/removeFromCart/main.ts            |    0
 .../backend/removeFromCart/meta.json          |    0
 .../frontend/components/Cart.tsx              |    0
 .../frontend/components/ProductCard.tsx       |    0
 .../frontend/components/ProductList.tsx       |    0
 .../initial/shopping_cart/frontend/index.tsx  |    0
 .../backend/decrementCounter/main.ts          |    0
 .../backend/decrementCounter/meta.json        |    0
 .../backend/incrementCounter/main.ts          |    0
 .../backend/incrementCounter/meta.json        |    0
 .../test1_counter_app/frontend/index.tsx      |    0
 .../flow/expected/test0_sum_two_numbers.json  |   31 +
 .../frontend}/flow/expected/test1.json        |    0
 .../expected/test10_while_loop_counter.json   |   30 +
 .../expected/test11_preprocessor_failure.json |   36 +
 .../flow/expected/test12_approval_step.json   |   44 +
 .../expected/test1_reuse_existing_script.json |   39 +
 .../frontend}/flow/expected/test2.json        |    0
 .../expected/test2_call_existing_subflow.json |   39 +
 .../frontend}/flow/expected/test3.json        |    0
 .../expected/test3_branchone_routing.json     |   24 +
 .../frontend}/flow/expected/test4.json        |    0
 .../flow/expected/test5_modify_simple.json    |    0
 .../flow/expected/test6_modify_medium.json    |    0
 .../flow/expected/test7_modify_complex.json   |    0
 .../test1_reuse_existing_script_initial.json  |   29 +
 .../test2_call_existing_subflow_initial.json  |   49 +
 .../frontend}/flow/initial/test5_initial.json |    0
 .../frontend}/flow/initial/test6_initial.json |    0
 .../frontend}/flow/initial/test7_initial.json |    0
 .../script/expected/test1_greet_user.json     |    8 +
 .../script/initial/test1_empty_bun.json       |    8 +
 ai_evals/history/app.jsonl                    |    3 +
 ai_evals/history/cli.jsonl                    |    2 +
 ai_evals/history/flow.jsonl                   |    3 +
 ai_evals/history/script.jsonl                 |    3 +
 ai_evals/modes/app.ts                         |   79 ++
 ai_evals/modes/cli.ts                         |  162 +++
 ai_evals/modes/flow.ts                        |  104 ++
 ai_evals/modes/frontendCommon.test.ts         |   28 +
 ai_evals/modes/frontendCommon.ts              |   23 +
 ai_evals/modes/script.ts                      |   61 +
 ai_evals/package.json                         |   19 +
 cli/README.md                                 |   22 +
 cli/TESTING.md                                |   14 +
 cli/src/commands/init/init.ts                 |  113 +-
 cli/src/guidance/writer.ts                    |  269 +++++
 cli/test-skills/README.md                     |  103 --
 cli/test-skills/bun.lock                      |   61 -
 cli/test-skills/package.json                  |   16 -
 cli/test-skills/src/skill-invocation.test.ts  |   91 --
 cli/test-skills/src/test-utils.ts             |  137 ---
 cli/test-skills/tsconfig.json                 |   17 -
 cli/test/guidance_writer_unit.test.ts         |  148 +++
 docs/failing-tests.md                         |   33 +
 docs/system-prompt-testing-plan.md            | 1000 +++++++++++++++++
 docs/system-prompt-testing-status.md          |  129 +++
 .../copilot/chat/AIChatManager.svelte.ts      |    7 +-
 .../chat/__tests__/app/appChat.eval.test.ts   |  303 -----
 .../chat/__tests__/app/appEvalComparison.ts   |  171 ---
 .../chat/__tests__/app/appEvalHelpers.ts      |  147 ---
 .../chat/__tests__/app/appEvalRunner.ts       |  177 ---
 .../chat/__tests__/app/appResultsWriter.ts    |  247 ----
 .../chat/__tests__/app/variants/baseline.ts   |   12 -
 .../chat/__tests__/app/variants/index.ts      |    6 -
 .../__tests__/app/variants/streamlined.ts     |  144 ---
 .../chat/__tests__/flow/flowChat.eval.test.ts |  449 --------
 .../chat/__tests__/flow/flowEvalComparison.ts |   68 --
 .../chat/__tests__/flow/flowEvalHelpers.ts    |  104 --
 .../chat/__tests__/flow/flowEvalRunner.ts     |  186 ---
 .../chat/__tests__/flow/variants/baseline.ts  |   12 -
 .../chat/__tests__/flow/variants/index.ts     |    6 -
 .../flow/variants/minimal-single-tool.ts      |  402 -------
 .../chat/__tests__/shared/baseLLMEvaluator.ts |  135 ---
 .../__tests__/shared/baseResultsWriter.ts     |  169 ---
 .../chat/__tests__/shared/baseVariants.ts     |  108 --
 .../copilot/chat/__tests__/shared/index.ts    |   28 -
 .../copilot/chat/__tests__/shared/types.ts    |  107 --
 .../lib/components/copilot/chat/anthropic.ts  |   15 +-
 .../lib/components/copilot/chat/chatLoop.ts   |   21 +-
 .../copilot/chat/flow/FlowAIChat.svelte       |   13 +-
 .../lib/components/copilot/chat/flow/core.ts  |   24 +-
 .../copilot/chat/flow/inlineScriptsUtils.ts   |  106 +-
 .../copilot/chat/openai-responses.ts          |   18 +-
 .../src/lib/components/copilot/chat/shared.ts |    2 -
 .../lib/components/copilot/chat/tokenUsage.ts |   73 ++
 frontend/src/lib/components/copilot/lib.ts    |   28 +-
 166 files changed, 8640 insertions(+), 3647 deletions(-)
 create mode 100644 ai_evals/.gitignore
 create mode 100644 ai_evals/AGENTS.md
 create mode 100644 ai_evals/CLAUDE.md
 create mode 100644 ai_evals/README.md
 create mode 100644 ai_evals/adapters/cli/runtime.test.ts
 create mode 100644 ai_evals/adapters/cli/runtime.ts
 create mode 100644 ai_evals/adapters/frontend/benchmarkRunner.ts
 create mode 100644 ai_evals/adapters/frontend/core/app/appEvalRunner.ts
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/adapters/frontend/core}/app/appFixtureLoader.ts (97%)
 create mode 100644 ai_evals/adapters/frontend/core/app/fileHelpers.ts
 create mode 100644 ai_evals/adapters/frontend/core/flow/fileHelpers.ts
 create mode 100644 ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts
 create mode 100644 ai_evals/adapters/frontend/core/script/fileHelpers.ts
 create mode 100644 ai_evals/adapters/frontend/core/script/preview.ts
 create mode 100644 ai_evals/adapters/frontend/core/script/scriptEvalRunner.ts
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/adapters/frontend/core}/shared/baseEvalRunner.ts (66%)
 create mode 100644 ai_evals/adapters/frontend/core/shared/index.ts
 create mode 100644 ai_evals/adapters/frontend/core/shared/providerConfig.test.ts
 create mode 100644 ai_evals/adapters/frontend/core/shared/providerConfig.ts
 create mode 100644 ai_evals/adapters/frontend/core/shared/types.ts
 create mode 100644 ai_evals/adapters/frontend/mockBackend.ts
 create mode 100644 ai_evals/adapters/frontend/progress.ts
 create mode 100644 ai_evals/adapters/frontend/runtime.ts
 create mode 100644 ai_evals/adapters/frontend/vitest.config.ts
 create mode 100644 ai_evals/adapters/frontend/vitestAdapter.test.ts
 create mode 100644 ai_evals/bun.lock
 create mode 100644 ai_evals/cases/app.yaml
 create mode 100644 ai_evals/cases/cli.yaml
 create mode 100644 ai_evals/cases/flow.yaml
 create mode 100644 ai_evals/cases/script.yaml
 create mode 100644 ai_evals/cli/index.ts
 create mode 100644 ai_evals/core/cases.ts
 create mode 100644 ai_evals/core/files.ts
 create mode 100644 ai_evals/core/judge.ts
 create mode 100644 ai_evals/core/models.test.ts
 create mode 100644 ai_evals/core/models.ts
 create mode 100644 ai_evals/core/results.ts
 create mode 100644 ai_evals/core/runSuite.ts
 create mode 100644 ai_evals/core/types.ts
 create mode 100644 ai_evals/core/validators.test.ts
 create mode 100644 ai_evals/core/validators.ts
 create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml
 create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts
 create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/flow.yaml
 create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/hello.ts
 create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-script-uppercase/f/evals/hello.ts
 create mode 100644 ai_evals/fixtures/cli/expected/bun-hello-script/f/evals/hello.ts
 create mode 100644 ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/evals/reuse_greeting__flow/flow.yaml
 create mode 100644 ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/lib/format_greeting.ts
 create mode 100644 ai_evals/fixtures/cli/expected/python-add-numbers-script/f/evals/add_numbers.py
 create mode 100644 ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml
 create mode 100644 ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts
 create mode 100644 ai_evals/fixtures/cli/initial/bun-hello-script-uppercase/f/evals/hello.ts
 create mode 100644 ai_evals/fixtures/cli/initial/flow-reuse-existing-script/f/lib/format_greeting.ts
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/createFolder/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/createFolder/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/deleteItem/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/deleteItem/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/listFiles/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/listFiles/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/listFolders/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/listFolders/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/moveItem/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/moveItem/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/renameItem/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/backend/renameItem/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/components/Breadcrumb.tsx (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/components/FileItem.tsx (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/components/FileList.tsx (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/components/FolderTree.tsx (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/components/Toolbar.tsx (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/file_manager/frontend/index.tsx (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/addToCart/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/addToCart/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/calculateTotal/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/calculateTotal/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/getProducts/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/getProducts/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/removeFromCart/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/backend/removeFromCart/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/frontend/components/Cart.tsx (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/frontend/components/ProductCard.tsx (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/frontend/components/ProductList.tsx (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/shopping_cart/frontend/index.tsx (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/test1_counter_app/backend/decrementCounter/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/test1_counter_app/backend/decrementCounter/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/test1_counter_app/backend/incrementCounter/main.ts (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/test1_counter_app/backend/incrementCounter/meta.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/app/initial/test1_counter_app/frontend/index.tsx (100%)
 create mode 100644 ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test1.json (100%)
 create mode 100644 ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json
 create mode 100644 ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json
 create mode 100644 ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json
 create mode 100644 ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test2.json (100%)
 create mode 100644 ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test3.json (100%)
 create mode 100644 ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test4.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test5_modify_simple.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test6_modify_medium.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/expected/test7_modify_complex.json (100%)
 create mode 100644 ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json
 create mode 100644 ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/initial/test5_initial.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/initial/test6_initial.json (100%)
 rename {frontend/src/lib/components/copilot/chat/__tests__ => ai_evals/fixtures/frontend}/flow/initial/test7_initial.json (100%)
 create mode 100644 ai_evals/fixtures/frontend/script/expected/test1_greet_user.json
 create mode 100644 ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json
 create mode 100644 ai_evals/history/app.jsonl
 create mode 100644 ai_evals/history/cli.jsonl
 create mode 100644 ai_evals/history/flow.jsonl
 create mode 100644 ai_evals/history/script.jsonl
 create mode 100644 ai_evals/modes/app.ts
 create mode 100644 ai_evals/modes/cli.ts
 create mode 100644 ai_evals/modes/flow.ts
 create mode 100644 ai_evals/modes/frontendCommon.test.ts
 create mode 100644 ai_evals/modes/frontendCommon.ts
 create mode 100644 ai_evals/modes/script.ts
 create mode 100644 ai_evals/package.json
 create mode 100644 cli/src/guidance/writer.ts
 delete mode 100644 cli/test-skills/README.md
 delete mode 100644 cli/test-skills/bun.lock
 delete mode 100644 cli/test-skills/package.json
 delete mode 100644 cli/test-skills/src/skill-invocation.test.ts
 delete mode 100644 cli/test-skills/src/test-utils.ts
 delete mode 100644 cli/test-skills/tsconfig.json
 create mode 100644 cli/test/guidance_writer_unit.test.ts
 create mode 100644 docs/failing-tests.md
 create mode 100644 docs/system-prompt-testing-plan.md
 create mode 100644 docs/system-prompt-testing-status.md
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/appChat.eval.test.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/appEvalComparison.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/appEvalHelpers.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/appEvalRunner.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/appResultsWriter.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/variants/baseline.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/variants/index.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/app/variants/streamlined.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/flowChat.eval.test.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalComparison.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalHelpers.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalRunner.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/variants/baseline.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/variants/index.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/flow/variants/minimal-single-tool.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/shared/baseLLMEvaluator.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/shared/baseResultsWriter.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/shared/baseVariants.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/shared/index.ts
 delete mode 100644 frontend/src/lib/components/copilot/chat/__tests__/shared/types.ts
 create mode 100644 frontend/src/lib/components/copilot/chat/tokenUsage.ts

diff --git a/.gitignore b/.gitignore
index a080d58795..b2741131a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,8 @@ rust-client/Cargo.toml
 backend/target
 frontend/node_modules
 typescript-client/node_modules
+ai_evals/node_modules
+ai_evals/results/
 frontend/.svelte-kit
 backend/chrome_profiler.json
 .fast-check/
diff --git a/ai_evals/.gitignore b/ai_evals/.gitignore
new file mode 100644
index 0000000000..9263598939
--- /dev/null
+++ b/ai_evals/.gitignore
@@ -0,0 +1,2 @@
+.env
+results/
diff --git a/ai_evals/AGENTS.md b/ai_evals/AGENTS.md
new file mode 100644
index 0000000000..096baf5b58
--- /dev/null
+++ b/ai_evals/AGENTS.md
@@ -0,0 +1,172 @@
+# AI Evals Authoring Guide
+
+This folder contains black-box benchmark cases for:
+
+- `flow`
+- `app`
+- `script`
+- `cli`
+
+The goal is to test the current production prompts and guidance with realistic user requests, not to test one exact implementation shape.
+
+## Core rules
+
+1. Write prompts like a real user request.
+2. Prefer behavior, inputs, constraints, and outcomes over internal implementation details.
+3. Keep deterministic validation narrow and hard.
+4. Put semantic expectations in `judgeChecklist`.
+5. Use `expected` fixtures only when exact structure really matters.
+
+## Prompt writing
+
+Prompts should sound like something a user would naturally ask.
+
+Good:
+
+- "Create a flow that routes support requests based on customer tier."
+- "Add a reset button that sets the counter back to 0."
+- "Create a flow that reuses the existing greeting script instead of duplicating the logic."
+
+Bad:
+
+- "Use `branchone` with 3 branches and a default branch."
+- "Create a `rawscript` step with this exact topology."
+- "This is a benchmark harness."
+
+Do not write prompts as if the user knows Windmill internals unless the case is explicitly testing a power-user workflow.
+
+## Flow-specific rules
+
+The main principle for flow cases:
+
+- flow prompts should read like requests from a user who does not know the product internals
+- the user should ask for behavior, not for `branchone`, `branchall`, `rawscript`, `preprocessor_module`, `failure_module`, exact graph topology, or other internal constructs
+
+That means:
+
+- creation cases should describe the business behavior and expected result
+- modification cases may mention existing step names, because the user can see the current flow
+- only mention special Windmill constructs when the case is explicitly about those constructs
+
+Examples:
+
+- acceptable creation prompt:
+  "Create a purchase approval flow that pauses for approval and asks the approver for a comment."
+- avoid:
+  "Create a suspend step with one required event and a resume form."
+
+For flow cases, do not fail a case just because the model chose a different valid topology.
+
+## App-specific rules
+
+App prompts should focus on user-visible behavior:
+
+- what the UI should let the user do
+- what should persist
+- what backend behavior is needed
+
+Avoid prompting in terms of React structure, component names, or implementation unless the case is specifically about editing an existing app.
+
+## CLI-specific rules
+
+CLI prompts can be more explicit about paths and file names because real CLI users often do specify them.
+
+Still, avoid benchmark phrasing. The prompt should read like a repo task, not a harness instruction.
+
+When relevant, ask the assistant to tell the user which `wmill` commands to run next. That is part of the benchmarked behavior.
+
+## Deterministic validation
+
+Use deterministic validation only for hard failures such as:
+
+- missing required files
+- unexpected extra files when the prompt says not to create them
+- syntax errors
+- unresolved flow refs
+- missing required special modules or suspend config
+- obvious artifact corruption
+
+Do not use deterministic validation to enforce one preferred implementation for broad creation tasks.
+
+Examples of bad hard checks:
+
+- exact step topology for a creation flow
+- exact branch structure when the prompt only asked for routing behavior
+- exact input shape when multiple reasonable shapes are acceptable
+
+## Judge checklist
+
+Every non-trivial case should have a `judgeChecklist`.
+
+The checklist should capture:
+
+- the user-visible behavior that must be present
+- important constraints
+- key completion criteria
+
+The checklist should not duplicate low-level implementation details unless they are truly required by the task.
+
+Good checklist items:
+
+- "the flow calculates the order total with 8% tax"
+- "the app persists recipes appropriately for a raw Windmill app"
+- "the flow reuses the existing workspace script instead of rewriting the logic"
+
+Bad checklist items:
+
+- "uses `branchone`"
+- "contains a `rawscript` node"
+
+## When to use `expected`
+
+Use `expected` fixtures when the case is structure-sensitive, for example:
+
+- exact file creation
+- exact script content
+- modification cases where a specific file must change in a specific way
+- cases where preserving an existing structure is part of the requirement
+
+Do not use a full `expected` artifact as the semantic oracle for broad creation tasks when multiple valid outputs should pass.
+
+## When to use `initial`
+
+Use `initial` when the benchmark is about:
+
+- editing an existing artifact
+- reusing existing workspace assets
+- preserving existing behavior while adding a change
+
+If the case is greenfield, prefer no `initial`.
+
+## Case design ladder
+
+Prefer suites that get gradually harder:
+
+1. trivial create case
+2. realistic create case
+3. reuse-existing-assets case
+4. modification case
+5. refactor case
+6. edge-case or niche product behavior
+
+The last cases in a suite should cover unusual or product-specific behavior.
+
+## Anti-patterns
+
+Avoid these:
+
+- benchmark framing in prompts
+- over-specified internal topology for creation tasks
+- judge checklists that just restate implementation details
+- deterministic validation that encodes one preferred solution
+- fixtures that are so minimal or brittle that they create false negatives
+
+## Before adding a case
+
+Ask:
+
+1. Would a real user plausibly write this prompt?
+2. If the model solves it in a different valid way, would the case still pass?
+3. Are the hard deterministic checks only catching objectively broken output?
+4. Does the `judgeChecklist` describe the real success criteria?
+5. If this case fails, will the reason be understandable from the saved artifacts?
diff --git a/ai_evals/CLAUDE.md b/ai_evals/CLAUDE.md
new file mode 100644
index 0000000000..eef4bd20cf
--- /dev/null
+++ b/ai_evals/CLAUDE.md
@@ -0,0 +1 @@
+@AGENTS.md
\ No newline at end of file
diff --git a/ai_evals/README.md b/ai_evals/README.md
new file mode 100644
index 0000000000..353bee9dc3
--- /dev/null
+++ b/ai_evals/README.md
@@ -0,0 +1,176 @@
+# AI Evals
+
+Small benchmark runner for the four Windmill AI generation modes:
+
+- `cli`
+- `flow`
+- `script`
+- `app`
+
+The benchmark always tests the current production prompts, tools, and guidance in this checkout.
+
+Each attempt runs:
+
+1. the real production path
+2. deterministic validation
+3. LLM judging
+
+## Install
+
+```bash
+cd ai_evals
+bun install
+```
+
+Frontend modes also require the frontend dependencies, installed from the repository root:
+
+```bash
+cd frontend
+bun install
+```
+
+## Commands
+
+List model aliases:
+
+```bash
+cd ai_evals
+bun run cli -- models
+```
+
+List cases:
+
+```bash
+cd ai_evals
+bun run cli -- cases
+bun run cli -- cases flow
+```
+
+Run benchmarks:
+
+```bash
+cd ai_evals
+bun run cli -- run flow
+bun run cli -- run flow flow-test4-order-processing-loop --model opus
+bun run cli -- run flow flow-test0-sum-two-numbers --models haiku,opus,4o
+bun run cli -- run flow flow-test0-sum-two-numbers --runs 3 --verbose
+bun run cli -- run flow --record
+bun run cli -- run cli bun-hello-script
+```
+
+Public CLI surface:
+
+- `models`
+- `cases [mode]`
+- `run <mode> [caseIds...]`
+
+`run` options:
+
+- `--runs <n>`: repeat each case `n` times
+- `--output <path>`: custom result JSON path
+- `--model <alias>`: choose the model under test
+- `--models <a,b,c>`: run the same cases sequentially against several model aliases
+- `--verbose`: stream assistant output for frontend runs
+- `--record`: append a compact tracked summary line to `ai_evals/history/<mode>.jsonl` for full-suite runs only
+
+## Models
+
+Use `bun run cli -- models` to see the current aliases.
+
+Today:
+
+- `haiku`
+- `sonnet`
+- `opus`
+- `4o`
+- `gemini-flash`
+- `gemini-pro`
+- `gemini-3-flash-preview`
+- `gemini-3.1-pro-preview`
+
+Notes:
+
+- the command also prints accepted alias spellings such as `gpt-4o`, `claude-opus-4.6`, and `claude-haiku-4.5`
+- frontend modes (`flow`, `script`, `app`) can use Anthropic, OpenAI, and Gemini-backed aliases
+- `cli` mode always uses the Anthropic agent SDK, so only Anthropic aliases are valid there
+- the judge model is separate and currently defaults to `claude-sonnet-4-6`
+
+## Case Format
+
+Cases live in one YAML file per mode under `ai_evals/cases/`.
+
+Example shape (`initial` and `expected` are optional):
+
+```yaml
+- id: flow-test0-sum-two-numbers
+  prompt: |-
+    Create a flow that takes two numbers, `a` and `b`, and returns their sum.
+  initial: ai_evals/fixtures/...
+  expected: ai_evals/fixtures/...
+```
+
+Optional fields:
+
+- `initial`: starting state fixture
+- `expected`: expected artifact fixture
+- `validate`: extra deterministic validation rules
+
+For `flow` mode, `validate` can express requirements such as:
+
+- accepted input schema shapes
+- required `results.*` reference validity
+- required module/code/input characteristics
+
+For `flow` mode, an `initial` fixture can also include a benchmark workspace catalog of
+existing scripts and flows. That lets the real `search_workspace` and
+`get_runnable_details` tools discover reusable workspace runnables during evals.
+
+## Results And Artifacts
+
+Every run writes:
+
+- a summary JSON under `ai_evals/results/`
+- generated artifacts in a sibling directory
+
+If `--record` is used, the CLI also appends one compact JSON line to:
+
+- `ai_evals/history/flow.jsonl`
+- `ai_evals/history/script.jsonl`
+- `ai_evals/history/app.jsonl`
+- `ai_evals/history/cli.jsonl`
+
+Each recorded line contains:
+
+- run metadata (`createdAt`, `gitSha`, `mode`, `runModel`, `judgeModel`)
+- suite totals (`caseCount`, `attemptCount`, `passedAttempts`, `passRate`, `averageDurationMs`, `averageJudgeScore`)
+- average token usage (`averageTokenUsagePerAttempt`)
+- per-case metrics under `cases[]` (`averageDurationMs`, `averageJudgeScore`, `averageTokenUsagePerAttempt`, pass rate)
+- `failedCaseIds`
+
+Example:
+
+- summary: `ai_evals/results/2026-04-09T09-40-33.051Z__flow.json`
+- artifacts: `ai_evals/results/2026-04-09T09-40-33.051Z__flow/`
+
+Typical artifacts by mode:
+
+- `flow`: `flow.json`
+- `script`: `script.json` plus the generated script file
+- `app`: `app.json` plus frontend/backend files
+- `cli`: `assistant-output.txt` plus generated workspace files
+
+## Layout
+
+- `cases/`: one YAML file per mode
+- `fixtures/`: initial and expected fixtures
+- `core/`: shared loading, model resolution, validation, judging, and result writing
+- `modes/`: one runner per mode
+- `history/`: optional tracked pass-rate history written by `run --record`, one JSONL file per mode
+- `results/`: local benchmark output and artifacts
+
+## Notes
+
+- Frontend modes reuse the production frontend chat code through the Vitest bridge.
+- CLI mode creates an isolated workspace, writes the current checkout guidance into it, and benchmarks the real skills / `AGENTS.md` flow.
+- Frontend progress streams live while the benchmark is running.
+- Deterministic validators should stay focused on real correctness constraints, not one exact implementation shape.
diff --git a/ai_evals/adapters/cli/runtime.test.ts b/ai_evals/adapters/cli/runtime.test.ts
new file mode 100644
index 0000000000..aedbcba58d
--- /dev/null
+++ b/ai_evals/adapters/cli/runtime.test.ts
@@ -0,0 +1,72 @@
+import { describe, expect, it } from "bun:test";
+import {
+  anthropicUsageToBenchmarkTokenUsage,
+  extractCliResultTokenUsage,
+} from "./runtime";
+// Unit tests for converting snake_case usage counters into benchmark token usage.
+describe("anthropicUsageToBenchmarkTokenUsage", () => {
+  it("includes cache tokens in prompt usage", () => {
+    expect(
+      anthropicUsageToBenchmarkTokenUsage({
+        input_tokens: 120,
+        output_tokens: 45,
+        cache_creation_input_tokens: 30,
+        cache_read_input_tokens: 5,
+      })
+    ).toEqual({
+      prompt: 155, // 120 input + 30 cache creation + 5 cache read
+      completion: 45,
+      total: 200, // prompt + completion
+    });
+  });
+
+  it("returns null when usage is absent", () => {
+    expect(anthropicUsageToBenchmarkTokenUsage(null)).toBeNull();
+  });
+});
+// Unit tests for reading usage from a "result" message, including the per-model fallback.
+describe("extractCliResultTokenUsage", () => {
+  it("reads aggregate usage from the SDK result event", () => {
+    expect(
+      extractCliResultTokenUsage({
+        type: "result",
+        usage: {
+          input_tokens: 400,
+          output_tokens: 120,
+          cache_creation_input_tokens: 50,
+          cache_read_input_tokens: 25,
+        },
+      })
+    ).toEqual({
+      prompt: 475, // 400 + 50 + 25
+      completion: 120,
+      total: 595,
+    });
+  });
+
+  it("falls back to modelUsage when aggregate usage is unavailable", () => {
+    expect(
+      extractCliResultTokenUsage({
+        type: "result",
+        modelUsage: {
+          opus: {
+            inputTokens: 200,
+            outputTokens: 60,
+            cacheCreationInputTokens: 10,
+            cacheReadInputTokens: 5,
+          },
+          haiku: {
+            inputTokens: 80,
+            outputTokens: 20,
+            cacheCreationInputTokens: 0,
+            cacheReadInputTokens: 15,
+          },
+        },
+      })
+    ).toEqual({
+      prompt: 310, // opus 215 + haiku 95
+      completion: 80, // 60 + 20
+      total: 390,
+    });
+  });
+});
diff --git a/ai_evals/adapters/cli/runtime.ts b/ai_evals/adapters/cli/runtime.ts
new file mode 100644
index 0000000000..3e184bae8d
--- /dev/null
+++ b/ai_evals/adapters/cli/runtime.ts
@@ -0,0 +1,199 @@
+import { query, type Options } from "@anthropic-ai/claude-agent-sdk";
+import { join } from "path";
+import { fileURLToPath } from "url";
+import { getCliEvalModel, resolveEvalModel, type CliEvalModelConfig } from "../../core/models";
+import type { BenchmarkTokenUsage } from "../../core/types";
+/** One `tool_use` block emitted by the assistant, stamped with the time it was observed. */
+export interface ToolInvocation {
+  tool: string;
+  input: Record<string, unknown>;
+  timestamp: number; // Date.now() at the moment the block was captured
+}
+/** Everything captured from one CLI prompt run. */
+export interface PromptRunResult {
+  toolsUsed: ToolInvocation[];
+  skillsInvoked: string[]; // `skill` inputs of the captured Skill tool calls
+  output: string;
+  durationMs: number;
+  assistantMessageCount: number;
+  tokenUsage: BenchmarkTokenUsage | null; // null when no result message yielded usage
+}
+/** Snake_case usage counters as read from a result message's `usage` field; each may be missing. */
+interface AnthropicUsageLike {
+  input_tokens?: number | null;
+  output_tokens?: number | null;
+  cache_creation_input_tokens?: number | null;
+  cache_read_input_tokens?: number | null;
+}
+/** CamelCase counters as read from one entry of a result message's `modelUsage` map. */
+interface AnthropicModelUsageLike {
+  inputTokens?: number | null;
+  outputTokens?: number | null;
+  cacheCreationInputTokens?: number | null;
+  cacheReadInputTokens?: number | null;
+}
+/** The subset of a `result` message that token-usage extraction inspects. */
+interface CliResultMessageLike {
+  type?: string;
+  usage?: AnthropicUsageLike | null;
+  modelUsage?: Record<string, AnthropicModelUsageLike> | null;
+}
+// Repository root: three directories above the directory that contains this module.
+const REPO_ROOT = fileURLToPath(new URL("../../../", import.meta.url));
+export const DEFAULT_CLI_EVAL_MODEL: CliEvalModelConfig = getCliEvalModel(resolveEvalModel("cli"));
+/** Returns the `system_prompts/auto-generated/skills` directory path under the repository root. */
+export function getGeneratedSkillsSource(): string {
+  return join(REPO_ROOT, "system_prompts", "auto-generated", "skills");
+}
+
+/**
+ * Converts Anthropic-style usage counters into benchmark token usage, or `null` when `usage`
+ * is absent. Cache-creation and cache-read tokens count as prompt tokens; missing counters as 0.
+ */
+export function anthropicUsageToBenchmarkTokenUsage(
+  usage: AnthropicUsageLike | null | undefined
+): BenchmarkTokenUsage | null {
+  if (!usage) {
+    return null;
+  }
+  const promptParts = [
+    usage.input_tokens,
+    usage.cache_creation_input_tokens,
+    usage.cache_read_input_tokens,
+  ];
+  const prompt = promptParts.reduce<number>((sum, part) => sum + (part ?? 0), 0);
+  const completion = usage.output_tokens ?? 0;
+  return { prompt, completion, total: prompt + completion };
+}
+
+/**
+ * Extracts token usage from a CLI `result` message.
+ *
+ * The aggregate `usage` field wins when present; otherwise the per-model
+ * `modelUsage` entries are summed, counting cache tokens as prompt tokens.
+ *
+ * @param message - any streamed message; anything but a `result` object yields `null`
+ * @returns the usage totals, or `null` when no usage data is available
+ */
+export function extractCliResultTokenUsage(message: unknown): BenchmarkTokenUsage | null {
+  if (!message || typeof message !== "object") {
+    return null;
+  }
+  const candidate = message as CliResultMessageLike;
+  if (candidate.type !== "result") {
+    return null;
+  }
+
+  const aggregate = anthropicUsageToBenchmarkTokenUsage(candidate.usage);
+  if (aggregate) {
+    return aggregate;
+  }
+  const byModel = candidate.modelUsage;
+  if (!byModel || typeof byModel !== "object") {
+    return null;
+  }
+
+  // Entries that are not objects carry no counters and are skipped.
+  const entries = Object.values(byModel).filter(
+    (entry) => Boolean(entry) && typeof entry === "object"
+  );
+  if (entries.length === 0) {
+    return null;
+  }
+
+  let prompt = 0;
+  let completion = 0;
+  for (const entry of entries) {
+    prompt +=
+      (entry.inputTokens ?? 0) +
+      (entry.cacheCreationInputTokens ?? 0) +
+      (entry.cacheReadInputTokens ?? 0);
+    completion += entry.outputTokens ?? 0;
+  }
+  return { prompt, completion, total: prompt + completion };
+}
+/** Runs `prompt` through the agent SDK `query` in `cwd` and captures tool calls, skill names, text and token usage. */
+export async function runPromptAndCapture(
+  prompt: string,
+  cwd: string,
+  maxTurns: number = 3, // passed to the SDK as `options.maxTurns`
+  modelConfig: CliEvalModelConfig = DEFAULT_CLI_EVAL_MODEL
+): Promise<PromptRunResult> {
+  const toolsUsed: ToolInvocation[] = [];
+  const skillsInvoked: string[] = [];
+  let output = "";
+  let assistantMessageCount = 0;
+  let tokenUsage: BenchmarkTokenUsage | null = null;
+  const startedAt = Date.now();
+  // SDK options: project setting sources only, plus an explicit tool allow-list.
+  const options: Options = {
+    cwd,
+    model: modelConfig.model,
+    maxTurns,
+    settingSources: ["project"],
+    allowedTools: ["Skill", "Read", "Glob", "Grep", "Bash", "Write", "Edit"]
+  };
+  // Drain the message stream: assistant messages feed tool/text capture, result messages feed usage.
+  for await (const message of query({ prompt, options })) {
+    if (message.type === "assistant") {
+      assistantMessageCount += 1;
+      const content = message.message?.content;
+      if (Array.isArray(content)) {
+        for (const block of content) {
+          if (block.type === "tool_use") {
+            toolsUsed.push({
+              tool: block.name,
+              input: block.input as Record<string, unknown>,
+              timestamp: Date.now()
+            });
+            // Skill calls are also recorded by their `skill` input (see `skillsInvoked`).
+            if (block.name === "Skill" && typeof block.input === "object" && block.input !== null) {
+              const skillInput = block.input as { skill?: string };
+              if (skillInput.skill) {
+                skillsInvoked.push(skillInput.skill);
+              }
+            }
+          } else if (block.type === "text") {
+            output += block.text;
+          }
+        }
+      }
+    } else if (message.type === "result") {
+      const resultMessage = message as { result?: string };
+      tokenUsage = extractCliResultTokenUsage(message) ?? tokenUsage; // keep the earlier value if this result has none
+      if (typeof resultMessage.result === "string") { // NOTE(review): may repeat text already captured from assistant blocks — confirm
+        output += resultMessage.result;
+      }
+    }
+  }
+
+  return {
+    toolsUsed,
+    skillsInvoked,
+    output,
+    durationMs: Date.now() - startedAt,
+    assistantMessageCount,
+    tokenUsage,
+  };
+}
+/** True when any invoked skill name contains `skillName` as a substring (an exact match included). */
+export function wasSkillInvoked(result: PromptRunResult, skillName: string): boolean {
+  return result.skillsInvoked.some((invoked) => invoked.includes(skillName));
+}
+/** True when at least one recorded tool call used the tool named exactly `toolName`. */
+export function wasToolUsed(result: PromptRunResult, toolName: string): boolean {
+  return result.toolsUsed.findIndex(({ tool }) => tool === toolName) !== -1;
+}
+/** Formats a CLI model config as its `provider:model` run label. */
+export function formatCliRunModelLabel(modelConfig: CliEvalModelConfig): string {
+  return modelConfig.provider + ":" + modelConfig.model;
+}
+/**
+ * Collects, in call order, the input of every recorded call to `toolName`.
+ */
+export function getToolInputs(
+  result: PromptRunResult,
+  toolName: string
+): Record<string, unknown>[] {
+  return result.toolsUsed.flatMap((call) => (call.tool === toolName ? [call.input] : []));
+}
diff --git a/ai_evals/adapters/frontend/benchmarkRunner.ts b/ai_evals/adapters/frontend/benchmarkRunner.ts
new file mode 100644
index 0000000000..33b1555654
--- /dev/null
+++ b/ai_evals/adapters/frontend/benchmarkRunner.ts
@@ -0,0 +1,87 @@
+import { loadSelectedCases } from "../../core/cases";
+import {
+  formatRunModelLabel,
+  getFrontendEvalModel,
+  resolveEvalModel,
+} from "../../core/models";
+import { buildRunResult } from "../../core/results";
+import { runSuite } from "../../core/runSuite";
+import type { BenchmarkRunResult, ModeRunner } from "../../core/types";
+import { emitFrontendBenchmarkProgress } from "./progress";
+import { createAppModeRunner } from "../../modes/app";
+import { createFlowModeRunner } from "../../modes/flow";
+import { createScriptModeRunner } from "../../modes/script";
+import { DEFAULT_JUDGE_MODEL } from "../../core/judge";
+/** Benchmark modes accepted by the frontend runner. */
+export type FrontendBenchmarkMode = "flow" | "app" | "script";
+/**
+ * Runs one frontend benchmark suite configured entirely through the
+ * `WMILL_FRONTEND_AI_EVAL_*` environment variables and returns the run result.
+ * Verbose runs force a concurrency of 1; otherwise concurrency is left to `runSuite`.
+ * Throws when the mode or run count is missing or invalid, or the case ids are malformed.
+ */
+export async function runFrontendBenchmarkFromEnv(): Promise<BenchmarkRunResult> {
+  const env = process.env;
+  const mode = parseMode(env.WMILL_FRONTEND_AI_EVAL_MODE);
+  const caseIds = parseOptionalJsonStringArray(env.WMILL_FRONTEND_AI_EVAL_CASE_IDS);
+  const runs = parsePositiveInteger(env.WMILL_FRONTEND_AI_EVAL_RUNS, "WMILL_FRONTEND_AI_EVAL_RUNS");
+  const emitProgress = env.WMILL_FRONTEND_AI_EVAL_PROGRESS === "1";
+  const verbose = env.WMILL_FRONTEND_AI_EVAL_VERBOSE === "1";
+  const model = resolveEvalModel(mode, env.WMILL_FRONTEND_AI_EVAL_MODEL);
+
+  const selectedCases = await loadSelectedCases(mode, caseIds);
+  const modeRunner = getModeRunner(mode, getFrontendEvalModel(model));
+  const runModel = formatRunModelLabel(mode, model);
+  const judgeModel = DEFAULT_JUDGE_MODEL;
+  const caseResults = await runSuite({
+    modeRunner,
+    cases: selectedCases,
+    runs,
+    runModel,
+    judgeModel,
+    concurrency: verbose ? 1 : undefined,
+    verbose,
+    onProgress: emitProgress ? (event) => emitFrontendBenchmarkProgress(event) : undefined,
+  });
+  return buildRunResult({ mode, runs, runModel, judgeModel, caseResults });
+}
+/** Creates the runner for `mode`; throws, instead of returning undefined, if the mode has no case. */
+function getModeRunner(
+  mode: FrontendBenchmarkMode, model: ReturnType<typeof getFrontendEvalModel>
+): ModeRunner<any, any, any> {
+  switch (mode) {
+    case "flow": return createFlowModeRunner(model);
+    case "app": return createAppModeRunner(model);
+    case "script": return createScriptModeRunner(model);
+    default: {
+      const unsupported: never = mode; // stops compiling when a mode is added without a case
+      throw new Error(`Unsupported frontend benchmark mode: ${String(unsupported)}`);
+    }
+  }
+}
+/** Validates the raw mode string; throws for anything other than "flow", "app" or "script". */
+function parseMode(value: string | undefined): FrontendBenchmarkMode {
+  if (value !== "flow" && value !== "app" && value !== "script") {
+    throw new Error(`Unsupported frontend benchmark mode: ${String(value)}`);
+  }
+  return value;
+}
+/** Parses the optional JSON list of case ids: [] when unset or empty, an error when not a string array. */
+function parseOptionalJsonStringArray(value: string | undefined): string[] {
+  if (!value) return [];
+  try {
+    const parsed: unknown = JSON.parse(value);
+    if (Array.isArray(parsed) && parsed.every((entry) => typeof entry === "string")) return parsed;
+  } catch {
+    // Malformed JSON falls through to the same actionable error as a wrongly shaped value.
+  }
+  throw new Error("WMILL_FRONTEND_AI_EVAL_CASE_IDS must be a JSON string array");
+}
+/** Parses `value` as an integer greater than zero; throws an error naming `envName` otherwise. */
+function parsePositiveInteger(value: string | undefined, envName: string): number {
+  const parsed = Number(value);
+  if (Number.isInteger(parsed) && parsed > 0) {
+    return parsed;
+  }
+  throw new Error(`${envName} must be a positive integer`);
+}
diff --git a/ai_evals/adapters/frontend/core/app/appEvalRunner.ts b/ai_evals/adapters/frontend/core/app/appEvalRunner.ts
new file mode 100644
index 0000000000..f0d0ce91d3
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/app/appEvalRunner.ts
@@ -0,0 +1,92 @@
+import { mkdtemp } from 'fs/promises'
+import { tmpdir } from 'os'
+import { join } from 'path'
+import type {
+	AppFiles,
+	BackendRunnable,
+	AppAIChatHelpers
+} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
+import {
+	getAppTools,
+	prepareAppSystemMessage,
+	prepareAppUserMessage
+} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
+import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import { createAppFileHelpers } from './fileHelpers'
+import { runEval } from '../shared'
+import type { AIProvider } from '$lib/gen/types.gen'
+import type { ModeRunContext } from '../../../../core/types'
+import type { TokenUsage } from '../shared/types'
+/** Outcome of one app eval run: the final app files plus run statistics. */
+export interface AppEvalResult {
+	success: boolean
+	files: AppFiles
+	error?: string
+	assistantMessageCount: number // filled from the eval's iteration count
+	toolCallCount: number
+	toolsUsed: string[]
+	tokenUsage: TokenUsage
+}
+/** Optional inputs for `runAppEval`. */
+export interface AppEvalOptions {
+	initialFrontend?: Record<string, string>
+	initialBackend?: Record<string, BackendRunnable>
+	model?: string // runAppEval falls back to 'claude-haiku-4-5-20251001'
+	maxIterations?: number
+	provider?: AIProvider
+	workspaceRoot?: string // a fresh temp directory is created when omitted
+	runContext?: ModeRunContext
+}
+/**
+ * Runs the production app chat prompt and tools against a benchmark workspace, which is
+ * `options.workspaceRoot` or a new temp directory; `cleanup()` deletes it when the run ends.
+ */
+export async function runAppEval(
+	userPrompt: string,
+	apiKey: string,
+	options?: AppEvalOptions
+): Promise<AppEvalResult> {
+	const runContext = options?.runContext
+	const workspaceRoot =
+		options?.workspaceRoot ??
+		(await mkdtemp(join(tmpdir(), 'wmill-frontend-app-benchmark-')))
+	const workspace = await createAppFileHelpers(
+		options?.initialFrontend ?? {},
+		options?.initialBackend ?? {},
+		workspaceRoot
+	)
+	try {
+		const systemMessage = prepareAppSystemMessage()
+		const tools = getAppTools() as ProductionTool<AppAIChatHelpers>[]
+		const userMessage = prepareAppUserMessage(userPrompt, workspace.helpers.getSelectedContext())
+		const outcome = await runEval({
+			userPrompt,
+			systemMessage,
+			userMessage,
+			tools,
+			helpers: workspace.helpers,
+			apiKey,
+			getOutput: workspace.getFiles,
+			onAssistantMessageStart: runContext?.onAssistantMessageStart,
+			onAssistantToken: runContext?.onAssistantChunk,
+			onAssistantMessageEnd: runContext?.onAssistantMessageEnd,
+			options: {
+				maxIterations: options?.maxIterations,
+				model: options?.model ?? 'claude-haiku-4-5-20251001',
+				workspace: workspaceRoot,
+				provider: options?.provider
+			}
+		})
+		return {
+			files: outcome.output,
+			success: outcome.success,
+			error: outcome.error,
+			assistantMessageCount: outcome.iterations,
+			toolCallCount: outcome.toolCallsCount,
+			toolsUsed: outcome.toolsCalled,
+			tokenUsage: outcome.tokenUsage
+		}
+	} finally {
+		await workspace.cleanup()
+	}
+}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appFixtureLoader.ts b/ai_evals/adapters/frontend/core/app/appFixtureLoader.ts
similarity index 97%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/appFixtureLoader.ts
rename to ai_evals/adapters/frontend/core/app/appFixtureLoader.ts
index 8d3be427e4..3a1cacdb3e 100644
--- a/frontend/src/lib/components/copilot/chat/__tests__/app/appFixtureLoader.ts
+++ b/ai_evals/adapters/frontend/core/app/appFixtureLoader.ts
@@ -1,4 +1,8 @@
-import type { AppFiles, BackendRunnable, InlineScript } from '../../app/core'
+import type {
+	AppFiles,
+	BackendRunnable,
+	InlineScript
+} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
 
 /**
  * Backend runnable metadata stored in meta.json files.
diff --git a/ai_evals/adapters/frontend/core/app/fileHelpers.ts b/ai_evals/adapters/frontend/core/app/fileHelpers.ts
new file mode 100644
index 0000000000..02bfe799c5
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/app/fileHelpers.ts
@@ -0,0 +1,255 @@
+import { mkdir, rm, writeFile } from 'fs/promises'
+import { dirname, join } from 'path'
+import type {
+	AppAIChatHelpers,
+	AppFiles,
+	BackendRunnable,
+	DataTableSchema,
+	LintResult,
+	SelectedContext
+} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
+/** Returns a new lint result with zero counts and no frontend or backend findings. */
+function createEmptyLintResult(): LintResult {
+	return {
+		errorCount: 0,
+		warningCount: 0,
+		errors: { frontend: {}, backend: {} },
+		warnings: { frontend: {}, backend: {} }
+	}
+}
+
+/**
+ * Resolves `name` inside `<workspaceRoot>/<area>`. Returns null when no workspace is
+ * set or when `name` contains a `..` segment: names originate from model tool calls
+ * and must never reach (or delete) anything outside the mirrored directory.
+ */
+function resolveMirrorPath(
+	workspaceRoot: string | undefined,
+	area: 'frontend' | 'backend',
+	name: string
+): string | null {
+	if (!workspaceRoot || name.split(/[\\/]/).includes('..')) return null
+	// join() collapses repeated separators, so a leading "/" still lands inside the area
+	return join(workspaceRoot, area, name)
+}
+
+/** Mirrors one frontend file to disk; skipped when the path does not resolve inside the workspace. */
+async function writeFrontendFile(
+	workspaceRoot: string | undefined,
+	path: string,
+	content: string
+): Promise<void> {
+	const fullPath = resolveMirrorPath(workspaceRoot, 'frontend', path)
+	if (fullPath === null) return
+	await mkdir(dirname(fullPath), { recursive: true })
+	await writeFile(fullPath, content, 'utf8')
+}
+
+/** Deletes the mirrored copy of a frontend file; a file that is already missing is ignored. */
+async function removeFrontendFile(workspaceRoot: string | undefined, path: string): Promise<void> {
+	const fullPath = resolveMirrorPath(workspaceRoot, 'frontend', path)
+	if (fullPath !== null) await rm(fullPath, { force: true })
+}
+
+/** Mirrors a backend runnable as `backend/<key>/meta.json`, plus `main.py` or `main.ts` when inline. */
+async function writeBackendRunnable(
+	workspaceRoot: string | undefined,
+	key: string,
+	runnable: BackendRunnable
+): Promise<void> {
+	const runnableDir = resolveMirrorPath(workspaceRoot, 'backend', key)
+	if (runnableDir === null) return
+	await mkdir(runnableDir, { recursive: true })
+
+	const meta: { name: string; language?: string; type?: string; path?: string } = { name: runnable.name }
+	if (runnable.type === 'inline' && runnable.inlineScript) {
+		const { language, content } = runnable.inlineScript
+		meta.language = language
+		const extension = language === 'python3' ? 'py' : 'ts' // every non-python3 language gets .ts
+		await writeFile(join(runnableDir, `main.${extension}`), content, 'utf8')
+	} else {
+		meta.type = runnable.type
+		if (runnable.path) meta.path = runnable.path
+	}
+	await writeFile(join(runnableDir, 'meta.json'), JSON.stringify(meta, null, 2) + '\n', 'utf8')
+}
+
+/** Recursively deletes the mirrored directory of a backend runnable. */
+async function removeBackendRunnable(workspaceRoot: string | undefined, key: string): Promise<void> {
+	const runnableDir = resolveMirrorPath(workspaceRoot, 'backend', key)
+	if (runnableDir !== null) await rm(runnableDir, { recursive: true, force: true })
+}
+
+/**
+ * Writes the datatable schemas to `<workspaceRoot>/datatables.json` as pretty-printed
+ * JSON with a trailing newline. Does nothing when no workspace is given.
+ */
+async function persistDatatables(
+	workspaceRoot: string | undefined,
+	datatables: DataTableSchema[]
+): Promise<void> {
+	if (workspaceRoot) {
+		const serialized = JSON.stringify(datatables, null, 2) + '\n'
+		await writeFile(join(workspaceRoot, 'datatables.json'), serialized, 'utf8')
+	}
+}
+
+/**
+ * Builds in-memory `AppAIChatHelpers` for app benchmarks. Frontend files, backend
+ * runnables and datatable schemas are held in memory; when `workspaceRoot` is
+ * given, the initial state and every later change are also written beneath it.
+ *
+ * Lint helpers always report a clean result, and `execDatatableSql` never runs SQL:
+ * it echoes its input and only records `newTable`. `cleanup()` deletes
+ * `workspaceRoot` recursively.
+ */
+export async function createAppFileHelpers(
+	initialFrontend: Record<string, string> = {},
+	initialBackend: Record<string, BackendRunnable> = {},
+	workspaceRoot?: string
+): Promise<{
+	helpers: AppAIChatHelpers
+	getFiles: () => AppFiles
+	getFrontend: () => Record<string, string>
+	getBackend: () => Record<string, BackendRunnable>
+	cleanup: () => Promise<void>
+	workspaceDir: string | null
+}> {
+	let frontend = { ...initialFrontend }
+	let backend = { ...initialBackend }
+	let snapshotId = 0
+	const snapshots = new Map<
+		number,
+		{ frontend: Record<string, string>; backend: Record<string, BackendRunnable> }
+	>()
+	const datatables: DataTableSchema[] = []
+
+	/** Records a table under its datatable entry, creating the entry or schema on first use. */
+	function whitelistTable(datatableName: string, schemaName: string, tableName: string): void {
+		const existing = datatables.find((entry) => entry.datatable_name === datatableName)
+		if (existing) {
+			existing.schemas[schemaName] ??= {}
+			existing.schemas[schemaName][tableName] ??= {}
+		} else {
+			datatables.push({
+				datatable_name: datatableName,
+				schemas: { [schemaName]: { [tableName]: {} } }
+			})
+		}
+	}
+
+	// Write the initial state to disk (each writer is a no-op without a workspace)
+	for (const [path, content] of Object.entries(frontend)) {
+		await writeFrontendFile(workspaceRoot, path, content)
+	}
+	for (const [key, runnable] of Object.entries(backend)) {
+		await writeBackendRunnable(workspaceRoot, key, runnable)
+	}
+	await persistDatatables(workspaceRoot, datatables)
+
+	const helpers: AppAIChatHelpers = {
+		listFrontendFiles: () => Object.keys(frontend),
+		getFrontendFile: (path: string) => frontend[path],
+		getFrontendFiles: () => ({ ...frontend }),
+		// Disk writes from the synchronous helpers below are fire-and-forget
+		setFrontendFile: (path: string, content: string) => {
+			frontend[path] = content
+			void writeFrontendFile(workspaceRoot, path, content)
+			return createEmptyLintResult()
+		},
+		deleteFrontendFile: (path: string) => {
+			delete frontend[path]
+			void removeFrontendFile(workspaceRoot, path)
+		},
+		listBackendRunnables: () =>
+			Object.entries(backend).map(([key, runnable]) => ({
+				key,
+				name: runnable.name
+			})),
+		getBackendRunnable: (key: string) => backend[key],
+		getBackendRunnables: () => ({ ...backend }),
+		setBackendRunnable: async (key: string, runnable: BackendRunnable) => {
+			backend[key] = runnable
+			await writeBackendRunnable(workspaceRoot, key, runnable)
+			return createEmptyLintResult()
+		},
+		deleteBackendRunnable: (key: string) => {
+			delete backend[key]
+			void removeBackendRunnable(workspaceRoot, key)
+		},
+		getFiles: (): AppFiles => ({
+			frontend: { ...frontend },
+			backend: { ...backend }
+		}),
+		getSelectedContext: (): SelectedContext => ({ type: 'none' }),
+		// Snapshots are shallow copies of the two maps; datatables are not part of them
+		snapshot: () => {
+			const id = ++snapshotId
+			snapshots.set(id, {
+				frontend: { ...frontend },
+				backend: { ...backend }
+			})
+			return id
+		},
+		revertToSnapshot: (id: number) => {
+			const snapshot = snapshots.get(id)
+			if (!snapshot) {
+				return
+			}
+			frontend = { ...snapshot.frontend }
+			backend = { ...snapshot.backend }
+			void syncWorkspace()
+		},
+		lint: () => createEmptyLintResult(),
+		getDatatables: async () => structuredClone(datatables),
+		getAvailableDatatableNames: () => datatables.map((datatable) => datatable.datatable_name),
+		execDatatableSql: async (
+			datatableName: string,
+			sql: string,
+			newTable?: { schema: string; name: string }
+		) => {
+			if (newTable) {
+				// Merge into the datatable's existing entry instead of pushing a duplicate one
+				whitelistTable(datatableName, newTable.schema, newTable.name)
+				await persistDatatables(workspaceRoot, datatables)
+			}
+			return { success: true, result: [{ datatableName, sql }] }
+		},
+		addTableToWhitelist: (datatableName: string, schemaName: string, tableName: string) => {
+			whitelistTable(datatableName, schemaName, tableName)
+			void persistDatatables(workspaceRoot, datatables)
+		}
+	}
+
+	/** Rebuilds the on-disk frontend/backend directories from the in-memory state. */
+	async function syncWorkspace(): Promise<void> {
+		if (!workspaceRoot) {
+			return
+		}
+		await rm(join(workspaceRoot, 'frontend'), { recursive: true, force: true })
+		await rm(join(workspaceRoot, 'backend'), { recursive: true, force: true })
+		for (const [path, content] of Object.entries(frontend)) {
+			await writeFrontendFile(workspaceRoot, path, content)
+		}
+		for (const [key, runnable] of Object.entries(backend)) {
+			await writeBackendRunnable(workspaceRoot, key, runnable)
+		}
+		await persistDatatables(workspaceRoot, datatables)
+	}
+
+	return {
+		helpers,
+		getFiles: () => ({
+			frontend: { ...frontend },
+			backend: { ...backend }
+		}),
+		getFrontend: () => ({ ...frontend }),
+		getBackend: () => ({ ...backend }),
+		cleanup: async () => {
+			if (workspaceRoot) {
+				await rm(workspaceRoot, { recursive: true, force: true })
+			}
+		},
+		workspaceDir: workspaceRoot ?? null
+	}
+}
diff --git a/ai_evals/adapters/frontend/core/flow/fileHelpers.ts b/ai_evals/adapters/frontend/core/flow/fileHelpers.ts
new file mode 100644
index 0000000000..22f6587de3
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/flow/fileHelpers.ts
@@ -0,0 +1,161 @@
+import { mkdir, rm, writeFile } from 'fs/promises'
+import { dirname, join } from 'path'
+import type { FlowModule, InputTransform } from '../../../../../frontend/src/lib/gen'
+import type { ExtendedOpenFlow } from '../../../../../frontend/src/lib/components/flows/types'
+import type { FlowAIChatHelpers } from '../../../../../frontend/src/lib/components/copilot/chat/flow/core'
+import type { ScriptLintResult } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import { findModuleById } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import {
+	createInlineScriptSession
+} from '../../../../../frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils'
+import {
+	registerBenchmarkWorkspace,
+	registerBenchmarkWorkspaceRunnables,
+	unregisterBenchmarkWorkspaceRunnables,
+	createBenchmarkCompletedJob,
+	type BenchmarkWorkspaceFlow,
+	type BenchmarkWorkspaceScript
+} from '../../mockBackend'
+// Lint result returned by `getLintErrors`: this adapter never reports script lint findings.
+const EMPTY_SCRIPT_LINT_RESULT: ScriptLintResult = {
+	errorCount: 0,
+	warningCount: 0,
+	errors: [],
+	warnings: []
+}
+/** Existing workspace scripts and flows to register with the mock backend for a flow run. */
+export interface FlowWorkspaceFixtures {
+	scripts?: BenchmarkWorkspaceScript[]
+	flows?: BenchmarkWorkspaceFlow[]
+}
+/** Builds in-memory `FlowAIChatHelpers` for flow benchmarks; the flow is written to `<workspaceRoot>/flow.json` when a workspace is given. */
+export async function createFlowFileHelpers(
+	initialModules: FlowModule[] = [],
+	initialSchema?: Record<string, any>,
+	workspaceRoot?: string,
+	workspaceFixtures?: FlowWorkspaceFixtures
+): Promise<{
+	helpers: FlowAIChatHelpers
+	getFlow: () => ExtendedOpenFlow
+	getModules: () => FlowModule[]
+	cleanup: () => Promise<void>
+	workspaceDir: string | null
+}> {
+	let flow: ExtendedOpenFlow = {
+		value: { modules: structuredClone(initialModules) },
+		summary: '',
+		schema: initialSchema ?? {
+			$schema: 'https://json-schema.org/draft/2020-12/schema',
+			properties: {},
+			required: [],
+			type: 'object'
+		}
+	}
+	const inlineScriptSession = createInlineScriptSession()
+
+	const flowFilePath = workspaceRoot ? join(workspaceRoot, 'flow.json') : null
+	// Serializes the current flow to flow.json; does nothing without a workspace.
+	async function persistFlow(): Promise<void> {
+		if (!flowFilePath) {
+			return
+		}
+		await mkdir(dirname(flowFilePath), { recursive: true })
+		await writeFile(flowFilePath, JSON.stringify(flow, null, 2) + '\n', 'utf8')
+	}
+
+	await persistFlow()
+	// Register the workspace, and any fixture scripts/flows, with the mock backend.
+	if (workspaceRoot) {
+		registerBenchmarkWorkspace(workspaceRoot)
+		if (workspaceFixtures) {
+			registerBenchmarkWorkspaceRunnables(workspaceRoot, workspaceFixtures)
+		}
+	}
+	// Snapshot, selection and pending-change helpers are inert stubs in this adapter.
+	const helpers: FlowAIChatHelpers = {
+		getFlowAndSelectedId: () => ({ flow, selectedId: '' }),
+		getModules: (id?: string) => {
+			if (!id) return flow.value.modules
+			const module = findModuleById(flow.value.modules, id)
+			return module ? [module] : []
+		},
+		inlineScriptSession,
+		setSnapshot: () => {},
+		revertToSnapshot: () => {},
+		setCode: async (id: string, code: string) => {
+			const module = findModuleById(flow.value.modules, id)
+			if (module && module.value.type === 'rawscript') {
+				module.value.content = code
+			}
+			inlineScriptSession.set(id, code) // recorded even when no rawscript module matched
+			await persistFlow()
+		},
+		setFlowJson: async (
+			modules: FlowModule[] | undefined,
+			schema: Record<string, any> | undefined
+		) => {
+			if (modules) {
+				flow.value.modules = inlineScriptSession.restoreInlineScriptReferences(modules)
+				const unresolvedRefs = inlineScriptSession.findUnresolvedInlineScriptRefs(flow.value.modules)
+				if (unresolvedRefs.length > 0) { // NOTE(review): modules were already replaced above, so this throw leaves them applied in memory and unpersisted — confirm intended
+					throw new Error(
+						`Unresolved inline script references: ${unresolvedRefs.join(', ')}`
+					)
+				}
+			}
+			if (schema !== undefined) {
+				flow.schema = schema
+			}
+			await persistFlow()
+		},
+		getFlowInputsSchema: async () => flow.schema ?? {},
+		updateExprsToSet: (_id: string, _inputTransforms: Record<string, InputTransform>) => {},
+		acceptAllModuleActions: () => {},
+		rejectAllModuleActions: () => {},
+		hasPendingChanges: () => false,
+		selectStep: (_id: string) => {},
+		testFlow: async (args?: Record<string, any>) => {
+			if (workspaceRoot) {
+				const runPath = join(workspaceRoot, 'test-run.json')
+				await writeFile(
+					runPath,
+					JSON.stringify(
+						{
+							requestedArgs: args ?? {},
+							modules: flow.value.modules.map((module) => module.id)
+						},
+						null,
+						2
+					) + '\n',
+					'utf8'
+				)
+			}
+			return createBenchmarkCompletedJob({ // the returned job result is flagged `mocked: true`
+				workspace: workspaceRoot ?? 'benchmark',
+				jobKind: 'flowpreview',
+				result: {
+					requestedArgs: args ?? {},
+					modules: flow.value.modules.map((module) => module.id),
+					mocked: true
+				},
+				logs: 'Mock benchmark flow test run completed successfully.'
+			})
+		},
+		getLintErrors: async () => EMPTY_SCRIPT_LINT_RESULT
+	}
+
+	return {
+		helpers,
+		getFlow: () => flow,
+		getModules: () => flow.value.modules,
+		cleanup: async () => {
+			if (workspaceRoot) {
+				unregisterBenchmarkWorkspaceRunnables(workspaceRoot)
+			}
+			if (workspaceRoot) {
+				await rm(workspaceRoot, { recursive: true, force: true }) // removes flow.json and test-run.json with the directory
+			}
+		},
+		workspaceDir: workspaceRoot ?? null
+	}
+}
diff --git a/ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts b/ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts
new file mode 100644
index 0000000000..a07e22f584
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts
@@ -0,0 +1,103 @@
+import { mkdtemp } from 'fs/promises'
+import { tmpdir } from 'os'
+import { join } from 'path'
+import type { FlowModule } from '$lib/gen'
+import type { AIProvider } from '$lib/gen/types.gen'
+import type { ExtendedOpenFlow } from '$lib/components/flows/types'
+import {
+	flowTools,
+	prepareFlowSystemMessage,
+	prepareFlowUserMessage,
+	type FlowAIChatHelpers
+} from '../../../../../frontend/src/lib/components/copilot/chat/flow/core'
+import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import { createFlowFileHelpers, type FlowWorkspaceFixtures } from './fileHelpers'
+import { runEval } from '../shared'
+import type { ModeRunContext } from '../../../../core/types'
+import type { TokenUsage } from '../shared/types'
+
+export interface FlowFixture { // initial flow state a case may hand to runFlowEval
+	value?: {
+		modules?: FlowModule[] // runFlowEval falls back to [] when absent
+	}
+	schema?: Record<string, unknown> // forwarded unchanged to createFlowFileHelpers
+}
+
+export interface FlowEvalResult { // outcome of one runFlowEval call
+	success: boolean
+	flow: ExtendedOpenFlow // runEval output produced by getFlow
+	error?: string
+	assistantMessageCount: number // filled from runEval's `iterations`
+	toolCallCount: number
+	toolsUsed: string[] // filled from runEval's `toolsCalled`
+	tokenUsage: TokenUsage
+}
+
+export interface FlowEvalOptions { // optional settings for runFlowEval
+	initialFlow?: FlowFixture
+	workspaceFixtures?: FlowWorkspaceFixtures
+	model?: string // defaults to 'claude-haiku-4-5-20251001'
+	maxIterations?: number
+	provider?: AIProvider
+	workspaceRoot?: string // a temp dir is created when omitted
+	runContext?: ModeRunContext // supplies the assistant streaming callbacks
+}
+
+/** Runs one flow-chat eval against a workspace directory and reports the resulting flow.
+ *  The helper cleanup runs whether runEval resolves or throws. */
+export async function runFlowEval(
+	userPrompt: string,
+	apiKey: string,
+	options?: FlowEvalOptions
+): Promise<FlowEvalResult> {
+	const seed = options?.initialFlow
+	const streaming = options?.runContext
+	const workspace =
+		options?.workspaceRoot ?? (await mkdtemp(join(tmpdir(), 'wmill-frontend-flow-benchmark-')))
+	const { helpers, getFlow, cleanup } = await createFlowFileHelpers(
+		seed?.value?.modules ?? [],
+		seed?.schema,
+		workspace,
+		options?.workspaceFixtures
+	)
+
+	try {
+		const systemMessage = prepareFlowSystemMessage()
+		const userMessage = prepareFlowUserMessage(
+			userPrompt,
+			helpers.getFlowAndSelectedId(),
+			[],
+			helpers.inlineScriptSession
+		)
+		const raw = await runEval({
+			userPrompt,
+			systemMessage,
+			userMessage,
+			tools: flowTools as ProductionTool<FlowAIChatHelpers>[],
+			helpers,
+			apiKey,
+			getOutput: getFlow,
+			onAssistantMessageStart: streaming?.onAssistantMessageStart,
+			onAssistantToken: streaming?.onAssistantChunk,
+			onAssistantMessageEnd: streaming?.onAssistantMessageEnd,
+			options: {
+				maxIterations: options?.maxIterations,
+				model: options?.model ?? 'claude-haiku-4-5-20251001',
+				workspace,
+				provider: options?.provider
+			}
+		})
+
+		return {
+			flow: raw.output,
+			success: raw.success,
+			error: raw.error,
+			assistantMessageCount: raw.iterations,
+			toolCallCount: raw.toolCallsCount,
+			toolsUsed: raw.toolsCalled,
+			tokenUsage: raw.tokenUsage
+		}
+	} finally {
+		await cleanup()
+	}
+}
diff --git a/ai_evals/adapters/frontend/core/script/fileHelpers.ts b/ai_evals/adapters/frontend/core/script/fileHelpers.ts
new file mode 100644
index 0000000000..3d6f3b1139
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/script/fileHelpers.ts
@@ -0,0 +1,73 @@
+import { mkdir, rm, writeFile } from 'fs/promises'
+import { dirname, join } from 'path'
+import type { ScriptLang } from '../../../../../frontend/src/lib/gen/types.gen'
+import type { ReviewChangesOpts } from '../../../../../frontend/src/lib/components/copilot/chat/monaco-adapter'
+import type { ScriptChatHelpers } from '../../../../../frontend/src/lib/components/copilot/chat/script/core'
+import { buildScriptLintResult } from './preview'
+import { registerBenchmarkWorkspace, unregisterBenchmarkWorkspace } from '../../mockBackend'
+
+export interface ScriptEvalState { // in-memory script state tracked by createScriptFileHelpers
+	code: string // written to disk by persistScript when a workspace root is set
+	lang: ScriptLang | 'bunnative'
+	path: string // joined onto workspaceRoot to locate the script file
+	args: Record<string, any>
+}
+
+/**
+ * Builds ScriptChatHelpers backed by an in-memory copy of `initialScript`. When `workspaceRoot`
+ * is set, the code is also mirrored to `<workspaceRoot>/<path>` and the workspace is registered
+ * with the mock backend; `cleanup` then unregisters it and deletes the directory.
+ */
+export async function createScriptFileHelpers(
+	initialScript: ScriptEvalState,
+	workspaceRoot?: string
+): Promise<{
+	helpers: ScriptChatHelpers
+	getScript: () => ScriptEvalState
+	cleanup: () => Promise<void>
+	workspaceDir: string | null
+}> {
+	let current = structuredClone(initialScript)
+	const targetFile = workspaceRoot ? join(workspaceRoot, current.path) : null
+
+	const persistScript = async (): Promise<void> => {
+		if (targetFile) {
+			await mkdir(dirname(targetFile), { recursive: true })
+			await writeFile(targetFile, current.code, 'utf8')
+		}
+	}
+
+	await persistScript()
+	if (workspaceRoot) {
+		registerBenchmarkWorkspace(workspaceRoot)
+	}
+
+	const helpers: ScriptChatHelpers = {
+		getScriptOptions: () => {
+			const { code, lang, path, args } = current
+			return { code, lang, path, args: structuredClone(args) }
+		},
+		applyCode: async (code: string, opts?: ReviewChangesOpts) => {
+			// A 'revert' request leaves the stored code untouched.
+			if (opts?.mode !== 'revert') {
+				current = { ...current, code }
+				await persistScript()
+			}
+		},
+		getLintErrors: () => buildScriptLintResult(current.code, current.lang)
+	}
+
+	const cleanup = async (): Promise<void> => {
+		if (!workspaceRoot) {
+			return
+		}
+		unregisterBenchmarkWorkspace(workspaceRoot)
+		await rm(workspaceRoot, { recursive: true, force: true })
+	}
+	return {
+		helpers,
+		getScript: () => structuredClone(current),
+		cleanup,
+		workspaceDir: workspaceRoot ?? null
+	}
+}
diff --git a/ai_evals/adapters/frontend/core/script/preview.ts b/ai_evals/adapters/frontend/core/script/preview.ts
new file mode 100644
index 0000000000..40befc5266
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/script/preview.ts
@@ -0,0 +1,96 @@
+import ts from 'typescript'
+import type { ScriptLang } from '../../../../../frontend/src/lib/gen/types.gen'
+import type { ScriptLintResult } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+
+export type ScriptPreviewLanguage = ScriptLang | 'bunnative'
+
+const TS_LIKE_LANGUAGES = new Set<ScriptPreviewLanguage>(['bun', 'deno', 'nativets', 'bunnative']) // languages that get the TypeScript transpile check
+const JS_LIKE_LANGUAGES = new Set<ScriptPreviewLanguage>(['bun', 'deno', 'nativets', 'bunnative']) // NOTE(review): same members as TS_LIKE_LANGUAGES — confirm both sets are needed
+
+/** True when `code` contains an exported (optionally async) `main` or `preprocessor` function. */
+function hasSupportedEntrypoint(code: string): boolean {
+	const exportsMain = /export\s+(async\s+)?function\s+main\s*\(/.test(code)
+	if (exportsMain) return true
+	return /export\s+(async\s+)?function\s+preprocessor\s*\(/.test(code)
+}
+
+/** Returns the transpile options for TS-like languages, or null when `lang` gets no syntax check. */
+function compilerOptionsForLanguage(lang: ScriptPreviewLanguage): ts.CompilerOptions | null {
+	const isTsLike = TS_LIKE_LANGUAGES.has(lang)
+	const tsOptions: ts.CompilerOptions = {
+		target: ts.ScriptTarget.ES2022,
+		module: ts.ModuleKind.ESNext,
+		moduleResolution: ts.ModuleResolutionKind.Bundler,
+		noEmit: true,
+		allowJs: true,
+		checkJs: false,
+		strict: false,
+		skipLibCheck: true
+	}
+
+	return isTsLike ? tsOptions : null
+}
+
+/** Converts a 0-based character offset into a 1-based line/column pair (negative offsets clamp to 0). */
+function getLineAndColumn(sourceText: string, start: number): { line: number; column: number } {
+	const before = sourceText.slice(0, Math.max(0, start))
+	const segments = before.split('\n')
+	const line = segments.length
+	return { line, column: (segments.pop() ?? '').length + 1 }
+}
+
+/**
+ * Lint result used by the benchmark helpers. TS-like languages are checked with
+ * `ts.transpileModule`; each diagnostic becomes a severity-8 marker with 1-based positions.
+ * JS/TS-like scripts must also export `main` or `preprocessor`. Other languages get no
+ * checks: the entrypoint regex matches TypeScript `export function` syntax only, so applying
+ * it to them would flag every script.
+ */
+export function buildScriptLintResult(
+	code: string,
+	lang: ScriptPreviewLanguage
+): ScriptLintResult {
+	const diagnostics: ScriptLintResult['errors'] = []
+	const compilerOptions = compilerOptionsForLanguage(lang)
+
+	if (compilerOptions) {
+		// transpileModule parses the code itself; the .ts file name selects TypeScript parsing.
+		const output = ts.transpileModule(code, {
+			compilerOptions,
+			fileName: 'script.ts',
+			reportDiagnostics: true
+		})
+		for (const diagnostic of output.diagnostics ?? []) {
+			const start = diagnostic.start ?? 0
+			const length = diagnostic.length ?? 1
+			const { line, column } = getLineAndColumn(code, start)
+			const message = ts.flattenDiagnosticMessageText(diagnostic.messageText, '\n')
+			diagnostics.push({
+				startLineNumber: line,
+				startColumn: column,
+				endLineNumber: line,
+				endColumn: column + Math.max(1, length),
+				message,
+				severity: 8
+			} as ScriptLintResult['errors'][number])
+		}
+	}
+
+	if (JS_LIKE_LANGUAGES.has(lang) && !hasSupportedEntrypoint(code)) {
+		diagnostics.push({
+			startLineNumber: 1,
+			startColumn: 1,
+			endLineNumber: 1,
+			endColumn: 1,
+			message: 'Script must export a main or preprocessor function.',
+			severity: 8
+		} as ScriptLintResult['errors'][number])
+	}
+
+	return {
+		errorCount: diagnostics.length,
+		warningCount: 0,
+		errors: diagnostics,
+		warnings: []
+	}
+}
diff --git a/ai_evals/adapters/frontend/core/script/scriptEvalRunner.ts b/ai_evals/adapters/frontend/core/script/scriptEvalRunner.ts
new file mode 100644
index 0000000000..fe37078348
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/script/scriptEvalRunner.ts
@@ -0,0 +1,109 @@
+import { mkdtemp } from 'fs/promises'
+import { tmpdir } from 'os'
+import { join } from 'path'
+import type { AIProvider, AIProviderModel, ScriptLang } from '$lib/gen/types.gen'
+import type { ContextElement } from '../../../../../frontend/src/lib/components/copilot/chat/context'
+import {
+	prepareScriptSystemMessage,
+	prepareScriptTools,
+	prepareScriptUserMessage,
+	type ScriptChatHelpers
+} from '../../../../../frontend/src/lib/components/copilot/chat/script/core'
+import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import { createScriptFileHelpers, type ScriptEvalState } from './fileHelpers'
+import { runEval } from '../shared'
+import type { ModeRunContext } from '../../../../core/types'
+import type { TokenUsage } from '../shared/types'
+
+export interface ScriptEvalResult { // outcome of one runScriptEval call
+	success: boolean
+	script: ScriptEvalState // runEval output produced by getScript
+	error?: string
+	assistantMessageCount: number // filled from runEval's `iterations`
+	toolCallCount: number
+	toolsUsed: string[] // filled from runEval's `toolsCalled`
+	tokenUsage: TokenUsage
+}
+
+export interface ScriptEvalOptions { // settings for runScriptEval; only initialScript is required
+	initialScript: ScriptEvalState // its `lang` also selects the system message and tools
+	model?: string // defaults to 'claude-haiku-4-5-20251001'
+	maxIterations?: number
+	provider?: AIProvider // resolved from the model id when omitted
+	workspaceRoot?: string // a temp dir is created when omitted; cleanup removes the directory after the run
+	runContext?: ModeRunContext // supplies the assistant streaming callbacks
+}
+
+/**
+ * Resolves the provider for `model`. An explicit `provider` always wins; otherwise it is
+ * inferred from the model id prefix, mirroring resolveEvalModelProvider in
+ * ../shared/providerConfig so Gemini ids map to 'googleai' instead of falling back to 'openai'.
+ */
+function resolveModelProvider(model: string, provider?: AIProvider): AIProviderModel {
+	if (provider) return { provider, model }
+	if (model.startsWith('claude')) return { provider: 'anthropic', model }
+	if (model.startsWith('gemini')) return { provider: 'googleai', model }
+	// Every other model id defaults to 'openai'.
+	return { provider: 'openai', model }
+}
+
+/**
+ * Runs one script-chat evaluation: mirrors the initial script into a workspace directory,
+ * drives the production script tools through runEval, and returns the final script state
+ * together with the run counters. The helper cleanup runs on success and failure alike.
+ */
+export async function runScriptEval(
+	userPrompt: string,
+	apiKey: string,
+	options: ScriptEvalOptions
+): Promise<ScriptEvalResult> {
+	const { initialScript, runContext } = options
+	const workspace =
+		options.workspaceRoot ?? (await mkdtemp(join(tmpdir(), 'wmill-frontend-script-benchmark-')))
+	const { helpers, getScript, cleanup } = await createScriptFileHelpers(initialScript, workspace)
+
+	try {
+		const model = options.model ?? 'claude-haiku-4-5-20251001'
+		const modelProvider = resolveModelProvider(model, options.provider)
+		// No context is selected here, so the tools and the user message get an empty list.
+		const selectedContext: ContextElement[] = []
+		const lang = initialScript.lang
+		const systemMessage = prepareScriptSystemMessage(modelProvider, lang, {})
+		const tools = prepareScriptTools(
+			modelProvider,
+			lang,
+			selectedContext
+		) as ProductionTool<ScriptChatHelpers>[]
+
+		const raw = await runEval({
+			userPrompt,
+			systemMessage,
+			userMessage: prepareScriptUserMessage(userPrompt, selectedContext),
+			tools,
+			helpers,
+			apiKey,
+			getOutput: getScript,
+			onAssistantMessageStart: runContext?.onAssistantMessageStart,
+			onAssistantToken: runContext?.onAssistantChunk,
+			onAssistantMessageEnd: runContext?.onAssistantMessageEnd,
+			options: {
+				maxIterations: options.maxIterations,
+				model,
+				workspace,
+				provider: modelProvider.provider
+			}
+		})
+
+		return {
+			script: raw.output,
+			success: raw.success,
+			error: raw.error,
+			assistantMessageCount: raw.iterations,
+			toolCallCount: raw.toolCallsCount,
+			toolsUsed: raw.toolsCalled,
+			tokenUsage: raw.tokenUsage
+		}
+	} finally {
+		await cleanup()
+	}
+}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseEvalRunner.ts b/ai_evals/adapters/frontend/core/shared/baseEvalRunner.ts
similarity index 66%
rename from frontend/src/lib/components/copilot/chat/__tests__/shared/baseEvalRunner.ts
rename to ai_evals/adapters/frontend/core/shared/baseEvalRunner.ts
index f46acb9108..964a349785 100644
--- a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseEvalRunner.ts
+++ b/ai_evals/adapters/frontend/core/shared/baseEvalRunner.ts
@@ -1,29 +1,19 @@
-import OpenAI from 'openai'
-import Anthropic from '@anthropic-ai/sdk'
 import type {
 	ChatCompletionMessageParam,
 	ChatCompletionSystemMessageParam
 } from 'openai/resources/chat/completions.mjs'
-import type { AIProvider, AIProviderModel } from '$lib/gen/types.gen'
-import type { TokenUsage, ToolCallDetail, EvalRunnerOptions } from './types'
-import type { Tool } from './baseVariants'
-import { runChatLoop, type ChatClients } from '../../chatLoop'
-import type { Tool as ProductionTool, ToolCallbacks } from '../../shared'
-
-/**
- * Result from a single eval run (before domain-specific evaluation).
- */
-export interface RawEvalResult<TOutput> {
-	success: boolean
-	output: TOutput
-	error?: string
-	tokenUsage: TokenUsage
-	toolCallsCount: number
-	toolsCalled: string[]
-	toolCallDetails: ToolCallDetail[]
-	iterations: number
-	messages: ChatCompletionMessageParam[]
-}
+import type { AIProviderModel } from '$lib/gen/types.gen'
+import type { TokenUsage, ToolCallDetail, EvalRunnerOptions, RawEvalResult } from './types'
+import { runChatLoop, type ChatClients } from '../../../../../frontend/src/lib/components/copilot/chat/chatLoop'
+import type {
+	Tool as ProductionTool,
+	ToolCallbacks
+} from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import {
+	createEvalClients,
+	type FrontendEvalProvider,
+	resolveEvalModelProvider
+} from './providerConfig'
 
 /**
  * Parameters for running a base evaluation.
@@ -38,7 +28,7 @@ export interface RunEvalParams<THelpers, TOutput> {
 	/** Tool definitions for the LLM API (unused — derived from tools) */
 	toolDefs?: unknown
 	/** Full tool implementations for execution */
-	tools: Tool<THelpers>[]
+	tools: ProductionTool<THelpers>[]
 	/** Domain-specific helpers for tool execution */
 	helpers: THelpers
 	/** API key for the provider */
@@ -47,35 +37,9 @@ export interface RunEvalParams<THelpers, TOutput> {
 	getOutput: () => TOutput
 	/** Optional configuration */
 	options?: EvalRunnerOptions
-}
-
-/**
- * Creates SDK clients for the given provider.
- */
-function createEvalClients(provider: AIProvider, apiKey: string): ChatClients {
-	if (provider === 'anthropic') {
-		return {
-			openai: new OpenAI({ apiKey: 'unused' }),
-			anthropic: new Anthropic({ apiKey })
-		}
-	}
-	return {
-		openai: new OpenAI({ apiKey }),
-		anthropic: new Anthropic({ apiKey: 'unused' })
-	}
-}
-
-/**
- * Resolves model string to AIProviderModel.
- */
-function resolveModelProvider(
-	model: string,
-	provider?: AIProvider
-): AIProviderModel {
-	if (provider) return { provider, model }
-	if (model.startsWith('claude')) return { provider: 'anthropic', model }
-	if (model.startsWith('gpt') || model.startsWith('o')) return { provider: 'openai', model }
-	return { provider: 'openai', model }
+	onAssistantMessageStart?: () => void
+	onAssistantToken?: (token: string) => void
+	onAssistantMessageEnd?: () => void
 }
 
 /**
@@ -92,16 +56,23 @@ export async function runEval<THelpers, TOutput>(
 		helpers,
 		apiKey,
 		getOutput,
-		options
+		options,
+		onAssistantMessageStart,
+		onAssistantToken,
+		onAssistantMessageEnd
 	} = params
+	let shouldEmitMessageStart = true
 
 	const model = options?.model ?? 'gpt-4o'
 	const maxIterations = options?.maxIterations ?? 20
 	const workspace = options?.workspace ?? 'test-workspace'
 	const provider = options?.provider
 
-	const modelProvider = resolveModelProvider(model, provider)
-	const clients = createEvalClients(modelProvider.provider, apiKey)
+	const modelProvider = resolveEvalModelProvider(
+		model,
+		provider as FrontendEvalProvider | undefined
+	) as AIProviderModel
+	const clients = createEvalClients(modelProvider.provider, apiKey) as ChatClients
 
 	const messages: ChatCompletionMessageParam[] = [userMessage]
 	let toolCallsCount = 0
@@ -128,7 +99,7 @@ export async function runEval<THelpers, TOutput>(
 			}
 			return tool.fn(p)
 		}
-	})) as ProductionTool<THelpers>[]
+	}))
 
 	// No-op callbacks for eval
 	const callbacks: ToolCallbacks & {
@@ -137,8 +108,19 @@ export async function runEval<THelpers, TOutput>(
 	} = {
 		setToolStatus: () => {},
 		removeToolStatus: () => {},
-		onNewToken: () => {},
-		onMessageEnd: () => {}
+		onNewToken: (token: string) => {
+			if (shouldEmitMessageStart) {
+				onAssistantMessageStart?.()
+				shouldEmitMessageStart = false
+			}
+			onAssistantToken?.(token)
+		},
+		onMessageEnd: () => {
+			if (!shouldEmitMessageStart) {
+				onAssistantMessageEnd?.()
+			}
+			shouldEmitMessageStart = true
+		}
 	}
 
 	const abortController = new AbortController()
@@ -161,7 +143,7 @@ export async function runEval<THelpers, TOutput>(
 		return {
 			success: true,
 			output: getOutput(),
-			tokenUsage: { prompt: 0, completion: 0, total: 0 },
+			tokenUsage: result.tokenUsage,
 			toolCallsCount,
 			toolsCalled,
 			toolCallDetails,
diff --git a/ai_evals/adapters/frontend/core/shared/index.ts b/ai_evals/adapters/frontend/core/shared/index.ts
new file mode 100644
index 0000000000..290abc8b0f
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/shared/index.ts
@@ -0,0 +1,3 @@
+export type { TokenUsage, ToolCallDetail, EvalRunnerOptions, RawEvalResult } from './types'
+export type { RunEvalParams } from './baseEvalRunner'
+export { runEval } from './baseEvalRunner'
diff --git a/ai_evals/adapters/frontend/core/shared/providerConfig.test.ts b/ai_evals/adapters/frontend/core/shared/providerConfig.test.ts
new file mode 100644
index 0000000000..ad0ef7652a
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/shared/providerConfig.test.ts
@@ -0,0 +1,41 @@
+import { describe, expect, it } from "bun:test";
+import {
+  buildOpenAICompatibleClientOptions,
+  resolveEvalModelProvider,
+} from "./providerConfig";
+
+describe("buildOpenAICompatibleClientOptions", () => { // Gemini gets extra client options; OpenAI gets only the API key
+  it("adds Gemini's OpenAI-compatible base URL and client header", () => {
+    const options = buildOpenAICompatibleClientOptions("googleai", "gemini-test-key");
+
+    expect(options).toMatchObject({
+      apiKey: "gemini-test-key",
+      baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
+      defaultHeaders: {
+        "x-goog-api-client": "windmill-ai-evals/1.0",
+      },
+    });
+  });
+
+  it("keeps the default OpenAI-compatible config for OpenAI", () => { // toEqual: no keys besides apiKey may appear
+    expect(buildOpenAICompatibleClientOptions("openai", "openai-test-key")).toEqual({
+      apiKey: "openai-test-key",
+    });
+  });
+});
+
+describe("resolveEvalModelProvider", () => { // covers prefix inference and the explicit-provider path
+  it("infers googleai from Gemini model ids", () => {
+    expect(resolveEvalModelProvider("gemini-2.5-flash")).toEqual({
+      provider: "googleai",
+      model: "gemini-2.5-flash",
+    });
+  });
+
+  it("preserves an explicit provider", () => {
+    expect(resolveEvalModelProvider("gemini-2.5-pro", "googleai")).toEqual({
+      provider: "googleai",
+      model: "gemini-2.5-pro",
+    });
+  });
+});
diff --git a/ai_evals/adapters/frontend/core/shared/providerConfig.ts b/ai_evals/adapters/frontend/core/shared/providerConfig.ts
new file mode 100644
index 0000000000..44a698b2af
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/shared/providerConfig.ts
@@ -0,0 +1,71 @@
+import Anthropic from "@anthropic-ai/sdk";
+import OpenAI from "openai";
+import type { FrontendEvalModelConfig } from "../../../../core/models";
+
+export type FrontendEvalProvider = FrontendEvalModelConfig["provider"];
+
+export interface EvalClients { // createEvalClients always builds both; the one not in use gets the placeholder key "unused"
+  openai: OpenAI;
+  anthropic: Anthropic;
+}
+
+export interface ResolvedEvalModelProvider { // return shape of resolveEvalModelProvider
+  provider: FrontendEvalProvider;
+  model: string; // model id, returned unchanged
+}
+
+const GEMINI_OPENAI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"; // OpenAI client baseURL used for googleai
+const GEMINI_GOOG_API_CLIENT = "windmill-ai-evals/1.0"; // value sent in the x-goog-api-client header
+
+/** Builds OpenAI SDK constructor options; googleai also gets Gemini's base URL and client header. */
+export function buildOpenAICompatibleClientOptions(
+  provider: Exclude<FrontendEvalProvider, "anthropic">,
+  apiKey: string
+): ConstructorParameters<typeof OpenAI>[0] {
+  if (provider !== "googleai") {
+    return { apiKey };
+  }
+
+  const defaultHeaders = { "x-goog-api-client": GEMINI_GOOG_API_CLIENT };
+  return {
+    apiKey,
+    baseURL: GEMINI_OPENAI_BASE_URL,
+    defaultHeaders,
+  };
+}
+
+/** Creates both SDK clients; only the one matching `provider` receives the real API key. */
+export function createEvalClients(
+  provider: FrontendEvalProvider,
+  apiKey: string
+): EvalClients {
+  const placeholderKey = "unused";
+  if (provider === "anthropic") {
+    const openai = new OpenAI({ apiKey: placeholderKey });
+    return { openai, anthropic: new Anthropic({ apiKey }) };
+  }
+
+  // Every non-anthropic provider goes through the OpenAI-compatible client.
+  const openai = new OpenAI(buildOpenAICompatibleClientOptions(provider, apiKey));
+  const anthropic = new Anthropic({ apiKey: placeholderKey });
+  return { openai, anthropic };
+}
+
+/**
+ * Resolves the provider for `model`: an explicit `provider` wins, otherwise ids starting with
+ * "claude" map to anthropic, "gemini" to googleai, and everything else to openai.
+ */
+export function resolveEvalModelProvider(
+  model: string,
+  provider?: FrontendEvalProvider
+): ResolvedEvalModelProvider {
+  let resolved: FrontendEvalProvider = "openai";
+  if (provider) {
+    resolved = provider;
+  } else if (model.startsWith("claude")) {
+    resolved = "anthropic";
+  } else if (model.startsWith("gemini")) {
+    resolved = "googleai";
+  }
+  return { provider: resolved, model };
+}
diff --git a/ai_evals/adapters/frontend/core/shared/types.ts b/ai_evals/adapters/frontend/core/shared/types.ts
new file mode 100644
index 0000000000..4bc3a49b3c
--- /dev/null
+++ b/ai_evals/adapters/frontend/core/shared/types.ts
@@ -0,0 +1,32 @@
+import type { ChatCompletionMessageParam } from 'openai/resources/chat/completions.mjs'
+import type { AIProvider } from '$lib/gen/types.gen'
+
+export interface TokenUsage { // NOTE(review): presumably token counts reported by the chat loop — confirm
+	prompt: number
+	completion: number
+	total: number
+}
+
+export interface ToolCallDetail { // NOTE(review): presumably one recorded tool invocation — confirm against runEval
+	name: string
+	arguments: Record<string, unknown>
+}
+
+export interface EvalRunnerOptions { // optional settings read by runEval
+	maxIterations?: number // runEval defaults this to 20
+	model?: string // runEval defaults this to 'gpt-4o'
+	workspace?: string // runEval defaults this to 'test-workspace'
+	provider?: AIProvider // when omitted, resolved from the model id
+}
+
+export interface RawEvalResult<TOutput> { // result of a single eval run, before domain-specific evaluation
+	success: boolean
+	output: TOutput // value returned by the getOutput callback given to runEval
+	error?: string
+	tokenUsage: TokenUsage
+	toolCallsCount: number
+	toolsCalled: string[]
+	toolCallDetails: ToolCallDetail[]
+	iterations: number // surfaced as assistantMessageCount by the flow and script runners
+	messages: ChatCompletionMessageParam[]
+}
diff --git a/ai_evals/adapters/frontend/mockBackend.ts b/ai_evals/adapters/frontend/mockBackend.ts
new file mode 100644
index 0000000000..7a0bea443f
--- /dev/null
+++ b/ai_evals/adapters/frontend/mockBackend.ts
@@ -0,0 +1,270 @@
+import { randomUUID } from 'node:crypto'
+import type { CompletedJob, Flow, Script } from '../../../frontend/src/lib/gen'
+import type { ScriptLang } from '../../../frontend/src/lib/gen/types.gen'
+import { buildScriptLintResult } from './core/script/preview'
+
+const BENCHMARK_TIMESTAMP = '1970-01-01T00:00:00.000Z' // fixed created_at / edited_at for built scripts and flows
+
+export interface BenchmarkWorkspaceScript { // fixture script; expanded into a Script by buildBenchmarkScript
+	path: string // also determines the mock hash (`benchmark:<path>`)
+	summary: string
+	description?: string // defaults to '' in the built Script
+	language: Script['language']
+	schema?: Record<string, unknown> // defaults to {} in the built Script
+	content: string
+}
+
+export interface BenchmarkWorkspaceFlow { // fixture flow; expanded into a Flow by buildBenchmarkFlow
+	path: string
+	summary: string
+	description?: string // defaults to '' in the built Flow
+	schema?: Record<string, unknown> // defaults to {} in the built Flow
+	value: Flow['value']
+}
+
+export interface BenchmarkWorkspaceRunnables { // runnables registered for one workspace
+	scripts?: BenchmarkWorkspaceScript[] // treated as [] when omitted
+	flows?: BenchmarkWorkspaceFlow[] // treated as [] when omitted
+}
+
+type BenchmarkCompletedJob = CompletedJob & { type: 'CompletedJob' } // CompletedJob with a literal `type` tag
+
+const benchmarkWorkspaces = new Set<string>() // registered workspace identifiers
+const benchmarkWorkspaceRunnables = new Map<string, BenchmarkWorkspaceRunnables>() // keyed by workspace
+const benchmarkJobs = new Map<string, { workspace: string; job: BenchmarkCompletedJob }>() // keyed by job id
+
+/** Clears all registered workspaces, runnables and jobs. */
+export function resetBenchmarkMockBackend(): void {
+	const registries = [benchmarkWorkspaces, benchmarkWorkspaceRunnables, benchmarkJobs]
+	registries.forEach((registry) => registry.clear())
+}
+
+export function registerBenchmarkWorkspace(workspace: string): void {
+	benchmarkWorkspaces.add(workspace) // Set-backed, so registering the same workspace twice is harmless
+}
+
+export function registerBenchmarkWorkspaceRunnables(
+	workspace: string,
+	runnables: BenchmarkWorkspaceRunnables
+): void {
+	benchmarkWorkspaces.add(workspace) // also marks the workspace itself as registered
+	benchmarkWorkspaceRunnables.set(workspace, runnables) // replaces any runnables stored earlier for this workspace
+}
+
+/** Forgets `workspace`: its registration, its runnables and every job recorded for it. */
+export function unregisterBenchmarkWorkspace(workspace: string): void {
+	benchmarkWorkspaces.delete(workspace)
+	benchmarkWorkspaceRunnables.delete(workspace)
+	const staleJobIds = [...benchmarkJobs]
+		.filter(([, entry]) => entry.workspace === workspace)
+		.map(([jobId]) => jobId)
+	staleJobIds.forEach((jobId) => benchmarkJobs.delete(jobId))
+}
+
+export function unregisterBenchmarkWorkspaceRunnables(workspace: string): void {
+	unregisterBenchmarkWorkspace(workspace) // alias: drops the whole workspace (registration and jobs too), not only its runnables
+}
+
+export function hasBenchmarkWorkspace(workspace: string): boolean {
+	return benchmarkWorkspaces.has(workspace) // true after either register* call, until unregistered or reset
+}
+
+/** Lists the workspace's fixture scripts as Script objects. */
+export function listBenchmarkScripts(workspace: string): Script[] | null {
+	const registered = benchmarkWorkspaceRunnables.get(workspace)
+	// null means no runnables were registered here; a registration without scripts yields [].
+	if (!registered) return null
+	return (registered.scripts ?? []).map(buildBenchmarkScript)
+}
+
+/** Lists the workspace's fixture flows as Flow objects. */
+export function listBenchmarkFlows(workspace: string): Flow[] | null {
+	const registered = benchmarkWorkspaceRunnables.get(workspace)
+	// null means no runnables were registered here; a registration without flows yields [].
+	if (!registered) return null
+	return (registered.flows ?? []).map(buildBenchmarkFlow)
+}
+
+/** Finds the fixture script registered at `path` in `workspace`; null when there is no match. */
+export function getBenchmarkScriptByPath(workspace: string, path: string): Script | null {
+	const candidates = benchmarkWorkspaceRunnables.get(workspace)?.scripts ?? []
+	const match = candidates.find((candidate) => candidate.path === path)
+	if (!match) return null
+	return buildBenchmarkScript(match)
+}
+
+/** Finds the fixture script whose mock hash (derived from its path) equals `hash`; null otherwise. */
+export function getBenchmarkScriptByHash(workspace: string, hash: string): Script | null {
+	const candidates = benchmarkWorkspaceRunnables.get(workspace)?.scripts ?? []
+	const match = candidates.find((candidate) => buildBenchmarkScriptHash(candidate.path) === hash)
+	if (!match) return null
+	return buildBenchmarkScript(match)
+}
+
+/** Finds the fixture flow registered at `path` in `workspace`; null when there is no match. */
+export function getBenchmarkFlowByPath(workspace: string, path: string): Flow | null {
+	const candidates = benchmarkWorkspaceRunnables.get(workspace)?.flows ?? []
+	const match = candidates.find((candidate) => candidate.path === path)
+	if (!match) return null
+	return buildBenchmarkFlow(match)
+}
+
+/** Stores a mock CompletedJob for `input.workspace` and returns its id; `success` defaults to true. */
+export function createBenchmarkCompletedJob(input: {
+	workspace: string
+	jobKind: CompletedJob['job_kind']
+	success?: boolean
+	result?: unknown
+	logs?: string
+	scriptPath?: string
+	scriptHash?: string
+	args?: Record<string, unknown>
+}): string {
+	const id = `benchmark-job-${randomUUID()}`
+	const timestamp = new Date().toISOString()
+	const job: BenchmarkCompletedJob = {
+		type: 'CompletedJob',
+		id,
+		workspace_id: input.workspace,
+		created_by: 'ai-evals',
+		created_at: timestamp,
+		started_at: timestamp,
+		completed_at: timestamp,
+		duration_ms: 0,
+		success: input.success ?? true,
+		script_path: input.scriptPath,
+		script_hash: input.scriptHash,
+		args: input.args,
+		result: input.result,
+		logs: input.logs,
+		canceled: false,
+		job_kind: input.jobKind,
+		permissioned_as: 'u/ai-evals',
+		is_flow_step: false,
+		is_skipped: false,
+		email: 'ai-evals@local',
+		visible_to_owner: true,
+		tag: 'benchmark'
+	}
+	benchmarkJobs.set(id, { workspace: input.workspace, job })
+	return id
+}
+
+/** Returns a deep copy of job `jobId`, or null when it is unknown or belongs to another workspace. */
+export function getBenchmarkCompletedJob(
+	workspace: string,
+	jobId: string
+): BenchmarkCompletedJob | null {
+	const stored = benchmarkJobs.get(jobId)
+	// Jobs are only visible from the workspace that created them.
+	if (stored && stored.workspace === workspace) return structuredClone(stored.job)
+	return null
+}
+
+/**
+ * Mock script preview: lints the submitted code (language defaults to 'bun') and records a
+ * 'preview' job that succeeds only when the lint result has no errors. Returns the job id.
+ */
+export function runBenchmarkScriptPreview(input: {
+	workspace: string
+	requestBody: {
+		content?: string
+		language?: ScriptLang | 'bunnative'
+		args?: Record<string, unknown>
+		path?: string
+	}
+}): string {
+	const body = input.requestBody
+	const lint = buildScriptLintResult(body.content ?? '', body.language ?? 'bun')
+	const success = lint.errorCount === 0
+	const base = { path: body.path, args: body.args ?? {} }
+	const result = success
+		? { ...base, validated: true }
+		: {
+				...base,
+				errorCount: lint.errorCount,
+				errors: lint.errors.map((entry) => ({
+					line: entry.startLineNumber,
+					message: entry.message
+				}))
+			}
+
+	return createBenchmarkCompletedJob({
+		workspace: input.workspace,
+		jobKind: 'preview',
+		success,
+		scriptPath: body.path,
+		args: body.args,
+		result
+	})
+}
+
+/**
+ * Mock flow run: records a 'flowpreview' job that succeeds when a fixture flow is registered
+ * at `input.path`, and fails with a "not found" result and log otherwise.
+ * Returns the id of the recorded job.
+ */
+export function runBenchmarkFlowByPath(input: {
+	workspace: string
+	path: string
+	args?: Record<string, unknown>
+}): string {
+	const { workspace, path, args } = input
+	// The flow itself is never executed; only its presence in the workspace is checked.
+	const found = getBenchmarkFlowByPath(workspace, path) !== null
+
+	return createBenchmarkCompletedJob({
+		workspace,
+		jobKind: 'flowpreview',
+		success: found,
+		args,
+		result: found
+			? { path, args: args ?? {}, mocked: true }
+			: { error: `Flow "${path}" not found in benchmark workspace` },
+		logs: found
+			? 'Mock benchmark flow run completed successfully.'
+			: `Flow "${path}" not found in benchmark workspace.`
+	})
+}
+
+function buildBenchmarkScriptHash(path: string): string {
+	return `benchmark:${path}` // deterministic pseudo-hash; getBenchmarkScriptByHash matches scripts by recomputing it
+}
+
+function buildBenchmarkScript(script: BenchmarkWorkspaceScript): Script {
+	return {
+		workspace_id: 'benchmark', // NOTE(review): fixed value, not the workspace the script was registered under — confirm intended
+		hash: buildBenchmarkScriptHash(script.path), // derived from the path, not from the content
+		path: script.path,
+		parent_hashes: [],
+		summary: script.summary,
+		description: script.description ?? '',
+		content: script.content,
+		created_by: 'benchmark',
+		created_at: BENCHMARK_TIMESTAMP, // constant timestamp, identical for every built script
+		archived: false,
+		schema: script.schema ?? {},
+		deleted: false,
+		is_template: false,
+		extra_perms: {},
+		language: script.language,
+		kind: 'script',
+		starred: false,
+		has_preprocessor: false,
+		modules: null
+	}
+}
+
+function buildBenchmarkFlow(flow: BenchmarkWorkspaceFlow): Flow {
+	return {
+		path: flow.path,
+		summary: flow.summary,
+		description: flow.description ?? '',
+		value: flow.value, // passed by reference, not cloned
+		schema: flow.schema ?? {},
+		edited_by: 'benchmark',
+		edited_at: BENCHMARK_TIMESTAMP, // constant timestamp, identical for every built flow
+		archived: false,
+		extra_perms: {}
+	} as Flow // cast: only the fields listed above are populated
+}
diff --git a/ai_evals/adapters/frontend/progress.ts b/ai_evals/adapters/frontend/progress.ts
new file mode 100644
index 0000000000..5ec414c9fd
--- /dev/null
+++ b/ai_evals/adapters/frontend/progress.ts
@@ -0,0 +1,133 @@
+export type FrontendBenchmarkProgressSurface = 'flow' | 'app' | 'script' // benchmark surface a progress event refers to
+
+export type FrontendBenchmarkProgressEvent = // discriminated on `type`; serialized as JSON by emitFrontendBenchmarkProgress
+	| {
+			type: 'run-start' // the only variant without per-case fields; carries run-wide totals
+			surface: FrontendBenchmarkProgressSurface
+			totalCases: number
+			runs: number
+			concurrency: number
+	  }
+	| {
+			type: 'attempt-start'
+			surface: FrontendBenchmarkProgressSurface
+			caseId: string
+			caseNumber: number
+			totalCases: number
+			attempt: number
+			runs: number
+	  }
+	| {
+			type: 'attempt-finish' // per-case fields plus the attempt outcome
+			surface: FrontendBenchmarkProgressSurface
+			caseId: string
+			caseNumber: number
+			totalCases: number
+			attempt: number
+			runs: number
+			passed: boolean
+			durationMs: number
+			judgeScore: number | null
+			error: string | null
+	  }
+	| {
+			type: 'assistant-message-start'
+			surface: FrontendBenchmarkProgressSurface
+			caseId: string
+			caseNumber: number
+			totalCases: number
+			attempt: number
+			runs: number
+	  }
+	| {
+			type: 'assistant-chunk' // per-case fields plus one piece of assistant text
+			surface: FrontendBenchmarkProgressSurface
+			caseId: string
+			caseNumber: number
+			totalCases: number
+			attempt: number
+			runs: number
+			chunk: string
+	  }
+	| {
+			type: 'assistant-message-end'
+			surface: FrontendBenchmarkProgressSurface
+			caseId: string
+			caseNumber: number
+			totalCases: number
+			attempt: number
+			runs: number
+	  }
+
+export const FRONTEND_BENCHMARK_PROGRESS_PREFIX = 'WMILL_FRONTEND_AI_EVAL_PROGRESS ' // line prefix (trailing space included) that marks a stderr line as a JSON progress event
+
+export function emitFrontendBenchmarkProgress(event: FrontendBenchmarkProgressEvent): void { // Writes one progress event to stderr as a single line
+	process.stderr.write(
+		`${FRONTEND_BENCHMARK_PROGRESS_PREFIX}${JSON.stringify(event)}\n` // prefix + JSON payload + newline
+	)
+}
+
+export function parseFrontendBenchmarkProgressLine( // Inverse of emitFrontendBenchmarkProgress for one line (without its newline)
+	line: string
+): FrontendBenchmarkProgressEvent | null {
+	if (!line.startsWith(FRONTEND_BENCHMARK_PROGRESS_PREFIX)) {
+		return null // not a progress line
+	}
+
+	try {
+		const parsed = JSON.parse(
+			line.slice(FRONTEND_BENCHMARK_PROGRESS_PREFIX.length)
+		) as FrontendBenchmarkProgressEvent // cast only: the payload shape is not validated
+		return parsed?.type ? parsed : null // the only check is a truthy `type` property
+	} catch {
+		return null // malformed JSON is treated the same as a non-progress line
+	}
+}
+
+export function formatFrontendBenchmarkProgressEvent( // Renders a progress event as one human-readable line
+	event: FrontendBenchmarkProgressEvent
+): string {
+	switch (event.type) {
+		case 'run-start':
+			return `Running ${event.surface}: ${event.totalCases} cases x ${event.runs} run${event.runs === 1 ? '' : 's'}, concurrency ${event.concurrency}`
+		case 'attempt-start':
+			return `${formatCasePrefix(event.caseNumber, event.totalCases)} ${event.caseId} attempt ${event.attempt}/${event.runs}...`
+		case 'attempt-finish': {
+			const parts = [ // segments are joined with ' | ' below
+				`${formatCasePrefix(event.caseNumber, event.totalCases)} ${event.caseId} attempt ${event.attempt}/${event.runs} ${event.passed ? 'pass' : 'fail'}`,
+				formatDuration(event.durationMs)
+			]
+			if (event.judgeScore !== null) {
+				parts.push(`judge ${formatNumber(event.judgeScore)}`)
+			}
+			if (event.error) {
+				parts.push(truncateSingleLine(event.error, 120)) // error text is collapsed to one line and capped at 120 characters
+			}
+			return parts.join(' | ')
+		}
+		case 'assistant-message-start':
+		case 'assistant-chunk':
+		case 'assistant-message-end':
+			return '' // assistant streaming events have no single-line rendering here
+	}
+}
+
+function formatCasePrefix(caseNumber: number, totalCases: number): string { // e.g. (2, 10) -> "[2/10]"
+	return `[${caseNumber}/${totalCases}]`
+}
+
+function formatDuration(durationMs: number): string { // milliseconds -> seconds with an "s" suffix, e.g. 1500 -> "1.5s"
+	return `${formatNumber(durationMs / 1000)}s`
+}
+
+function formatNumber(value: number): string { // integers print as-is; other values are fixed to one decimal place
+	return Number.isInteger(value) ? String(value) : value.toFixed(1)
+}
+
+function truncateSingleLine(value: string, maxLength: number): string { // Collapses whitespace to one line and caps the length
+	const normalized = value.replace(/\s+/g, ' ').trim() // every whitespace run (including newlines) becomes a single space
+	if (normalized.length <= maxLength) {
+		return normalized
+	}
+	return `${normalized.slice(0, Math.max(0, maxLength - 3))}...` // reserves three characters for the ellipsis; for maxLength < 3 the result is just "..."
+}
diff --git a/ai_evals/adapters/frontend/runtime.ts b/ai_evals/adapters/frontend/runtime.ts
new file mode 100644
index 0000000000..8828cd63af
--- /dev/null
+++ b/ai_evals/adapters/frontend/runtime.ts
@@ -0,0 +1,216 @@
+import { spawn } from 'node:child_process'
+import { mkdtemp, readFile, rm } from 'node:fs/promises'
+import { tmpdir } from 'node:os'
+import path from 'node:path'
+import { fileURLToPath } from 'node:url'
+import {
+	formatFrontendBenchmarkProgressEvent,
+	parseFrontendBenchmarkProgressLine
+} from './progress'
+import type { BenchmarkRunResult } from '../../core/types'
+
+const REPO_ROOT = fileURLToPath(new URL('../../../', import.meta.url)) // three directories above this module
+const FRONTEND_DIR = path.join(REPO_ROOT, 'frontend') // used as the working directory for the vitest child process
+const FRONTEND_BENCHMARK_TEST = '../ai_evals/adapters/frontend/vitestAdapter.test.ts' // relative path passed to vitest, which runs with cwd FRONTEND_DIR
+const FRONTEND_BENCHMARK_CONFIG = '../ai_evals/adapters/frontend/vitest.config.ts' // relative path passed to vitest via --config
+
+export type FrontendMode = 'flow' | 'app' | 'script' // value forwarded to the child as WMILL_FRONTEND_AI_EVAL_MODE
+
+/**
+ * Runs the frontend benchmark by spawning vitest on the adapter test inside `frontend/`.
+ * Inputs are handed to the child through `WMILL_FRONTEND_AI_EVAL_*` environment variables and
+ * the result is read back as JSON from a file in a temporary directory, which is always removed.
+ * @throws Error prefixed with "Frontend benchmark adapter failed:" when vitest fails or the
+ * result file cannot be read or parsed.
+ */
+export async function runFrontendBenchmarkAdapter(input: {
+	mode: FrontendMode
+	caseIds: string[]
+	runs: number
+	model?: string
+	verbose?: boolean
+}): Promise<BenchmarkRunResult> {
+	const tempDir = await mkdtemp(path.join(tmpdir(), 'wmill-frontend-benchmark-'))
+	const outputPath = path.join(tempDir, 'result.json')
+
+	try {
+		await runVitestBenchmark(
+			path.join(FRONTEND_DIR, 'node_modules', '.bin', 'vitest'),
+			['run', FRONTEND_BENCHMARK_TEST, '--project', 'server', '--config', FRONTEND_BENCHMARK_CONFIG],
+			{
+				cwd: FRONTEND_DIR,
+				env: {
+					...process.env,
+					BROWSERSLIST_IGNORE_OLD_DATA: '1',
+					WMILL_FRONTEND_AI_EVAL_OUTPUT_PATH: outputPath,
+					WMILL_FRONTEND_AI_EVAL_MODE: input.mode,
+					WMILL_FRONTEND_AI_EVAL_CASE_IDS: JSON.stringify(input.caseIds),
+					WMILL_FRONTEND_AI_EVAL_RUNS: String(input.runs),
+					WMILL_FRONTEND_AI_EVAL_MODEL: input.model ?? '', // empty string when no model was requested
+					WMILL_FRONTEND_AI_EVAL_PROGRESS: '1',
+					WMILL_FRONTEND_AI_EVAL_VERBOSE: input.verbose ? '1' : '0'
+				}
+			}
+		)
+
+		const raw = await readFile(outputPath, 'utf8')
+		return JSON.parse(raw) as BenchmarkRunResult // cast only: the payload shape is not validated here
+	} catch (error) {
+		throw new Error(`Frontend benchmark adapter failed:\n${toErrorMessage(error)}`)
+	} finally {
+		await rm(tempDir, { recursive: true, force: true })
+	}
+}
+
+/**
+ * Spawns vitest, buffers stdout, and routes stderr through `drainProgressLines`. Resolves on
+ * exit code 0; otherwise rejects with the exit code (or signal), stdout, and passthrough stderr.
+ */
+async function runVitestBenchmark(
+	command: string,
+	args: string[],
+	options: {
+		cwd: string
+		env: NodeJS.ProcessEnv
+	}
+): Promise<void> {
+	const child = spawn(command, args, {
+		cwd: options.cwd,
+		env: options.env,
+		stdio: ['ignore', 'pipe', 'pipe']
+	})
+
+	let stdout = ''
+	let stderr = ''
+	let stderrLineBuffer = ''
+	let assistantStreamOpen = false
+
+	const drainStderr = (text: string): void => {
+		const drained = drainProgressLines(text, assistantStreamOpen)
+		stderrLineBuffer = drained.remainder
+		stderr += drained.passthrough
+		assistantStreamOpen = drained.nextAssistantStreamOpen
+	}
+
+	child.stdout?.setEncoding('utf8')
+	child.stdout?.on('data', (chunk: string) => {
+		stdout += chunk
+	})
+
+	child.stderr?.setEncoding('utf8')
+	child.stderr?.on('data', (chunk: string) => {
+		drainStderr(stderrLineBuffer + chunk)
+	})
+
+	await new Promise<void>((resolve, reject) => {
+		child.once('error', reject)
+		child.once('close', (code, signal) => {
+			if (stderrLineBuffer.length > 0) {
+				// Terminate the final partial line so it is still parsed or echoed.
+				drainStderr(`${stderrLineBuffer}\n`)
+			}
+
+			if (code === 0) {
+				if (assistantStreamOpen) {
+					process.stderr.write('\n')
+				}
+				resolve()
+				return
+			}
+
+			// `code` is null when the child was terminated by a signal; report the signal instead.
+			const exit = code === null ? `signal ${signal}` : `code ${code}`
+			reject(new Error([`vitest exited with ${exit}`, stdout, stderr].filter(Boolean).join('\n')))
+		})
+	})
+}
+
+/**
+ * Splits `buffer` into complete lines: progress events are rendered to stderr, suppressed
+ * noise is dropped, everything else is echoed and collected in `passthrough`. Returns the
+ * trailing partial line as `remainder` plus the updated assistant-stream flag.
+ */
+function drainProgressLines(
+	buffer: string,
+	initialAssistantStreamOpen: boolean
+): {
+	remainder: string
+	passthrough: string
+	nextAssistantStreamOpen: boolean
+} {
+	let remainder = buffer
+	let passthrough = ''
+	let assistantStreamOpen = initialAssistantStreamOpen
+
+	while (true) {
+		const newlineIndex = remainder.indexOf('\n')
+		if (newlineIndex === -1) {
+			return { remainder, passthrough, nextAssistantStreamOpen: assistantStreamOpen } // no complete line left
+		}
+
+		const line = remainder.slice(0, newlineIndex).replace(/\r$/, '') // tolerate CRLF line endings
+		remainder = remainder.slice(newlineIndex + 1)
+
+		const progressEvent = parseFrontendBenchmarkProgressLine(line)
+		if (progressEvent) {
+			if (progressEvent.type === 'assistant-message-start') {
+				if (assistantStreamOpen) {
+					process.stderr.write('\n') // close the previous, still-open assistant stream first
+				}
+				process.stderr.write(
+					`${formatCasePrefix(progressEvent.caseNumber, progressEvent.totalCases)} ${progressEvent.caseId} attempt ${progressEvent.attempt}/${progressEvent.runs} assistant:\n`
+				)
+				assistantStreamOpen = true
+				continue
+			}
+
+			if (progressEvent.type === 'assistant-chunk') {
+				process.stderr.write(progressEvent.chunk) // chunks are written verbatim, without a newline
+				continue
+			}
+
+			if (progressEvent.type === 'assistant-message-end') {
+				if (assistantStreamOpen) {
+					process.stderr.write('\n')
+				}
+				assistantStreamOpen = false
+				continue
+			}
+
+			if (assistantStreamOpen) {
+				process.stderr.write('\n') // any other event interrupts an open assistant stream
+				assistantStreamOpen = false
+			}
+			process.stderr.write(`${formatFrontendBenchmarkProgressEvent(progressEvent)}\n`)
+			continue
+		}
+
+		if (shouldSuppressFrontendStderrLine(line)) {
+			continue
+		}
+
+		passthrough += `${line}\n`
+		process.stderr.write(`${line}\n`)
+	}
+}
+
+function formatCasePrefix(caseNumber: number, totalCases: number): string { // e.g. (2, 10) -> "[2/10]"
+	return `[${caseNumber}/${totalCases}]`
+}
+
+function shouldSuppressFrontendStderrLine(line: string): boolean { // True for stderr lines matching the notices listed below; such lines are neither echoed nor collected
+	return (
+		line.startsWith('[baseline-browser-mapping] ') ||
+		line.startsWith('Browserslist: browsers data (caniuse-lite) is ') ||
+		line.includes('update-browserslist-db@latest') ||
+		line.includes('update-db#readme')
+	)
+}
+
+function toErrorMessage(error: unknown): string { // Extracts a printable message from any thrown value
+	if (error instanceof Error) {
+		return error.message
+	}
+	return String(error) // non-Error throwables are stringified
+}
diff --git a/ai_evals/adapters/frontend/vitest.config.ts b/ai_evals/adapters/frontend/vitest.config.ts
new file mode 100644
index 0000000000..daed7749c1
--- /dev/null
+++ b/ai_evals/adapters/frontend/vitest.config.ts
@@ -0,0 +1,28 @@
+import { fileURLToPath } from 'node:url'
+import frontendConfig from '../../../frontend/vite.config.js'
+
+const FRONTEND_VITE_CONFIG_PATH = fileURLToPath(new URL('../../../frontend/vite.config.js', import.meta.url))
+const FRONTEND_TEST_SETUP_PATH = fileURLToPath(
+	new URL('../../../frontend/src/lib/test-setup.ts', import.meta.url)
+)
+const ADAPTER_TEST_PATH = fileURLToPath(new URL('./vitestAdapter.test.ts', import.meta.url))
+
+const config = { // frontend Vite config with `test.projects` replaced by a single project
+	...frontendConfig,
+	test: {
+		...frontendConfig.test,
+		projects: [
+			{
+				extends: FRONTEND_VITE_CONFIG_PATH,
+				test: {
+					name: 'server', // matches the `--project server` argument used by runtime.ts
+					environment: 'node',
+					include: [ADAPTER_TEST_PATH], // only the adapter test is included
+					setupFiles: [FRONTEND_TEST_SETUP_PATH]
+				}
+			}
+		]
+	}
+}
+
+export default config
diff --git a/ai_evals/adapters/frontend/vitestAdapter.test.ts b/ai_evals/adapters/frontend/vitestAdapter.test.ts
new file mode 100644
index 0000000000..9256bc9334
--- /dev/null
+++ b/ai_evals/adapters/frontend/vitestAdapter.test.ts
@@ -0,0 +1,165 @@
+import { expect, it, vi } from 'vitest'
+// @ts-ignore - Node.js fs/promises
+import { mkdir, writeFile } from 'fs/promises'
+// @ts-ignore - Node.js path
+import { dirname, resolve } from 'path'
+
+vi.mock('monaco-editor', () => ({ // replaces monaco-editor with a minimal stub for this test file
+	editor: {},
+	languages: {},
+	KeyCode: {},
+	Uri: {
+		parse: (value: string) => ({ toString: () => value }) // round-trips the input string
+	},
+	MarkerSeverity: {
+		Error: 8,
+		Warning: 4,
+		Info: 2,
+		Hint: 1
+	}
+}))
+
+vi.mock('@codingame/monaco-vscode-standalone-typescript-language-features', () => ({ // stubbed: worker getter resolves to a function returning an empty object
+	getTypeScriptWorker: async () => async () => ({}),
+	typescriptVersion: 'test'
+}))
+
+vi.mock('@codingame/monaco-vscode-languages-service-override', () => ({ // stubbed: default export returns an empty object
+	default: () => ({})
+}))
+
+vi.mock('$lib/components/vscode', () => ({})) // stubbed with an empty module
+
+vi.mock('$lib/gen', async () => { // Wraps the generated API client: benchmark workspaces are served by mockBackend, everything else by the real services
+	const actual = await vi.importActual<any>('$lib/gen')
+	const {
+		getBenchmarkCompletedJob,
+		getBenchmarkFlowByPath,
+		getBenchmarkScriptByHash,
+		getBenchmarkScriptByPath,
+		hasBenchmarkWorkspace,
+		listBenchmarkFlows,
+		listBenchmarkScripts,
+		runBenchmarkFlowByPath,
+		runBenchmarkScriptPreview
+	} = await import('./mockBackend')
+
+	function wrapService<T extends object>(target: T, overrides: Record<string, unknown>): T { // Proxy: string keys present in `overrides` win, all other property reads go to `target`
+		return new Proxy(target, {
+			get(source, property, receiver) {
+				if (typeof property === 'string' && property in overrides) {
+					return overrides[property]
+				}
+				return Reflect.get(source, property, receiver)
+			}
+		})
+	}
+
+	return {
+		...actual,
+		ScriptService: wrapService(actual.ScriptService, {
+			listScripts: async (data: { workspace: string }) =>
+				hasBenchmarkWorkspace(data.workspace)
+					? (listBenchmarkScripts(data.workspace) ?? [])
+					: actual.ScriptService.listScripts(data),
+			getScriptByPath: async (data: { workspace: string; path: string }) => {
+				if (hasBenchmarkWorkspace(data.workspace)) {
+					const script = getBenchmarkScriptByPath(data.workspace, data.path)
+					if (!script) {
+						throw new Error(`Script "${data.path}" not found in benchmark workspace`) // a missing fixture is an error; no fallback to the real service
+					}
+					return script
+				}
+				return actual.ScriptService.getScriptByPath(data)
+			},
+			getScriptByHash: async (data: { workspace: string; hash: string }) => {
+				if (hasBenchmarkWorkspace(data.workspace)) {
+					const script = getBenchmarkScriptByHash(data.workspace, data.hash)
+					if (!script) {
+						throw new Error(`Script hash "${data.hash}" not found in benchmark workspace`)
+					}
+					return script
+				}
+				return actual.ScriptService.getScriptByHash(data)
+			}
+		}),
+		FlowService: wrapService(actual.FlowService, {
+			listFlows: async (data: { workspace: string }) =>
+				hasBenchmarkWorkspace(data.workspace)
+					? (listBenchmarkFlows(data.workspace) ?? [])
+					: actual.FlowService.listFlows(data),
+			getFlowByPath: async (data: { workspace: string; path: string }) => {
+				if (hasBenchmarkWorkspace(data.workspace)) {
+					const flow = getBenchmarkFlowByPath(data.workspace, data.path)
+					if (!flow) {
+						throw new Error(`Flow "${data.path}" not found in benchmark workspace`)
+					}
+					return flow
+				}
+				return actual.FlowService.getFlowByPath(data)
+			}
+		}),
+		JobService: wrapService(actual.JobService, {
+			runScriptPreview: async (data: {
+				workspace: string
+				requestBody?: {
+					content?: string
+					language?: string
+					args?: Record<string, unknown>
+					path?: string
+				}
+			}) =>
+				hasBenchmarkWorkspace(data.workspace)
+					? runBenchmarkScriptPreview({
+							workspace: data.workspace,
+							requestBody: data.requestBody ?? {}
+						})
+					: actual.JobService.runScriptPreview(data),
+			runFlowByPath: async (data: {
+				workspace: string
+				path: string
+				requestBody?: Record<string, unknown>
+			}) =>
+				hasBenchmarkWorkspace(data.workspace)
+					? runBenchmarkFlowByPath({
+							workspace: data.workspace,
+							path: data.path,
+							args: data.requestBody // the request body is passed through as the flow args
+						})
+					: actual.JobService.runFlowByPath(data),
+			getJob: async (data: { workspace: string; id: string }) => {
+				if (hasBenchmarkWorkspace(data.workspace)) {
+					const job = getBenchmarkCompletedJob(data.workspace, data.id)
+					if (!job) {
+						throw new Error(`Job "${data.id}" not found in benchmark workspace`)
+					}
+					return job
+				}
+				return actual.JobService.getJob(data)
+			}
+		})
+	}
+})
+
+const benchmarkOutputPath = process.env.WMILL_FRONTEND_AI_EVAL_OUTPUT_PATH // set by runtime.ts when it spawns vitest
+const benchmarkIt = benchmarkOutputPath ? it : it.skip // the test is skipped when no output path is provided
+
+benchmarkIt(
+	'runs the frontend benchmark adapter from environment input',
+	async () => {
+		const { resetBenchmarkMockBackend } = await import('./mockBackend')
+		resetBenchmarkMockBackend() // reset before the run; reset again in `finally`
+		const { runFrontendBenchmarkFromEnv } = await import('./benchmarkRunner')
+		try {
+			const payload = await runFrontendBenchmarkFromEnv()
+			const absoluteOutputPath = resolve(benchmarkOutputPath!)
+			await mkdir(dirname(absoluteOutputPath), { recursive: true })
+			await writeFile(absoluteOutputPath, JSON.stringify(payload, null, 2) + '\n', 'utf8') // written before the assertion, so the file exists even if it fails
+
+			expect(payload.cases.length).toBeGreaterThan(0)
+		} finally {
+			resetBenchmarkMockBackend()
+		}
+	},
+	600_000 // test timeout in milliseconds (10 minutes)
+)
diff --git a/ai_evals/bun.lock b/ai_evals/bun.lock
new file mode 100644
index 0000000000..eaed1db99a
--- /dev/null
+++ b/ai_evals/bun.lock
@@ -0,0 +1,313 @@
+{
+  "lockfileVersion": 1,
+  "configVersion": 1,
+  "workspaces": {
+    "": {
+      "name": "windmill-ai-evals",
+      "dependencies": {
+        "@anthropic-ai/claude-agent-sdk": "^0.2.25",
+        "@anthropic-ai/sdk": "^0.39.0",
+        "commander": "^14.0.3",
+        "openai": "^6.9.1",
+        "yaml": "^2.8.3",
+      },
+      "devDependencies": {
+        "@types/bun": "latest",
+        "typescript": "^5.0.0",
+      },
+    },
+  },
+  "packages": {
+    "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.87", "", { "dependencies": { "@anthropic-ai/sdk": "^0.74.0", "@modelcontextprotocol/sdk": "^1.27.1" }, "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-WWmgBPxPhBOvNT0ujI8vPTI2lK+w5YEkEZ/y1mH0EDkK/0kBnxVJNhCtG5vnueiAViwLoUOFn66pbkDiivijdA=="],
+
+    "@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.39.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" } }, "sha512-eMyDIPRZbt1CCLErRCi3exlAvNkBtRe+kW5vvJyef93PmNr/clstYgHhtvmkxN82nlKgzyGPCyGxrm0JQ1ZIdg=="],
+
+    "@babel/runtime": ["@babel/runtime@7.29.2", "", {}, "sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g=="],
+
+    "@hono/node-server": ["@hono/node-server@1.19.12", "", { "peerDependencies": { "hono": "^4" } }, "sha512-txsUW4SQ1iilgE0l9/e9VQWmELXifEFvmdA1j6WFh/aFPj99hIntrSsq/if0UWyGVkmrRPKA1wCeP+UCr1B9Uw=="],
+
+    "@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.2.4" }, "os": "darwin", "cpu": "arm64" }, "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w=="],
+
+    "@img/sharp-darwin-x64": ["@img/sharp-darwin-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-x64": "1.2.4" }, "os": "darwin", "cpu": "x64" }, "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw=="],
+
+    "@img/sharp-libvips-darwin-arm64": ["@img/sharp-libvips-darwin-arm64@1.2.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g=="],
+
+    "@img/sharp-libvips-darwin-x64": ["@img/sharp-libvips-darwin-x64@1.2.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg=="],
+
+    "@img/sharp-libvips-linux-arm": ["@img/sharp-libvips-linux-arm@1.2.4", "", { "os": "linux", "cpu": "arm" }, "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A=="],
+
+    "@img/sharp-libvips-linux-arm64": ["@img/sharp-libvips-linux-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw=="],
+
+    "@img/sharp-libvips-linux-x64": ["@img/sharp-libvips-linux-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw=="],
+
+    "@img/sharp-libvips-linuxmusl-arm64": ["@img/sharp-libvips-linuxmusl-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw=="],
+
+    "@img/sharp-libvips-linuxmusl-x64": ["@img/sharp-libvips-linuxmusl-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg=="],
+
+    "@img/sharp-linux-arm": ["@img/sharp-linux-arm@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm": "1.2.4" }, "os": "linux", "cpu": "arm" }, "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw=="],
+
+    "@img/sharp-linux-arm64": ["@img/sharp-linux-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg=="],
+
+    "@img/sharp-linux-x64": ["@img/sharp-linux-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ=="],
+
+    "@img/sharp-linuxmusl-arm64": ["@img/sharp-linuxmusl-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg=="],
+
+    "@img/sharp-linuxmusl-x64": ["@img/sharp-linuxmusl-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q=="],
+
+    "@img/sharp-win32-arm64": ["@img/sharp-win32-arm64@0.34.5", "", { "os": "win32", "cpu": "arm64" }, "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g=="],
+
+    "@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.34.5", "", { "os": "win32", "cpu": "x64" }, "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw=="],
+
+    "@modelcontextprotocol/sdk": ["@modelcontextprotocol/sdk@1.29.0", "", { "dependencies": { "@hono/node-server": "^1.19.9", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.2.1", "express-rate-limit": "^8.2.1", "hono": "^4.11.4", "jose": "^6.1.3", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.1" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ=="],
+
+    "@types/bun": ["@types/bun@1.3.11", "", { "dependencies": { "bun-types": "1.3.11" } }, "sha512-5vPne5QvtpjGpsGYXiFyycfpDF2ECyPcTSsFBMa0fraoxiQyMJ3SmuQIGhzPg2WJuWxVBoxWJ2kClYTcw/4fAg=="],
+
+    "@types/node": ["@types/node@18.19.130", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg=="],
+
+    "@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="],
+
+    "abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="],
+
+    "accepts": ["accepts@2.0.0", "", { "dependencies": { "mime-types": "^3.0.0", "negotiator": "^1.0.0" } }, "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng=="],
+
+    "agentkeepalive": ["agentkeepalive@4.6.0", "", { "dependencies": { "humanize-ms": "^1.2.1" } }, "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ=="],
+
+    "ajv": ["ajv@8.18.0", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A=="],
+
+    "ajv-formats": ["ajv-formats@3.0.1", "", { "dependencies": { "ajv": "^8.0.0" } }, "sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ=="],
+
+    "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="],
+
+    "body-parser": ["body-parser@2.2.2", "", { "dependencies": { "bytes": "^3.1.2", "content-type": "^1.0.5", "debug": "^4.4.3", "http-errors": "^2.0.0", "iconv-lite": "^0.7.0", "on-finished": "^2.4.1", "qs": "^6.14.1", "raw-body": "^3.0.1", "type-is": "^2.0.1" } }, "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA=="],
+
+    "bun-types": ["bun-types@1.3.11", "", { "dependencies": { "@types/node": "*" } }, "sha512-1KGPpoxQWl9f6wcZh57LvrPIInQMn2TQ7jsgxqpRzg+l0QPOFvJVH7HmvHo/AiPgwXy+/Thf6Ov3EdVn1vOabg=="],
+
+    "bytes": ["bytes@3.1.2", "", {}, "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg=="],
+
+    "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="],
+
+    "call-bound": ["call-bound@1.0.4", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "get-intrinsic": "^1.3.0" } }, "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg=="],
+
+    "combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="],
+
+    "commander": ["commander@14.0.3", "", {}, "sha512-H+y0Jo/T1RZ9qPP4Eh1pkcQcLRglraJaSLoyOtHxu6AapkjWVCy2Sit1QQ4x3Dng8qDlSsZEet7g5Pq06MvTgw=="],
+
+    "content-disposition": ["content-disposition@1.0.1", "", {}, "sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q=="],
+
+    "content-type": ["content-type@1.0.5", "", {}, "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA=="],
+
+    "cookie": ["cookie@0.7.2", "", {}, "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w=="],
+
+    "cookie-signature": ["cookie-signature@1.2.2", "", {}, "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg=="],
+
+    "cors": ["cors@2.8.6", "", { "dependencies": { "object-assign": "^4", "vary": "^1" } }, "sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw=="],
+
+    "cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="],
+
+    "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="],
+
+    "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="],
+
+    "depd": ["depd@2.0.0", "", {}, "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw=="],
+
+    "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="],
+
+    "ee-first": ["ee-first@1.1.1", "", {}, "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow=="],
+
+    "encodeurl": ["encodeurl@2.0.0", "", {}, "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg=="],
+
+    "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="],
+
+    "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="],
+
+    "es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="],
+
+    "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="],
+
+    "escape-html": ["escape-html@1.0.3", "", {}, "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow=="],
+
+    "etag": ["etag@1.8.1", "", {}, "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg=="],
+
+    "event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="],
+
+    "eventsource": ["eventsource@3.0.7", "", { "dependencies": { "eventsource-parser": "^3.0.1" } }, "sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA=="],
+
+    "eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="],
+
+    "express": ["express@5.2.1", "", { "dependencies": { "accepts": "^2.0.0", "body-parser": "^2.2.1", "content-disposition": "^1.0.0", "content-type": "^1.0.5", "cookie": "^0.7.1", "cookie-signature": "^1.2.1", "debug": "^4.4.0", "depd": "^2.0.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "finalhandler": "^2.1.0", "fresh": "^2.0.0", "http-errors": "^2.0.0", "merge-descriptors": "^2.0.0", "mime-types": "^3.0.0", "on-finished": "^2.4.1", "once": "^1.4.0", "parseurl": "^1.3.3", "proxy-addr": "^2.0.7", "qs": "^6.14.0", "range-parser": "^1.2.1", "router": "^2.2.0", "send": "^1.1.0", "serve-static": "^2.2.0", "statuses": "^2.0.1", "type-is": "^2.0.1", "vary": "^1.1.2" } }, "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw=="],
+
+    "express-rate-limit": ["express-rate-limit@8.3.2", "", { "dependencies": { "ip-address": "10.1.0" }, "peerDependencies": { "express": ">= 4.11" } }, "sha512-77VmFeJkO0/rvimEDuUC5H30oqUC4EyOhyGccfqoLebB0oiEYfM7nwPrsDsBL1gsTpwfzX8SFy2MT3TDyRq+bg=="],
+
+    "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="],
+
+    "fast-uri": ["fast-uri@3.1.0", "", {}, "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA=="],
+
+    "finalhandler": ["finalhandler@2.1.1", "", { "dependencies": { "debug": "^4.4.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "on-finished": "^2.4.1", "parseurl": "^1.3.3", "statuses": "^2.0.1" } }, "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA=="],
+
+    "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="],
+
+    "form-data-encoder": ["form-data-encoder@1.7.2", "", {}, "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="],
+
+    "formdata-node": ["formdata-node@4.4.1", "", { "dependencies": { "node-domexception": "1.0.0", "web-streams-polyfill": "4.0.0-beta.3" } }, "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ=="],
+
+    "forwarded": ["forwarded@0.2.0", "", {}, "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow=="],
+
+    "fresh": ["fresh@2.0.0", "", {}, "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A=="],
+
+    "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="],
+
+    "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="],
+
+    "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="],
+
+    "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="],
+
+    "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="],
+
+    "has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="],
+
+    "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="],
+
+    "hono": ["hono@4.12.9", "", {}, "sha512-wy3T8Zm2bsEvxKZM5w21VdHDDcwVS1yUFFY6i8UobSsKfFceT7TOwhbhfKsDyx7tYQlmRM5FLpIuYvNFyjctiA=="],
+
+    "http-errors": ["http-errors@2.0.1", "", { "dependencies": { "depd": "~2.0.0", "inherits": "~2.0.4", "setprototypeof": "~1.2.0", "statuses": "~2.0.2", "toidentifier": "~1.0.1" } }, "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ=="],
+
+    "humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="],
+
+    "iconv-lite": ["iconv-lite@0.7.2", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw=="],
+
+    "inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="],
+
+    "ip-address": ["ip-address@10.1.0", "", {}, "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q=="],
+
+    "ipaddr.js": ["ipaddr.js@1.9.1", "", {}, "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g=="],
+
+    "is-promise": ["is-promise@4.0.0", "", {}, "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ=="],
+
+    "isexe": ["isexe@2.0.0", "", {}, "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="],
+
+    "jose": ["jose@6.2.2", "", {}, "sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ=="],
+
+    "json-schema-to-ts": ["json-schema-to-ts@3.1.1", "", { "dependencies": { "@babel/runtime": "^7.18.3", "ts-algebra": "^2.0.0" } }, "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g=="],
+
+    "json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="],
+
+    "json-schema-typed": ["json-schema-typed@8.0.2", "", {}, "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA=="],
+
+    "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="],
+
+    "media-typer": ["media-typer@1.1.0", "", {}, "sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw=="],
+
+    "merge-descriptors": ["merge-descriptors@2.0.0", "", {}, "sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g=="],
+
+    "mime-db": ["mime-db@1.54.0", "", {}, "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ=="],
+
+    "mime-types": ["mime-types@3.0.2", "", { "dependencies": { "mime-db": "^1.54.0" } }, "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A=="],
+
+    "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="],
+
+    "negotiator": ["negotiator@1.0.0", "", {}, "sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg=="],
+
+    "node-domexception": ["node-domexception@1.0.0", "", {}, "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ=="],
+
+    "node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="],
+
+    "object-assign": ["object-assign@4.1.1", "", {}, "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg=="],
+
+    "object-inspect": ["object-inspect@1.13.4", "", {}, "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew=="],
+
+    "on-finished": ["on-finished@2.4.1", "", { "dependencies": { "ee-first": "1.1.1" } }, "sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg=="],
+
+    "once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="],
+
+    "openai": ["openai@6.34.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-yEr2jdGf4tVFYG6ohmr3pF6VJuveP0EA/sS8TBx+4Eq5NT10alu5zg2dmxMXMgqpihRDQlFGpRt2XwsGj+Fyxw=="],
+
+    "parseurl": ["parseurl@1.3.3", "", {}, "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="],
+
+    "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="],
+
+    "path-to-regexp": ["path-to-regexp@8.4.1", "", {}, "sha512-fvU78fIjZ+SBM9YwCknCvKOUKkLVqtWDVctl0s7xIqfmfb38t2TT4ZU2gHm+Z8xGwgW+QWEU3oQSAzIbo89Ggw=="],
+
+    "pkce-challenge": ["pkce-challenge@5.0.1", "", {}, "sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ=="],
+
+    "proxy-addr": ["proxy-addr@2.0.7", "", { "dependencies": { "forwarded": "0.2.0", "ipaddr.js": "1.9.1" } }, "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg=="],
+
+    "qs": ["qs@6.15.0", "", { "dependencies": { "side-channel": "^1.1.0" } }, "sha512-mAZTtNCeetKMH+pSjrb76NAM8V9a05I9aBZOHztWy/UqcJdQYNsf59vrRKWnojAT9Y+GbIvoTBC++CPHqpDBhQ=="],
+
+    "range-parser": ["range-parser@1.2.1", "", {}, "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg=="],
+
+    "raw-body": ["raw-body@3.0.2", "", { "dependencies": { "bytes": "~3.1.2", "http-errors": "~2.0.1", "iconv-lite": "~0.7.0", "unpipe": "~1.0.0" } }, "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA=="],
+
+    "require-from-string": ["require-from-string@2.0.2", "", {}, "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw=="],
+
+    "router": ["router@2.2.0", "", { "dependencies": { "debug": "^4.4.0", "depd": "^2.0.0", "is-promise": "^4.0.0", "parseurl": "^1.3.3", "path-to-regexp": "^8.0.0" } }, "sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ=="],
+
+    "safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="],
+
+    "send": ["send@1.2.1", "", { "dependencies": { "debug": "^4.4.3", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "fresh": "^2.0.0", "http-errors": "^2.0.1", "mime-types": "^3.0.2", "ms": "^2.1.3", "on-finished": "^2.4.1", "range-parser": "^1.2.1", "statuses": "^2.0.2" } }, "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ=="],
+
+    "serve-static": ["serve-static@2.2.1", "", { "dependencies": { "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "parseurl": "^1.3.3", "send": "^1.2.0" } }, "sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw=="],
+
+    "setprototypeof": ["setprototypeof@1.2.0", "", {}, "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw=="],
+
+    "shebang-command": ["shebang-command@2.0.0", "", { "dependencies": { "shebang-regex": "^3.0.0" } }, "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA=="],
+
+    "shebang-regex": ["shebang-regex@3.0.0", "", {}, "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A=="],
+
+    "side-channel": ["side-channel@1.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3", "side-channel-list": "^1.0.0", "side-channel-map": "^1.0.1", "side-channel-weakmap": "^1.0.2" } }, "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw=="],
+
+    "side-channel-list": ["side-channel-list@1.0.0", "", { "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3" } }, "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA=="],
+
+    "side-channel-map": ["side-channel-map@1.0.1", "", { "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", "get-intrinsic": "^1.2.5", "object-inspect": "^1.13.3" } }, "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA=="],
+
+    "side-channel-weakmap": ["side-channel-weakmap@1.0.2", "", { "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", "get-intrinsic": "^1.2.5", "object-inspect": "^1.13.3", "side-channel-map": "^1.0.1" } }, "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A=="],
+
+    "statuses": ["statuses@2.0.2", "", {}, "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw=="],
+
+    "toidentifier": ["toidentifier@1.0.1", "", {}, "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA=="],
+
+    "tr46": ["tr46@0.0.3", "", {}, "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="],
+
+    "ts-algebra": ["ts-algebra@2.0.0", "", {}, "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw=="],
+
+    "type-is": ["type-is@2.0.1", "", { "dependencies": { "content-type": "^1.0.5", "media-typer": "^1.1.0", "mime-types": "^3.0.0" } }, "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw=="],
+
+    "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
+
+    "undici-types": ["undici-types@5.26.5", "", {}, "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="],
+
+    "unpipe": ["unpipe@1.0.0", "", {}, "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ=="],
+
+    "vary": ["vary@1.1.2", "", {}, "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg=="],
+
+    "web-streams-polyfill": ["web-streams-polyfill@4.0.0-beta.3", "", {}, "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug=="],
+
+    "webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="],
+
+    "whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", "webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="],
+
+    "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
+
+    "wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="],
+
+    "yaml": ["yaml@2.8.3", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-AvbaCLOO2Otw/lW5bmh9d/WEdcDFdQp2Z2ZUH3pX9U2ihyUY0nvLv7J6TrWowklRGPYbB/IuIMfYgxaCPg5Bpg=="],
+
+    "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="],
+
+    "zod-to-json-schema": ["zod-to-json-schema@3.25.2", "", { "peerDependencies": { "zod": "^3.25.28 || ^4" } }, "sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA=="],
+
+    "@anthropic-ai/claude-agent-sdk/@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.74.0", "", { "dependencies": { "json-schema-to-ts": "^3.1.1" }, "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" }, "optionalPeers": ["zod"], "bin": { "anthropic-ai-sdk": "bin/cli" } }, "sha512-srbJV7JKsc5cQ6eVuFzjZO7UR3xEPJqPamHFIe29bs38Ij2IripoAhC0S5NslNbaFUYqBKypmmpzMTpqfHEUDw=="],
+
+    "@types/node-fetch/@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="],
+
+    "bun-types/@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="],
+
+    "form-data/mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="],
+
+    "@types/node-fetch/@types/node/undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
+
+    "bun-types/@types/node/undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
+
+    "form-data/mime-types/mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="],
+  }
+}
diff --git a/ai_evals/cases/app.yaml b/ai_evals/cases/app.yaml
new file mode 100644
index 0000000000..4eed8b443e
--- /dev/null
+++ b/ai_evals/cases/app.yaml
@@ -0,0 +1,93 @@
+- id: app-test1-counter-create
+  prompt: |-
+    Create a simple counter app with increment and decrement buttons.
+  judgeChecklist:
+    - shows the current count in the UI
+    - includes an increment button
+    - includes a decrement button
+    - clicking the buttons updates the count correctly
+
+- id: app-test2-counter-reset
+  prompt: |-
+    Add a reset button that sets the counter back to 0
+  initial: ai_evals/fixtures/frontend/app/initial/test1_counter_app
+  judgeChecklist:
+    - adds a reset control to the existing counter app
+    - clicking reset sets the count back to 0
+    - keeps the existing increment and decrement behavior working
+
+- id: app-test3-shopping-cart-quantity
+  prompt: |-
+    Add a quantity selector (+ and - buttons) to each cart item so users can adjust quantities without removing and re-adding items
+  initial: ai_evals/fixtures/frontend/app/initial/shopping_cart
+  judgeChecklist:
+    - each cart item has visible plus and minus quantity controls
+    - users can increase quantity without re-adding the product
+    - users can decrease quantity from the cart UI
+    - cart totals stay in sync with quantity changes
+
+- id: app-test4-shopping-cart-discount
+  prompt: |-
+    Add a discount code input field in the cart.
+    When the code "SAVE10" is entered, apply a 10% discount to the total
+  initial: ai_evals/fixtures/frontend/app/initial/shopping_cart
+  judgeChecklist:
+    - adds a discount code input to the cart
+    - recognizes the code SAVE10
+    - applies a 10 percent discount to the displayed total
+    - keeps the rest of the cart behavior intact
+
+- id: app-test5-file-manager-search
+  prompt: |-
+    Add a search bar in the toolbar that filters files and folders by name as the user types
+  initial: ai_evals/fixtures/frontend/app/initial/file_manager
+  judgeChecklist:
+    - adds a search input in the toolbar
+    - filters files and folders by name as the user types
+    - updates the visible file list from the search query
+    - keeps the rest of the file manager usable
+
+- id: app-test6-file-manager-inline-rename
+  prompt: |-
+    Let users rename files and folders directly from the file list without leaving the page.
+  initial: ai_evals/fixtures/frontend/app/initial/file_manager
+  judgeChecklist:
+    - adds a visible rename action or inline edit mode in the file list
+    - lets users edit an item's name directly from the list
+    - saves the renamed item through the app's existing rename behavior
+    - refreshes the displayed name after a successful rename
+
+- id: app-test7-file-manager-select-all
+  prompt: |-
+    Add a "Select All" checkbox in the file list header and individual checkboxes for each file.
+    Add a "Delete Selected" button that appears when items are selected
+  initial: ai_evals/fixtures/frontend/app/initial/file_manager
+  judgeChecklist:
+    - adds a select-all control in the file list header
+    - adds per-item selection controls
+    - shows a delete-selected action only when there is a selection
+    - deleting selected items updates the visible list
+
+- id: app-test8-inventory-tracker-create
+  prompt: |-
+    Create an inventory tracker app for a small store.
+    Users should be able to add items with a name, sku, quantity, and price, search items by name or sku, and delete items.
+    The inventory should persist between sessions.
+  judgeChecklist:
+    - includes a form to add inventory items with name, sku, quantity, and price
+    - shows a list or table of saved inventory items
+    - supports searching or filtering by name or sku
+    - lets users delete existing inventory items
+    - persists the inventory data appropriately for a raw Windmill app
+
+- id: app-test9-recipe-book-create
+  prompt: |-
+    Create a recipe book app where users can add recipes with a name, ingredients list, and instructions.
+    Include a search bar to filter recipes by name and the ability to delete recipes.
+    Recipes should persist between sessions.
+  judgeChecklist:
+    - includes a form to add recipes with name, ingredients, and instructions
+    - shows saved recipes in the app
+    - supports searching recipes by name
+    - lets users delete recipes
+    - persists recipes appropriately for a raw Windmill app
diff --git a/ai_evals/cases/cli.yaml b/ai_evals/cases/cli.yaml
new file mode 100644
index 0000000000..735976781c
--- /dev/null
+++ b/ai_evals/cases/cli.yaml
@@ -0,0 +1,66 @@
+- id: bun-hello-script
+  prompt: |-
+    Create a Windmill Bun script at `f/evals/hello.ts`.
+    It should take a `name` input and return a greeting object like `{ greeting: "Hello, Alice!" }`.
+  expected: ai_evals/fixtures/cli/expected/bun-hello-script
+  judgeChecklist:
+    - creates the requested Bun script at f/evals/hello.ts
+    - takes a name input
+    - returns an object containing the greeting
+
+- id: bun-hello-flow
+  prompt: |-
+    Create a Windmill flow at `f/evals/hello__flow`.
+    It should take a `name` input and return a greeting object like `{ greeting: "Hello, Alice!" }`.
+    Put the step code in `hello.ts`.
+  expected: ai_evals/fixtures/cli/expected/bun-hello-flow
+  judgeChecklist:
+    - creates the requested flow folder with flow.yaml and hello.ts
+    - wires the name input into the flow step
+    - returns the greeting object
+
+- id: python-add-numbers-script
+  prompt: |-
+    Add a Windmill Python script at `f/evals/add_numbers.py`.
+    It should take `a` and `b` as inputs and return `{ "total": a + b }`.
+  expected: ai_evals/fixtures/cli/expected/python-add-numbers-script
+  judgeChecklist:
+    - creates the requested Python script at f/evals/add_numbers.py
+    - takes `a` and `b` as inputs
+    - returns an object with total equal to a plus b
+
+- id: bun-hello-script-uppercase
+  prompt: |-
+    Update `f/evals/hello.ts` so it accepts an optional `uppercase` boolean.
+    Keep returning `{ greeting: ... }`, but when `uppercase` is true the greeting should be uppercased before returning it.
+  initial: ai_evals/fixtures/cli/initial/bun-hello-script-uppercase
+  expected: ai_evals/fixtures/cli/expected/bun-hello-script-uppercase
+  judgeChecklist:
+    - updates the existing hello.ts file rather than creating a new script
+    - accepts an optional uppercase boolean input
+    - keeps returning an object with greeting
+    - uppercases the greeting when uppercase is true
+
+- id: bun-hello-flow-punctuation
+  prompt: |-
+    Update the existing flow in `f/evals/hello__flow` so it also accepts an optional `punctuation` input.
+    The greeting should use that punctuation and default to `!` when it is missing.
+  initial: ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation
+  expected: ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation
+  judgeChecklist:
+    - updates the existing hello flow instead of creating a new one
+    - adds an optional punctuation input to the flow
+    - updates the step code so the returned greeting uses punctuation
+    - defaults punctuation to an exclamation mark when omitted
+
+- id: flow-reuse-existing-script
+  prompt: |-
+    There is already a reusable greeting script at `f/lib/format_greeting.ts`.
+    Create a flow at `f/evals/reuse_greeting__flow` that takes a `name` input and reuses that existing script instead of duplicating the logic inline.
+  initial: ai_evals/fixtures/cli/initial/flow-reuse-existing-script
+  expected: ai_evals/fixtures/cli/expected/flow-reuse-existing-script
+  judgeChecklist:
+    - creates the requested flow at f/evals/reuse_greeting__flow
+    - reuses the existing script from f/lib by path
+    - does not duplicate the greeting logic in a new inline script
+    - wires the name input into the reused script
diff --git a/ai_evals/cases/flow.yaml b/ai_evals/cases/flow.yaml
new file mode 100644
index 0000000000..cdb53c696a
--- /dev/null
+++ b/ai_evals/cases/flow.yaml
@@ -0,0 +1,246 @@
+- id: flow-test0-sum-two-numbers
+  prompt: |-
+    Create a flow that takes two numbers, `a` and `b`, and returns their sum.
+    Keep it simple and use a single step named `sum_numbers`.
+  expected: ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json
+  judgeChecklist:
+    - "the flow takes `a` and `b` as inputs"
+    - "the main step is named `sum_numbers`"
+    - the flow returns the sum of the two numbers
+
+- id: flow-test1-reuse-existing-script
+  prompt: |-
+    I need a flow that adds two numbers.
+    If there is already a script in the workspace that does that, reuse it instead of rewriting the logic.
+    The flow should take `a` and `b` as inputs and use a single step named `sum_numbers`.
+  initial: ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json
+  expected: ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json
+  judgeChecklist:
+    - "the flow takes `a` and `b` as inputs"
+    - "the main step is named `sum_numbers`"
+    - the flow reuses the existing workspace script instead of rewriting the addition logic
+
+- id: flow-test2-call-existing-subflow
+  prompt: |-
+    Create a parent flow that adds two numbers by reusing an existing flow in the workspace if one already exists.
+    The parent flow should take `a` and `b` as inputs and delegate the calculation instead of inlining it.
+    Use a single step named `call_add_numbers`.
+  initial: ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json
+  expected: ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json
+  judgeChecklist:
+    - "the parent flow takes `a` and `b` as inputs"
+    - "the main step is named `call_add_numbers`"
+    - the parent flow delegates to an existing workspace subflow instead of inlining the addition logic
+
+- id: flow-test3-branchone-routing
+  prompt: |-
+    Create a flow that routes incoming support requests based on the customer's tier.
+    The input should contain a string field named `tier`.
+    Free, pro, and enterprise requests should go to different queues, and unknown tiers should fall back to a default queue.
+    Name the main routing step `route_by_tier`.
+  expected: ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json
+  judgeChecklist:
+    - "the input schema includes a string field named `tier`"
+    - "the main routing step is named `route_by_tier`"
+    - free requests go to a free queue
+    - pro requests go to a pro queue
+    - enterprise requests go to an enterprise queue
+    - unknown tiers fall back to a default queue
+
+- id: flow-test4-order-processing-loop
+  prompt: |-
+    Build an order-processing flow.
+
+    The input should include an order with:
+    - an `items` array containing `name`, `price`, and `quantity`
+    - `customer_email`
+    - `shipping_address`
+
+    The flow should:
+    - validate that every item has a positive price and quantity
+    - calculate the order total with 8% tax
+    - check inventory for each item using placeholder availability data
+    - create a shipment if everything is in stock, otherwise create a backorder
+    - send a confirmation using placeholder email logic
+    - return a final order summary with the status
+  validate:
+    schemaAnyOf:
+      - requiredPaths:
+          - order
+          - order.items
+          - order.customer_email
+          - order.shipping_address
+      - requiredPaths:
+          - items
+          - customer_email
+          - shipping_address
+    resolveResultsRefs: true
+  judgeChecklist:
+    - the flow validates that every item has a positive price and quantity
+    - the flow calculates the order total with 8% tax
+    - the flow checks inventory for each item using placeholder availability data
+    - the flow creates a shipment if everything is in stock, otherwise a backorder
+    - the flow sends a confirmation using placeholder email logic
+    - the flow returns a final order summary with the resulting status
+
+- id: flow-test5-parallel-data-pipeline
+  prompt: |-
+    Create a data-processing flow for three external data sources.
+
+    It should:
+    - load a small placeholder configuration listing the three sources
+    - fetch placeholder records from each source
+    - clean and validate each source's records
+    - combine everything into one dataset
+    - compute an overall quality score
+    - store the result differently depending on the score:
+      - 90 or above goes to the primary database
+      - 70 to 89 goes to a secondary database with a warning
+      - below 70 goes to quarantine and triggers an alert
+    - return a processing report with total records, quality score, and destination
+  judgeChecklist:
+    - the flow loads a placeholder configuration listing three external sources
+    - the flow fetches placeholder records from each source
+    - the flow cleans and validates each source's records
+    - the flow combines everything into one dataset
+    - the flow computes an overall quality score
+    - scores of 90 or above go to the primary database
+    - scores from 70 to 89 go to a secondary database with a warning
+    - scores below 70 go to quarantine and trigger an alert
+    - the final report includes total records, quality score, and destination
+
+- id: flow-test6-ai-agent-tools
+  prompt: |-
+    Create a customer support flow.
+
+    The input should include `customer_id` and `query_text`.
+    The flow should load the customer's profile and order history, then use an AI assistant to help with the request.
+    The assistant should be able to:
+    - look up orders
+    - check refund eligibility
+    - search FAQs
+    - open a support ticket when needed
+
+    After that, log the interaction and return the assistant's response along with any actions it took.
+  judgeChecklist:
+    - "the input schema includes `customer_id` and `query_text`"
+    - the flow loads the customer's profile and order history
+    - the flow uses an AI assistant step
+    - the assistant can look up orders
+    - the assistant can check refund eligibility
+    - the assistant can search FAQs
+    - the assistant can open a support ticket
+    - the flow logs the interaction
+    - the final output returns the assistant response along with any actions taken or resulting support action details
+
+- id: flow-test7-simple-modification
+  prompt: |-
+    Update this flow so it validates processed data before saving it.
+
+    After `process_data`, add a `validate_data` step that checks the data array is not empty.
+    If the array is empty, it should return an error object with the message `No data to save`.
+    If validation passes, let the save continue normally.
+    Update `save_results` so it handles the validation result correctly.
+  initial: ai_evals/fixtures/frontend/flow/initial/test5_initial.json
+  expected: ai_evals/fixtures/frontend/flow/expected/test5_modify_simple.json
+  judgeChecklist:
+    - the updated flow keeps the original fetch and process steps intact
+    - "a `validate_data` step is added after `process_data`"
+    - "`validate_data` checks that the processed data array is not empty"
+    - "empty data returns an error object with the message `No data to save`"
+    - "`save_results` handles the validation result correctly"
+
+- id: flow-test8-branching-in-loop
+  prompt: |-
+    Update the order-processing logic inside `loop_orders` so different order types are handled differently.
+
+    For `express`, mark the order as priority and use a shipping cost of $15.99.
+    For `standard`, use a shipping cost of $5.99.
+    For `pickup`, mark it as no shipping required with a cost of $0.
+    Keep the existing processing as a fallback for unknown order types.
+    Each path should return the orderId, shipping cost, and shipping type.
+  initial: ai_evals/fixtures/frontend/flow/initial/test6_initial.json
+  judgeChecklist:
+    - "the existing `loop_orders` flow still handles per-order processing"
+    - exact branching topology is not required as long as `loop_orders` handles the order types correctly
+    - express orders are marked as priority and use a shipping cost of 15.99
+    - standard orders use a shipping cost of 5.99
+    - pickup orders use a shipping cost of 0 and are treated as no shipping required
+    - unknown order types still follow a fallback path
+    - "each processed order returns `orderId`, `shippingCost`, and `shippingType`"
+
+- id: flow-test9-parallel-refactor
+  prompt: |-
+    Refactor this flow so the enrichment work no longer runs one step at a time.
+
+    `enrich_price`, `enrich_inventory`, and `enrich_reviews` should run independently.
+    Each one should return a fallback value if it fails.
+    Update `combine_data` so it merges the enrichment results and sets a `hasFallbacks` flag when any fallback was used.
+    Keep `get_item` as the first step and `return_result` as the last step.
+  initial: ai_evals/fixtures/frontend/flow/initial/test7_initial.json
+  expected: ai_evals/fixtures/frontend/flow/expected/test7_modify_complex.json
+  judgeChecklist:
+    - "the updated flow keeps `get_item` as the first step"
+    - "the updated flow keeps `return_result` as the last step"
+    - "`enrich_price`, `enrich_inventory`, and `enrich_reviews` run independently rather than sequentially"
+    - each enrichment path returns a fallback value if it fails
+    - "`combine_data` merges the enrichment results"
+    - "`combine_data` sets `hasFallbacks` when any fallback was used"
+
+- id: flow-test10-while-loop-counter
+  prompt: |-
+    Create a flow that keeps incrementing a counter until it reaches a target value.
+    The input should include a number field named `target`.
+    Name the looping step `count_until_target`.
+    Once the target is reached, return the final counter value.
+  expected: ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json
+  judgeChecklist:
+    - "the input schema includes a number field named `target`"
+    - "the looping step is named `count_until_target`"
+    - the flow keeps incrementing a counter until the target is reached
+    - the final output returns the final counter value
+
+- id: flow-test11-preprocessor-and-failure-handler
+  prompt: |-
+    Create an event-processing flow for a string payload.
+
+    Before the main processing runs, trim the payload and reject empty strings.
+    The main step should be named `process_event` and return a simple success object.
+    If anything fails, return a compact error object with the error message and the failing step id.
+  expected: ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json
+  validate:
+    requireSpecialModules:
+      - preprocessor_module
+      - failure_module
+  judgeChecklist:
+    - the flow trims the payload before the main processing runs
+    - the flow rejects empty payload strings
+    - "the main step is named `process_event`"
+    - "`process_event` returns a simple success object"
+    - failures return a compact error object with the error message and failing step id
+
+- id: flow-test12-approval-step
+  prompt: |-
+    Create a purchase approval flow.
+
+    The input should include `requester_email` and `amount`.
+    Add an approval step named `request_approval` that pauses the flow and asks the approver for a comment.
+    One approval should be enough to continue.
+    After approval, add a final step named `finalize_purchase` that returns an approved status object.
+  expected: ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json
+  validate:
+    schemaRequiredPaths:
+      - requester_email
+      - amount
+    requireSuspendSteps:
+      - id: request_approval
+        requiredEvents: 1
+        resumeRequiredStringFieldAnyOf:
+          - comment
+          - approver_comment
+  judgeChecklist:
+    - "the flow includes an approval step named `request_approval`"
+    - "`request_approval` pauses the flow and asks the approver for a comment"
+    - one approval is enough to continue
+    - "the flow includes a final step named `finalize_purchase`"
+    - "`finalize_purchase` returns an approved status object after approval"
diff --git a/ai_evals/cases/script.yaml b/ai_evals/cases/script.yaml
new file mode 100644
index 0000000000..6fc20199e3
--- /dev/null
+++ b/ai_evals/cases/script.yaml
@@ -0,0 +1,11 @@
+- id: script-test1-greet-user
+  prompt: |-
+    Update the current Bun script so it takes the existing `name` input and returns a plain greeting string like `Hello, Alice!`.
+    Do not wrap the result in an object or array.
+    Keep it simple and do not add external dependencies.
+  initial: ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json
+  expected: ai_evals/fixtures/frontend/script/expected/test1_greet_user.json
+  judgeChecklist:
+    - uses the existing `name` input
+    - returns a plain greeting string
+    - does not wrap the result in an object or array
diff --git a/ai_evals/cli/index.ts b/ai_evals/cli/index.ts
new file mode 100644
index 0000000000..d64a547f14
--- /dev/null
+++ b/ai_evals/cli/index.ts
@@ -0,0 +1,295 @@
+#!/usr/bin/env bun
+
+import { Command, InvalidArgumentError } from "commander";
+import { loadCases, loadSelectedCases } from "../core/cases";
+import {
+  EVAL_MODELS,
+  type EvalModelSpec,
+  formatRunModelLabel,
+  getCliEvalModel,
+  getEvalModelHelpText,
+  resolveEvalModel,
+} from "../core/models";
+import {
+  appendHistoryRecord,
+  buildRunResult,
+  formatRunSummary,
+  resolveRunOutputPath,
+  writeRunArtifacts,
+  writeRunResult,
+} from "../core/results";
+import { runSuite } from "../core/runSuite";
+import { EVAL_MODES, type EvalMode } from "../core/types";
+import { DEFAULT_JUDGE_MODEL } from "../core/judge";
+import { createCliModeRunner } from "../modes/cli";
+import { runFrontendBenchmarkAdapter } from "../adapters/frontend/runtime";
+
+/** Defines the `models`, `cases` and `run` subcommands, then parses `process.argv`. */
+async function main(): Promise<void> {
+  const program = new Command()
+    .name("bun run cli --")
+    .description("Run AI eval cases against the current production prompts and guidance")
+    .showHelpAfterError()
+    .showSuggestionAfterError()
+    .addHelpText(
+      "after",
+      [
+        "",
+        "Examples:",
+        "  bun run cli -- models",
+        "  bun run cli -- cases",
+        "  bun run cli -- cases flow",
+        "  bun run cli -- run flow",
+        "  bun run cli -- run flow --model 4o",
+        "  bun run cli -- run flow --models haiku,opus,4o",
+        "  bun run cli -- run flow flow-test0-sum-two-numbers --verbose",
+        "  bun run cli -- run flow --record",
+        // Example case ids must match ids defined in ai_evals/cases/<mode>.yaml.
+        "  bun run cli -- run flow flow-test7-simple-modification --runs 3",
+        "  bun run cli -- run cli bun-hello-script",
+        "",
+        "Models:",
+        getEvalModelHelpText(),
+      ].join("\n")
+    );
+
+  program
+    .command("models")
+    .description("List available model aliases")
+    .action(() => handleModels());
+
+  program
+    .command("cases")
+    .description("List available cases")
+    .argument("[mode]", "cli, flow, script, or app", parseOptionalMode)
+    .action(async (mode?: EvalMode) => {
+      await handleCases(mode);
+    });
+
+  program
+    .command("run")
+    .description("Run one benchmark mode")
+    .argument("<mode>", "cli, flow, script, or app", parseMode)
+    .argument("[caseIds...]", "specific case ids to run")
+    .option("--runs <n>", "number of attempts per case", parsePositiveInteger, 1)
+    .option("--output <path>", "write the result JSON to this path")
+    .option("--model <name>", `model alias (${EVAL_MODELS.map((entry) => entry.id).join(", ")})`)
+    .option("--models <names>", "comma-separated model aliases to run sequentially")
+    .option("--verbose", "stream assistant output during frontend runs")
+    .option("--record", "append a compact summary line to ai_evals/history/<mode>.jsonl")
+    .action(
+      async (
+        mode: EvalMode,
+        caseIds: string[],
+        options: {
+          runs: number;
+          output?: string;
+          model?: string;
+          models?: string;
+          verbose?: boolean;
+          record?: boolean;
+        }
+      ) => {
+        await handleRun({
+          mode,
+          caseIds,
+          runs: options.runs,
+          outputPath: options.output,
+          model: options.model,
+          models: options.models,
+          verbose: options.verbose ?? false,
+          record: options.record ?? false,
+        });
+      }
+    );
+
+  await program.parseAsync(process.argv);
+}
+
+/** Prints the case ids of `mode`, or of every mode in EVAL_MODES when it is omitted. */
+async function handleCases(mode?: EvalMode) {
+  const targetModes = mode ? [mode] : [...EVAL_MODES];
+  for (const currentMode of targetModes) {
+    const loaded = await loadCases(currentMode);
+    process.stdout.write(`${currentMode} (${loaded.length})\n`);
+    for (const { id } of loaded) {
+      process.stdout.write(`- ${id}\n`);
+    }
+    process.stdout.write("\n");
+  }
+}
+
+/** Prints each model's id, label, aliases and supported modes, then the judge model. */
+function handleModels() {
+  process.stdout.write("Available models\n");
+  for (const spec of EVAL_MODELS) {
+    const frontendModes = spec.frontend ? ["flow", "script", "app"] : [];
+    const modeNames = frontendModes.concat(spec.cli ? ["cli"] : []);
+    // The canonical id is listed first and filtered out of the remaining aliases.
+    const names = [spec.id, ...spec.aliases.filter((alias) => alias !== spec.id)];
+    process.stdout.write(`- ${spec.id}: ${spec.label}\n`);
+    process.stdout.write(`  aliases: ${names.join(", ")}\n`);
+    process.stdout.write(`  modes: ${modeNames.join(", ")}\n`);
+  }
+  process.stdout.write(`\nJudge model: ${DEFAULT_JUDGE_MODEL}\n`);
+}
+
+/**
+ * Runs the `mode` benchmark once per requested model, persisting each result (plus
+ * artifacts and, with `record`, a history line) and printing a final model summary.
+ */
+async function handleRun(input: {
+  mode: EvalMode;
+  caseIds: string[];
+  runs: number;
+  outputPath?: string;
+  model?: string;
+  models?: string;
+  verbose: boolean;
+  record: boolean;
+}) {
+  if (input.record && input.caseIds.length > 0) {
+    throw new Error("--record only supports full-suite runs; omit case ids to record history");
+  }
+  if (input.model && input.models) {
+    throw new Error("Use either --model or --models, not both");
+  }
+
+  // Cases are loaded before any model alias is resolved, so an unknown case id fails first.
+  const selectedCases = await loadSelectedCases(input.mode, input.caseIds);
+  const requested = resolveRequestedModels(input.mode, input.model, input.models);
+  if (input.outputPath && requested.length > 1) {
+    throw new Error("--output only supports a single model run");
+  }
+
+  const perModel: Array<{ label: string; passRate: number; averageDurationMs: number }> = [];
+
+  for (const [position, spec] of requested.entries()) {
+    const runModel = formatRunModelLabel(input.mode, spec);
+    if (requested.length > 1) {
+      const separator = position > 0 ? "\n" : "";
+      process.stdout.write(`${separator}=== ${input.mode} ${spec.id} (${runModel}) ===\n`);
+    }
+    process.stderr.write(`Starting ${input.mode} benchmark...\n`);
+
+    const result =
+      input.mode === "cli"
+        ? await runCliBenchmark(selectedCases, input.runs, getCliEvalModel(spec), runModel)
+        : await runFrontendBenchmarkAdapter({
+            mode: input.mode,
+            caseIds: input.caseIds,
+            runs: input.runs,
+            model: spec.id,
+            verbose: input.verbose,
+          });
+
+    const resolvedOutputPath =
+      requested.length === 1
+        ? resolveRunOutputPath(input.mode, input.outputPath)
+        : resolveRunOutputPath(input.mode);
+    const artifactsPath = await writeRunArtifacts(result, resolvedOutputPath);
+    const resultPath = await writeRunResult(result, resolvedOutputPath);
+    const historyPath = input.record ? await appendHistoryRecord(result) : null;
+    process.stdout.write(`${formatRunSummary(result)}\n`);
+    process.stdout.write(`Saved: ${resultPath}\n`);
+    if (artifactsPath) {
+      process.stdout.write(`Artifacts: ${artifactsPath}\n`);
+    }
+    if (historyPath) {
+      process.stdout.write(`Recorded: ${historyPath}\n`);
+    }
+
+    const { passRate, averageDurationMs } = result;
+    perModel.push({ label: `${spec.id} (${runModel})`, passRate, averageDurationMs });
+  }
+
+  if (perModel.length > 1) {
+    process.stdout.write("\nModel summary\n");
+    for (const { label, passRate, averageDurationMs } of perModel) {
+      const duration = Math.round(averageDurationMs);
+      process.stdout.write(`- ${label}: ${formatPercent(passRate)} | ${duration}ms\n`);
+    }
+  }
+}
+
+/**
+ * Runs `cases` through the CLI mode runner and wraps the outcome in a "cli" run result.
+ *
+ * @param cases - Cases to execute, as returned by `loadSelectedCases`.
+ * @param runs - Attempts per case; passed to both `runSuite` and `buildRunResult`.
+ * @param model - CLI model value handed to `createCliModeRunner`.
+ * @param runModel - Run-model label passed to both `runSuite` and `buildRunResult`.
+ * @returns Whatever `buildRunResult` produces for mode "cli".
+ */
+async function runCliBenchmark(
+  cases: Awaited<ReturnType<typeof loadSelectedCases>>,
+  runs: number,
+  model: ReturnType<typeof getCliEvalModel>,
+  runModel: string
+) {
+  const judgeModel = DEFAULT_JUDGE_MODEL;
+  const modeRunner = createCliModeRunner(model);
+  const caseResults = await runSuite({ modeRunner, cases, runs, runModel, judgeModel });
+
+  // judgeModel is always DEFAULT_JUDGE_MODEL here; it is not configurable per run.
+  return buildRunResult({ mode: "cli", runs, runModel, judgeModel, caseResults });
+}
+
+/** Commander argument parser: returns `value` as an EvalMode or rejects it. */
+function parseMode(value: string): EvalMode {
+  const known = EVAL_MODES.includes(value as EvalMode);
+  if (known) return value as EvalMode;
+  throw new InvalidArgumentError(`mode must be one of: ${EVAL_MODES.join(", ")}`);
+}
+
+function parseOptionalMode(value: string | undefined): EvalMode | undefined {
+  return !value ? undefined : parseMode(value); // absent or empty argument: no mode filter
+}
+
+/** Commander option parser: converts `value` with Number() and requires an integer > 0. */
+function parsePositiveInteger(value: string): number {
+  const count = Number(value);
+  const valid = Number.isInteger(count) && count > 0;
+  if (!valid) throw new InvalidArgumentError("must be a positive integer");
+  return count;
+}
+
+/**
+ * Resolves the --model / --models selection into a de-duplicated list of model specs.
+ * @param mode - Benchmark mode forwarded to `resolveEvalModel`.
+ * @param singleModel - Value of --model; only consulted when --models is absent.
+ * @param multipleModels - Comma-separated value of --models.
+ * @returns One spec per distinct model id, in first-seen order.
+ * @throws Error when --models is present but names no alias.
+ */
+function resolveRequestedModels(
+  mode: EvalMode,
+  singleModel?: string,
+  multipleModels?: string
+): EvalModelSpec[] {
+  // Only an absent --models falls back; an explicit empty value is rejected below.
+  if (multipleModels === undefined) {
+    return [resolveEvalModel(mode, singleModel)];
+  }
+  const aliases = multipleModels.split(",").map((value) => value.trim()).filter(Boolean);
+  if (aliases.length === 0) {
+    throw new Error("--models requires at least one model alias");
+  }
+  // A Map preserves insertion order, so the first alias of each model id wins.
+  const byId = new Map<string, EvalModelSpec>();
+  for (const alias of aliases) {
+    const model = resolveEvalModel(mode, alias);
+    if (!byId.has(model.id)) byId.set(model.id, model);
+  }
+  return Array.from(byId.values());
+}
+
+function formatPercent(value: number): string {
+  return (value * 100).toFixed(1) + "%"; // fraction -> percentage with one decimal place
+}
+
+// Entry point: print only the failure message (no stack trace) and exit with status 1.
+void main().catch((error) => {
+  process.stderr.write(`${error instanceof Error ? error.message : String(error)}\n`);
+  process.exit(1);
+});
diff --git a/ai_evals/core/cases.ts b/ai_evals/core/cases.ts
new file mode 100644
index 0000000000..69f1e8a890
--- /dev/null
+++ b/ai_evals/core/cases.ts
@@ -0,0 +1,71 @@
+import { readFile } from "node:fs/promises";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { parse } from "yaml";
+import type { EvalCase, EvalMode, FlowValidationSpec } from "./types";
+
+const REPO_ROOT = fileURLToPath(new URL("../../", import.meta.url)); // two levels above this file's dir
+const CASES_DIR = path.join(REPO_ROOT, "ai_evals", "cases"); // loadCases reads <mode>.yaml from here
+
+interface RawEvalCase { // mirrors the per-entry fields loadCases reads from the YAML case files
+  id: string;
+  prompt: string;
+  initial?: string; // fixture path; loadCases exposes it as `initialPath`
+  expected?: string; // fixture path; loadCases exposes it as `expectedPath`
+  validate?: FlowValidationSpec;
+  judgeChecklist?: string[];
+}
+
+export function getRepoRoot(): string {
+  return REPO_ROOT; // computed once at module load from this file's URL
+}
+
+export function getAiEvalsRoot(): string {
+  return path.join(REPO_ROOT, "ai_evals"); // <repo root>/ai_evals
+}
+
+/** Loads ai_evals/cases/<mode>.yaml; throws if it is not a list or an entry lacks id/prompt. */
+export async function loadCases(mode: EvalMode): Promise<EvalCase[]> {
+  const filePath = path.join(CASES_DIR, `${mode}.yaml`);
+  const parsed: unknown = parse(await readFile(filePath, "utf8"));
+  if (!Array.isArray(parsed)) {
+    throw new Error(`Expected ${filePath} to contain a YAML list of cases`);
+  }
+  return parsed.map((item: Partial<RawEvalCase> | null, index) => {
+    const entry: Partial<RawEvalCase> = item ?? {}; // a bare `-` item parses to null
+    const { id, prompt, validate, judgeChecklist } = entry;
+    // Report file and position rather than yielding a case with an undefined id or prompt.
+    if (typeof id !== "string" || typeof prompt !== "string") {
+      throw new Error(`Case #${index + 1} in ${filePath} needs a string id and prompt`);
+    }
+    const [initialPath, expectedPath] = [entry.initial, entry.expected].map(resolveFixturePath);
+    return { id, prompt, initialPath, expectedPath, validate, judgeChecklist };
+  });
+}
+
+/**
+ * Loads the `mode` cases and returns those named in `selectedIds`, in the order given
+ * (all cases when the list is empty). Throws an Error naming every unknown id.
+ */
+export async function loadSelectedCases(
+  mode: EvalMode,
+  selectedIds: string[]
+): Promise<EvalCase[]> {
+  const available = await loadCases(mode);
+  if (selectedIds.length === 0) return available;
+
+  const byId = new Map<string, EvalCase>();
+  for (const evalCase of available) byId.set(evalCase.id, evalCase);
+
+  const unknown = selectedIds.filter((id) => !byId.has(id));
+  const plural = unknown.length === 1 ? "" : "s";
+  if (unknown.length > 0) throw new Error(`Unknown ${mode} case${plural}: ${unknown.join(", ")}`);
+  return selectedIds.map((id) => byId.get(id)!);
+}
+
+/** Anchors a relative fixture path at REPO_ROOT; absolute paths pass through unchanged. */
+function resolveFixturePath(value: string | undefined): string | undefined {
+  if (!value) return undefined; // no fixture configured (undefined or "")
+  if (path.isAbsolute(value)) return value;
+  return path.join(REPO_ROOT, value);
+}
diff --git a/ai_evals/core/files.ts b/ai_evals/core/files.ts
new file mode 100644
index 0000000000..7b58f76e28
--- /dev/null
+++ b/ai_evals/core/files.ts
@@ -0,0 +1,67 @@
+import { access, copyFile, mkdir, readdir, readFile } from "node:fs/promises";
+import path from "node:path";
+
+/** Resolves to true when `access(filePath)` succeeds,
+ *  and to false when it rejects, whatever the reason. */
+export async function exists(filePath: string): Promise<boolean> {
+  return access(filePath).then(
+    () => true,
+    () => false
+  );
+}
+
+/** Reads `filePath` as UTF-8 and parses it as JSON; the result is cast to `T` without validation. */
+export async function readJsonFile<T>(filePath: string): Promise<T> {
+  return JSON.parse(await readFile(filePath, "utf8")) as T;
+}
+
+/** Recursively reads every file under `rootDir` into a map of "/"-joined relative path -> UTF-8 text.
+ *  Entries whose bare name or relative path is listed in `options.ignore` are skipped. */
+export async function readDirectoryFiles(
+  rootDir: string,
+  options: { ignore?: Set<string> } = {}
+): Promise<Record<string, string>> {
+  const collected: Record<string, string> = {};
+  await walkDirectory(rootDir, "", collected, options.ignore ?? new Set<string>());
+  return collected;
+}
+
+/**
+ * Recursively copies `sourceDir` into `targetDir`, creating `targetDir` (and parents) when missing.
+ * Subdirectories are copied depth-first; every non-directory entry goes through `copyFile`.
+ */
+export async function copyDirectory(sourceDir: string, targetDir: string): Promise<void> {
+  const entries = await readdir(sourceDir, { withFileTypes: true });
+  // One mkdir per directory is enough: every file below lands directly inside `targetDir`.
+  await mkdir(targetDir, { recursive: true });
+
+  for (const entry of entries) {
+    const sourcePath = path.join(sourceDir, entry.name);
+    const targetPath = path.join(targetDir, entry.name);
+    await (entry.isDirectory() ? copyDirectory(sourcePath, targetPath) : copyFile(sourcePath, targetPath));
+  }
+}
+
+/**
+ * Depth-first helper for `readDirectoryFiles`: stores each file's UTF-8 content in `output`,
+ * keyed by its "/"-joined path relative to the walk root. An entry is skipped (together with
+ * everything below it) when `ignore` contains its bare name or its relative path.
+ */
+async function walkDirectory(
+  absoluteDir: string,
+  relativeDir: string,
+  output: Record<string, string>,
+  ignore: Set<string>
+): Promise<void> {
+  for (const child of await readdir(absoluteDir, { withFileTypes: true })) {
+    const childRelative = relativeDir === "" ? child.name : `${relativeDir}/${child.name}`;
+    if (ignore.has(childRelative) || ignore.has(child.name)) continue;
+
+    const childAbsolute = path.join(absoluteDir, child.name);
+    if (child.isDirectory()) {
+      await walkDirectory(childAbsolute, childRelative, output, ignore);
+    } else {
+      output[childRelative] = await readFile(childAbsolute, "utf8");
+    }
+  }
+}
diff --git a/ai_evals/core/judge.ts b/ai_evals/core/judge.ts
new file mode 100644
index 0000000000..cae66721c6
--- /dev/null
+++ b/ai_evals/core/judge.ts
@@ -0,0 +1,149 @@
+import Anthropic from "@anthropic-ai/sdk";
+import type { EvalMode, JudgeResult } from "./types";
+
+export const DEFAULT_JUDGE_MODEL = "claude-sonnet-4-6"; // used by judgeOutput when no `model` is passed
+
+const JUDGE_TOOL_NAME = "submit_judgement"; // the single tool the judge request forces the model to call
+
+export async function judgeOutput(input: { // scores one eval output with an Anthropic model; API failures come back as success=false results
+  mode: EvalMode;
+  prompt: string;
+  checklist?: string[];
+  initial?: unknown;
+  expected?: unknown;
+  actual: unknown;
+  model?: string;
+}): Promise<JudgeResult> {
+  const apiKey = process.env.ANTHROPIC_API_KEY;
+  if (!apiKey) { // no key: skip the request and report the judge as unavailable
+    return {
+      success: false,
+      score: 0,
+      summary: "Judge unavailable",
+      error: "ANTHROPIC_API_KEY is not set",
+    };
+  }
+
+  const client = new Anthropic({ apiKey });
+  const model = input.model ?? DEFAULT_JUDGE_MODEL;
+
+  const system = [
+    "You evaluate benchmark outputs for Windmill AI generation.",
+    "Deterministic checks already run separately. Focus on whether the final output satisfies the user request.",
+    "If expected state is provided, treat it as a valid example and reward semantically equivalent outputs.",
+    "If a checklist is provided, treat it as the explicit acceptance criteria for this case.",
+    "Be strict about missing requested functionality.",
+    "When the prompt wording is ambiguous, prefer the checklist over inferred structural requirements.",
+    "Do not require exact ids, exact topology, or exact field names unless the prompt, checklist, or expected state clearly requires them.",
+    `Always respond by calling the ${JUDGE_TOOL_NAME} tool exactly once.`,
+  ].join("\n\n");
+
+  const user = [ // absent checklist/initial/expected sections render as "(none)"
+    `Mode: ${input.mode}`,
+    "",
+    "User prompt:",
+    input.prompt,
+    "",
+    "Checklist:",
+    formatChecklist(input.checklist),
+    "",
+    "Initial state:",
+    formatJsonBlock(input.initial),
+    "",
+    "Expected state:",
+    formatJsonBlock(input.expected),
+    "",
+    "Actual result:",
+    formatJsonBlock(input.actual),
+  ].join("\n");
+
+  try {
+    const response = await client.messages.create({
+      model,
+      max_tokens: 1024,
+      temperature: 0,
+      system,
+      messages: [{ role: "user", content: user }],
+      tools: [
+        {
+          name: JUDGE_TOOL_NAME,
+          description: "Submit the benchmark judgement as structured data.",
+          input_schema: {
+            type: "object",
+            properties: {
+              score: {
+                type: "integer",
+                minimum: 0,
+                maximum: 100,
+              },
+              summary: {
+                type: "string",
+              },
+            },
+            required: ["score", "summary"],
+          },
+        },
+      ],
+      tool_choice: { // force the submit tool so the verdict arrives as structured tool input
+        type: "tool",
+        name: JUDGE_TOOL_NAME,
+        disable_parallel_tool_use: true,
+      },
+    });
+
+    const toolUseBlock = response.content.find(
+      (block): block is Anthropic.ToolUseBlock =>
+        block.type === "tool_use" && block.name === JUDGE_TOOL_NAME
+    );
+
+    if (!toolUseBlock) {
+      return {
+        success: false,
+        score: 0,
+        summary: "Judge returned no tool output",
+        error: "Expected structured tool output from judge",
+      };
+    }
+
+    const parsed = toolUseBlock.input as { // NOTE(review): unchecked cast; only `score` is sanitized below
+      score: number;
+      summary: string;
+    };
+
+    return {
+      success: true,
+      score: normalizeScore(parsed.score),
+      summary: parsed.summary,
+    };
+  } catch (error) { // anything thrown inside the try block is folded into a failed JudgeResult
+    const message = error instanceof Error ? error.message : String(error);
+    return {
+      success: false,
+      score: 0,
+      summary: "Judge failed",
+      error: message,
+    };
+  }
+}
+
+/**
+ * Pretty-prints `value` as 2-space-indented JSON for the judge prompt; `undefined` becomes "(none)".
+ */
+function formatJsonBlock(value: unknown): string {
+  return value === undefined ? "(none)" : JSON.stringify(value, null, 2);
+}
+
+/** Renders the checklist as "- item" lines; a missing or empty checklist renders as "(none)". */
+function formatChecklist(checklist: string[] | undefined): string {
+  const items = checklist ?? [];
+  if (items.length === 0) return "(none)";
+  const bullets = items.map((item) => `- ${item}`);
+  return bullets.join("\n");
+}
+
+/** Coerces a judge score to an integer in [0, 100]; NaN and +/-Infinity collapse to 0. */
+function normalizeScore(value: number): number {
+  // Round first, then clamp; anything non-finite counts as a zero score.
+  const rounded = Number.isFinite(value) ? Math.round(value) : 0;
+  return Math.min(100, Math.max(0, rounded));
+}
diff --git a/ai_evals/core/models.test.ts b/ai_evals/core/models.test.ts
new file mode 100644
index 0000000000..86bf1c6a9a
--- /dev/null
+++ b/ai_evals/core/models.test.ts
@@ -0,0 +1,29 @@
+import { describe, expect, it } from "bun:test";
+import { resolveEvalModel } from "./models";
+
+describe("resolveEvalModel", () => { // alias resolution per mode, exercised through the public resolver
+  it("supports Gemini aliases for frontend evals", () => {
+    expect(resolveEvalModel("flow", "gemini").frontend).toEqual({
+      provider: "googleai",
+      model: "gemini-2.5-flash",
+    });
+    expect(resolveEvalModel("app", "gemini-pro").frontend).toEqual({
+      provider: "googleai",
+      model: "gemini-2.5-pro",
+    });
+    expect(resolveEvalModel("script", "gemini-3-flash-preview").frontend).toEqual({
+      provider: "googleai",
+      model: "gemini-3-flash-preview",
+    });
+    expect(resolveEvalModel("flow", "gemini-3.1-pro-preview").frontend).toEqual({
+      provider: "googleai",
+      model: "gemini-3.1-pro-preview",
+    });
+  });
+
+  it("rejects Gemini aliases for cli evals", () => { // the message names the canonical id ("gemini-flash"), not the alias passed in
+    expect(() => resolveEvalModel("cli", "gemini")).toThrow(
+      "Model gemini-flash is not supported for cli mode"
+    );
+  });
+});
diff --git a/ai_evals/core/models.ts b/ai_evals/core/models.ts
new file mode 100644
index 0000000000..9cc0ab0597
--- /dev/null
+++ b/ai_evals/core/models.ts
@@ -0,0 +1,185 @@
+import type { EvalMode } from "./types";
+
+export interface FrontendEvalModelConfig { // provider/model pair required for flow, script and app modes
+  provider: "anthropic" | "openai" | "googleai";
+  model: string; // model identifier; presumably handed to the provider as-is — TODO confirm
+}
+
+export interface CliEvalModelConfig { // provider/model pair required for cli mode
+  provider: "anthropic";
+  model: string; // EVAL_MODELS uses short names here: "haiku", "sonnet", "opus"
+}
+
+export interface EvalModelSpec {
+  id: string; // canonical id; matched by the alias lookup and quoted in error messages
+  label: string; // human-readable name printed in the model help text
+  aliases: string[]; // extra names matched case-insensitively by findEvalModel
+  frontend?: FrontendEvalModelConfig; // absent => model is rejected for flow/script/app modes
+  cli?: CliEvalModelConfig; // absent => model is rejected for cli mode
+}
+
+export const EVAL_MODELS: EvalModelSpec[] = [ // order matters: the first entry is what getDefaultEvalModel returns
+  {
+    id: "haiku",
+    label: "Claude Haiku 4.5",
+    aliases: [
+      "haiku",
+      "haiku-4.5",
+      "claude-haiku",
+      "claude-haiku-4.5",
+      "claude-haiku-4-5",
+      "claude-haiku-4-5-20251001",
+    ],
+    frontend: {
+      provider: "anthropic",
+      model: "claude-haiku-4-5-20251001",
+    },
+    cli: {
+      provider: "anthropic",
+      model: "haiku",
+    },
+  },
+  {
+    id: "sonnet",
+    label: "Claude Sonnet 4.5",
+    aliases: [
+      "sonnet",
+      "sonnet-4.5",
+      "claude-sonnet",
+      "claude-sonnet-4.5",
+      "claude-sonnet-4-5",
+      "claude-sonnet-4-5-20250929",
+    ],
+    frontend: {
+      provider: "anthropic",
+      model: "claude-sonnet-4-5-20250929",
+    },
+    cli: {
+      provider: "anthropic",
+      model: "sonnet",
+    },
+  },
+  {
+    id: "opus",
+    label: "Claude Opus 4.6",
+    aliases: [
+      "opus",
+      "opus-4.6",
+      "claude-opus",
+      "claude-opus-4.6",
+      "claude-opus-4-6",
+    ],
+    frontend: {
+      provider: "anthropic",
+      model: "claude-opus-4-6",
+    },
+    cli: {
+      provider: "anthropic",
+      model: "opus",
+    },
+  },
+  { // no `cli` config from here on: the remaining models are frontend-only
+    id: "4o",
+    label: "GPT-4o",
+    aliases: ["4o", "gpt-4o"],
+    frontend: {
+      provider: "openai",
+      model: "gpt-4o",
+    },
+  },
+  {
+    id: "gemini-flash",
+    label: "Gemini 2.5 Flash",
+    aliases: ["gemini", "gemini-flash", "gemini-2.5-flash"],
+    frontend: {
+      provider: "googleai",
+      model: "gemini-2.5-flash",
+    },
+  },
+  {
+    id: "gemini-pro",
+    label: "Gemini 2.5 Pro",
+    aliases: ["gemini-pro", "gemini-2.5-pro"],
+    frontend: {
+      provider: "googleai",
+      model: "gemini-2.5-pro",
+    },
+  },
+  {
+    id: "gemini-3-flash-preview",
+    label: "Gemini 3 Flash Preview",
+    aliases: ["gemini-3-flash-preview", "gemini-3-flash"],
+    frontend: {
+      provider: "googleai",
+      model: "gemini-3-flash-preview",
+    },
+  },
+  {
+    id: "gemini-3.1-pro-preview",
+    label: "Gemini 3.1 Pro Preview",
+    aliases: ["gemini-3.1-pro-preview", "gemini-3.1-pro", "gemini-3-pro-preview"],
+    frontend: {
+      provider: "googleai",
+      model: "gemini-3.1-pro-preview",
+    },
+  },
+];
+
+/**
+ * Resolves `alias` (or, when omitted, the default model for `mode`) to a spec usable in `mode`.
+ * Throws for an unknown alias and for a spec without the cli/frontend config that `mode` requires.
+ */
+export function resolveEvalModel(mode: EvalMode, alias?: string): EvalModelSpec {
+  const spec = alias ? findEvalModel(alias) : getDefaultEvalModel(mode);
+  if (!spec) {
+    throw new Error(`Unknown model: ${alias}`);
+  }
+  // cli runs need `spec.cli`; flow, script and app runs need `spec.frontend`.
+  const config = mode === "cli" ? spec.cli : spec.frontend;
+  if (!config) {
+    throw new Error(`Model ${spec.id} is not supported for ${mode} mode`);
+  }
+  return spec;
+}
+
+/** Renders one help line per model (id, label, supported modes) with ids padded to a common width. */
+export function getEvalModelHelpText(): string {
+  // Pad to the longest id (never below the historical 8) so long Gemini ids keep the columns aligned.
+  const idWidth = Math.max(8, ...EVAL_MODELS.map((model) => model.id.length));
+  return EVAL_MODELS.map((model) => {
+    const modes = [...(model.frontend ? ["flow", "script", "app"] : []), ...(model.cli ? ["cli"] : [])];
+    return `  ${model.id.padEnd(idWidth)} ${model.label} (${modes.join(", ")})`;
+  }).join("\n");
+}
+
+/** Formats the `provider:model` label for a run; throws when the spec lacks a config for the mode. */
+export function formatRunModelLabel(mode: EvalMode, model: EvalModelSpec): string {
+  // Go through the checked accessors so a missing config yields a descriptive error, not a TypeError.
+  const config = mode === "cli" ? getCliEvalModel(model) : getFrontendEvalModel(model);
+  return `${config.provider}:${config.model}`;
+}
+
+/** Returns the spec's frontend config, throwing when the model has none. */
+export function getFrontendEvalModel(model: EvalModelSpec): FrontendEvalModelConfig {
+  const { frontend, id } = model;
+  if (frontend) return frontend;
+  throw new Error(`Model ${id} does not support frontend evals`);
+}
+
+/** Returns the spec's cli config, throwing when the model has none. */
+export function getCliEvalModel(model: EvalModelSpec): CliEvalModelConfig {
+  const { cli, id } = model;
+  if (cli) return cli;
+  throw new Error(`Model ${id} does not support cli evals`);
+}
+
+/** Default model for every mode: the first `EVAL_MODELS` entry,
+ *  which carries both a frontend and a cli config. */
+function getDefaultEvalModel(_mode: EvalMode): EvalModelSpec { return EVAL_MODELS[0]!; }
+
+/** Case-insensitive, whitespace-trimmed lookup of a model by id or alias; undefined when nothing matches. */
+function findEvalModel(alias: string): EvalModelSpec | undefined {
+  const wanted = alias.trim().toLowerCase();
+  const matches = (name: string): boolean => name.toLowerCase() === wanted;
+  return EVAL_MODELS.find((spec) => matches(spec.id) || spec.aliases.some((name) => matches(name)));
+}
diff --git a/ai_evals/core/results.ts b/ai_evals/core/results.ts
new file mode 100644
index 0000000000..f2ea5b7dc4
--- /dev/null
+++ b/ai_evals/core/results.ts
@@ -0,0 +1,296 @@
+import { appendFile, mkdir, rm, writeFile } from "node:fs/promises";
+import path from "node:path";
+import { execFileSync } from "node:child_process";
+import { getAiEvalsRoot, getRepoRoot } from "./cases";
+import type {
+  BenchmarkArtifactFile,
+  BenchmarkCaseResult,
+  BenchmarkRunResult,
+  BenchmarkTokenUsage,
+  EvalMode,
+} from "./types";
+
+/** Serializes the run result (without in-memory artifact contents) as pretty-printed JSON plus a
+ *  trailing newline, creating parent directories as needed. Returns the path that was written. */
+export async function writeRunResult(result: BenchmarkRunResult, outputPath?: string): Promise<string> {
+  const targetPath = resolveRunOutputPath(result.mode, outputPath);
+  await mkdir(path.dirname(targetPath), { recursive: true });
+  const payload = `${JSON.stringify(toSerializableRunResult(result), null, 2)}\n`;
+  await writeFile(targetPath, payload, "utf8");
+  return targetPath;
+}
+
+/** Appends one JSON line summarizing the run to the history file (default: the per-mode `.jsonl`)
+ *  and returns that file's path. Parent directories are created when missing. */
+export async function appendHistoryRecord(result: BenchmarkRunResult, historyPath = resolveHistoryPath(result.mode)): Promise<string> {
+  await mkdir(path.dirname(historyPath), { recursive: true });
+  const line = `${JSON.stringify(toHistoryRecord(result))}\n`;
+  await appendFile(historyPath, line, "utf8");
+  return historyPath;
+}
+
+/**
+ * Writes each attempt's in-memory `artifactFiles` under `<artifact root>/<case id>/attempt-<n>/` and
+ * records the directories on the attempts and on `result` (both mutated in place). The artifact root
+ * is derived from the result file path and wiped first. Returns the root, or null if nothing was written.
+ */
+export async function writeRunArtifacts(
+  result: BenchmarkRunResult,
+  outputPath?: string
+): Promise<string | null> {
+  const artifactRoot = defaultArtifactsRoot(resolveRunOutputPath(result.mode, outputPath));
+  // Start from a clean slate so files from an earlier run never mix with this one.
+  await rm(artifactRoot, { recursive: true, force: true });
+
+  let anyWritten = false;
+  for (const { id, attempts } of result.cases) {
+    for (const attempt of attempts) {
+      const files = attempt.artifactFiles ?? [];
+      const attemptDir = files.length > 0 ? path.join(artifactRoot, id, `attempt-${attempt.attempt}`) : null;
+      if (attemptDir !== null) {
+        await writeArtifactFiles(attemptDir, files);
+        anyWritten = true;
+      }
+      attempt.artifactsPath = attemptDir;
+    }
+  }
+  result.artifactsPath = anyWritten ? artifactRoot : null;
+  return result.artifactsPath ?? null;
+}
+
+/**
+ * Aggregates per-case results into the run record: attempt and pass counts, pass rate, average
+ * duration and token usage, plus creation time and git SHA. Token totals stay null when no attempt
+ * reported usage; token averages divide by ALL attempts, including those without usage data.
+ */
+export function buildRunResult(input: {
+  mode: EvalMode;
+  runs: number;
+  runModel: string | null;
+  judgeModel: string | null;
+  caseResults: BenchmarkCaseResult[];
+}): BenchmarkRunResult {
+  const attempts = input.caseResults.flatMap((entry) => entry.attempts);
+  const attemptCount = attempts.length;
+  let passedAttempts = 0;
+  let durationTotal = 0;
+  let tokenUsageTotal: BenchmarkTokenUsage | null = null;
+
+  for (const attempt of attempts) {
+    if (attempt.passed) passedAttempts += 1;
+    durationTotal += attempt.durationMs;
+    if (!attempt.tokenUsage) continue;
+    // Created lazily so that "no attempt reported usage" remains distinguishable as null.
+    tokenUsageTotal ??= { prompt: 0, completion: 0, total: 0 };
+    tokenUsageTotal.prompt += attempt.tokenUsage.prompt;
+    tokenUsageTotal.completion += attempt.tokenUsage.completion;
+    tokenUsageTotal.total += attempt.tokenUsage.total;
+  }
+
+  // An empty run (zero attempts) reports 0 for rates and durations rather than NaN.
+  const perAttempt = (total: number): number => (attemptCount === 0 ? 0 : total / attemptCount);
+  const averageTokenUsagePerAttempt =
+    attemptCount === 0 || !tokenUsageTotal
+      ? null
+      : {
+          prompt: perAttempt(tokenUsageTotal.prompt),
+          completion: perAttempt(tokenUsageTotal.completion),
+          total: perAttempt(tokenUsageTotal.total),
+        };
+  return {
+    version: 1,
+    mode: input.mode,
+    createdAt: new Date().toISOString(),
+    gitSha: getGitSha(),
+    runs: input.runs,
+    runModel: input.runModel,
+    judgeModel: input.judgeModel,
+    caseCount: input.caseResults.length,
+    attemptCount,
+    passedAttempts,
+    passRate: perAttempt(passedAttempts),
+    averageDurationMs: perAttempt(durationTotal),
+    totalTokenUsage: tokenUsageTotal,
+    averageTokenUsagePerAttempt,
+    cases: input.caseResults,
+  };
+}
+
+/**
+ * Builds the human-readable run summary: headline, pass rate, average duration and, when any
+ * attempt failed, a "Failures:" section listing at most the first 10 failed attempts.
+ */
+export function formatRunSummary(result: BenchmarkRunResult): string {
+  const passRateLine = `Pass rate: ${formatPercent(result.passRate)} (${result.passedAttempts}/${result.attemptCount})`;
+  const header = [
+    `${result.mode} benchmark complete`,
+    passRateLine,
+    `Average duration: ${Math.round(result.averageDurationMs)}ms`,
+  ];
+
+  const failures = collectFailures(result);
+  const failureSection =
+    failures.length > 0 ? ["Failures:", ...failures.slice(0, 10).map((entry) => `- ${entry}`)] : [];
+  return [...header, ...failureSection].join("\n");
+}
+
+/**
+ * Lists one "<case id> attempt <n>: <reason>" line per failed attempt, in case/attempt order.
+ * The reason is the comma-joined names of the failed checks, else the attempt error, else "failed".
+ */
+function collectFailures(result: BenchmarkRunResult): string[] {
+  return result.cases.flatMap((caseResult) =>
+    caseResult.attempts
+      .filter((attempt) => !attempt.passed)
+      .map((attempt) => {
+        const failedNames = attempt.checks
+          .filter((check) => !check.passed)
+          .map((check) => check.name)
+          .join(", ");
+        return `${caseResult.id} attempt ${attempt.attempt}: ${failedNames || attempt.error || "failed"}`;
+      })
+  );
+}
+
+function defaultFileName(mode: EvalMode): string { // "<ISO timestamp with ':' -> '-'>__<mode>.json"; reads the clock on every call
+  return `${new Date().toISOString().replaceAll(":", "-")}__${mode}.json`;
+}
+
+export function resolveRunOutputPath(mode: EvalMode, outputPath?: string): string { // NOTE(review): without `outputPath`, each call yields a new timestamped path under <ai_evals>/results
+  return outputPath ?? path.join(getAiEvalsRoot(), "results", defaultFileName(mode));
+}
+
+export function resolveHistoryPath(mode: EvalMode): string { // one append-only history file per mode: <ai_evals>/history/<mode>.jsonl
+  return path.join(getAiEvalsRoot(), "history", `${mode}.jsonl`);
+}
+
+/** Derives the artifacts directory from a result path: strips a trailing ".json", else appends ".artifacts". */
+function defaultArtifactsRoot(resultPath: string): string {
+  if (!resultPath.endsWith(".json")) return `${resultPath}.artifacts`;
+  return resultPath.slice(0, resultPath.length - ".json".length);
+}
+
+/**
+ * Writes the artifact files beneath `rootDir`, one at a time, creating parent directories as needed.
+ */
+async function writeArtifactFiles(rootDir: string, files: BenchmarkArtifactFile[]): Promise<void> {
+  for (const { path: artifactPath, content } of files) {
+    // normalizeArtifactPath throws on "." / ".." segments, so targets cannot climb out of `rootDir`.
+    const targetPath = path.join(rootDir, normalizeArtifactPath(artifactPath));
+    await mkdir(path.dirname(targetPath), { recursive: true });
+    await writeFile(targetPath, content, "utf8");
+  }
+}
+
+/** Normalizes an artifact path to "/"-joined relative segments (backslashes count as separators;
+ *  leading slashes and empty segments are dropped). Throws if nothing is left or a segment is "." / "..". */
+function normalizeArtifactPath(filePath: string): string {
+  const segments = filePath.split(/[\\/]+/).filter((segment) => segment !== "");
+  const unsafe = segments.length === 0 || segments.includes(".") || segments.includes("..");
+  if (unsafe) throw new Error(`Invalid artifact path: ${filePath}`);
+  return segments.join("/");
+}
+
+/** Returns a shallow copy of the run result in which every attempt omits the in-memory
+ *  `artifactFiles` payload, so file contents are not duplicated into the result JSON. */
+function toSerializableRunResult(result: BenchmarkRunResult): BenchmarkRunResult {
+  const cases = result.cases.map((caseResult) => {
+    const attempts = caseResult.attempts.map(({ artifactFiles: _omitted, ...rest }) => rest);
+    return { ...caseResult, attempts };
+  });
+  return { ...result, cases };
+}
+
+function toHistoryRecord(result: BenchmarkRunResult) { // compact run summary for the history file: aggregates only, no prompts, checks or artifacts
+  const judgeScores = result.cases.flatMap((caseResult) => // run-wide scores; attempts without a numeric judgeScore are left out
+    caseResult.attempts.flatMap((attempt) =>
+      typeof attempt.judgeScore === "number" ? [attempt.judgeScore] : []
+    )
+  );
+
+  return {
+    createdAt: result.createdAt,
+    gitSha: result.gitSha,
+    mode: result.mode,
+    runs: result.runs,
+    runModel: result.runModel,
+    judgeModel: result.judgeModel,
+    caseCount: result.caseCount,
+    attemptCount: result.attemptCount,
+    passedAttempts: result.passedAttempts,
+    passRate: result.passRate,
+    averageDurationMs: result.averageDurationMs,
+    averageJudgeScore: // null when no attempt in the run carries a numeric judge score
+      judgeScores.length === 0
+        ? null
+        : judgeScores.reduce((sum, score) => sum + score, 0) / judgeScores.length,
+    averageTokenUsagePerAttempt: result.averageTokenUsagePerAttempt ?? null,
+    failedCaseIds: Array.from( // a case is listed once if at least one of its attempts failed
+      new Set(
+        result.cases
+          .filter((caseResult) => caseResult.attempts.some((attempt) => !attempt.passed))
+          .map((caseResult) => caseResult.id)
+      )
+    ),
+    cases: result.cases.map((caseResult) => { // per-case breakdown using the same aggregates as the run level
+      const attemptCount = caseResult.attempts.length;
+      const passedAttempts = caseResult.attempts.filter((attempt) => attempt.passed).length;
+      const totalDurationMs = caseResult.attempts.reduce(
+        (sum, attempt) => sum + attempt.durationMs,
+        0
+      );
+      const judgeScores = caseResult.attempts.flatMap((attempt) => // shadows the run-wide list above on purpose
+        typeof attempt.judgeScore === "number" ? [attempt.judgeScore] : []
+      );
+      const totalTokenUsage = caseResult.attempts.reduce<BenchmarkTokenUsage | null>(
+        (sum, attempt) => {
+          if (!attempt.tokenUsage) {
+            return sum;
+          }
+          sum ??= { prompt: 0, completion: 0, total: 0 };
+          sum.prompt += attempt.tokenUsage.prompt;
+          sum.completion += attempt.tokenUsage.completion;
+          sum.total += attempt.tokenUsage.total;
+          return sum;
+        },
+        null
+      );
+
+      return {
+        id: caseResult.id,
+        attemptCount,
+        passedAttempts,
+        passRate: attemptCount === 0 ? 0 : passedAttempts / attemptCount,
+        averageDurationMs: attemptCount === 0 ? 0 : totalDurationMs / attemptCount,
+        averageJudgeScore:
+          judgeScores.length === 0
+            ? null
+            : judgeScores.reduce((sum, score) => sum + score, 0) / judgeScores.length,
+        averageTokenUsagePerAttempt: // divides by all attempts of the case, including those without usage data
+          attemptCount === 0 || !totalTokenUsage
+            ? null
+            : {
+                prompt: totalTokenUsage.prompt / attemptCount,
+                completion: totalTokenUsage.completion / attemptCount,
+                total: totalTokenUsage.total / attemptCount,
+              },
+      };
+    }),
+  };
+}
+
+/** Returns the trimmed output of `git rev-parse HEAD` run at the repo root, or null when that fails. */
+function getGitSha(): string | null {
+  try {
+    const cwd = getRepoRoot();
+    const stdout = execFileSync("git", ["rev-parse", "HEAD"], { cwd, encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] });
+    return stdout.trim();
+  } catch {
+    // Best effort: any failure to run git simply means the run is recorded without a SHA.
+    return null;
+  }
+}
+
+/** Formats a 0-1 ratio as a percentage
+ *  with exactly one decimal place, e.g. 0.5 -> "50.0%". */
+function formatPercent(value: number): string { return `${(value * 100).toFixed(1)}%`; }
diff --git a/ai_evals/core/runSuite.ts b/ai_evals/core/runSuite.ts
new file mode 100644
index 0000000000..9e155298f0
--- /dev/null
+++ b/ai_evals/core/runSuite.ts
@@ -0,0 +1,264 @@
+import { judgeOutput, DEFAULT_JUDGE_MODEL } from "./judge";
+import type {
+  BenchmarkAttemptResult,
+  BenchmarkCaseResult,
+  BenchmarkCheck,
+  EvalCase,
+  FrontendBenchmarkProgressEvent,
+  ModeRunner,
+} from "./types";
+
+export async function runSuite<TInitial, TExpected, TActual>(input: { // runs every case `runs` times through a pool of workers; results keep the order of `cases`
+  modeRunner: ModeRunner<TInitial, TExpected, TActual>;
+  cases: EvalCase[];
+  runs: number;
+  runModel: string | null;
+  judgeModel?: string | null;
+  concurrency?: number;
+  verbose?: boolean;
+  onProgress?: (event: FrontendBenchmarkProgressEvent) => void;
+}): Promise<BenchmarkCaseResult[]> {
+  const judgeModel = input.judgeModel ?? DEFAULT_JUDGE_MODEL;
+  const concurrency = Math.max(1, input.concurrency ?? input.modeRunner.concurrency); // explicit override wins over the runner's value; floor of 1
+  const results = new Array<BenchmarkCaseResult>(input.cases.length);
+  let cursor = 0; // next case index to hand out; shared by all workers
+
+  if (input.modeRunner.mode !== "cli") { // progress events are emitted for non-cli modes only
+    input.onProgress?.({
+      type: "run-start",
+      surface: input.modeRunner.mode,
+      totalCases: input.cases.length,
+      runs: input.runs,
+      concurrency,
+    });
+  }
+
+  async function worker(): Promise<void> {
+    while (true) {
+      const caseIndex = cursor++; // claimed with no await in between, so no index is handed out twice
+      if (caseIndex >= input.cases.length) {
+        return;
+      }
+      const evalCase = input.cases[caseIndex];
+      results[caseIndex] = { // written by index, so completion order does not affect result order
+        id: evalCase.id,
+        prompt: evalCase.prompt,
+        initialPath: evalCase.initialPath,
+        expectedPath: evalCase.expectedPath,
+        attempts: await runCaseAttempts({
+          caseIndex,
+          evalCase,
+          runs: input.runs,
+          judgeModel,
+          judgeThreshold: input.modeRunner.judgeThreshold ?? 80, // pass mark used when the runner sets none
+          modeRunner: input.modeRunner,
+          totalCases: input.cases.length,
+          verbose: input.verbose ?? false,
+          onProgress: input.onProgress,
+        }),
+      };
+    }
+  }
+
+  await Promise.all(
+    Array.from({ length: Math.min(concurrency, input.cases.length) }, () => worker()) // never more workers than cases
+  );
+
+  return results;
+}
+
+async function runCaseAttempts<TInitial, TExpected, TActual>(input: { // runs one case `runs` times in sequence; errors thrown inside the try block become failed attempts
+  caseIndex: number;
+  evalCase: EvalCase;
+  runs: number;
+  judgeModel: string;
+  judgeThreshold: number;
+  modeRunner: ModeRunner<TInitial, TExpected, TActual>;
+  totalCases: number;
+  verbose: boolean;
+  onProgress?: (event: FrontendBenchmarkProgressEvent) => void;
+}): Promise<BenchmarkAttemptResult[]> {
+  const attempts: BenchmarkAttemptResult[] = [];
+  const surface = input.modeRunner.mode === "cli" ? null : input.modeRunner.mode; // null for cli, which emits no progress events
+
+  for (let attempt = 1; attempt <= input.runs; attempt += 1) { // attempt numbers are 1-based
+    if (surface) {
+      input.onProgress?.({
+        type: "attempt-start",
+        surface,
+        caseId: input.evalCase.id,
+        caseNumber: input.caseIndex + 1,
+        totalCases: input.totalCases,
+        attempt,
+        runs: input.runs,
+      });
+    }
+
+    const startedAt = Date.now();
+
+    try {
+      const initial = await input.modeRunner.loadInitial(input.evalCase.initialPath); // fixtures are reloaded for every attempt
+      const expected = await input.modeRunner.loadExpected(input.evalCase.expectedPath);
+      const run = await input.modeRunner.run(input.evalCase.prompt, initial, {
+        caseId: input.evalCase.id,
+        caseNumber: input.caseIndex + 1,
+        totalCases: input.totalCases,
+        attempt,
+        runs: input.runs,
+        verbose: input.verbose,
+        onAssistantMessageStart: input.verbose && surface // streaming callbacks exist only for verbose, non-cli runs
+          ? () =>
+              input.onProgress?.({
+                type: "assistant-message-start",
+                surface,
+                caseId: input.evalCase.id,
+                caseNumber: input.caseIndex + 1,
+                totalCases: input.totalCases,
+                attempt,
+                runs: input.runs,
+              })
+          : undefined,
+        onAssistantChunk: input.verbose && surface
+          ? (chunk: string) =>
+              input.onProgress?.({
+                type: "assistant-chunk",
+                surface,
+                caseId: input.evalCase.id,
+                caseNumber: input.caseIndex + 1,
+                totalCases: input.totalCases,
+                attempt,
+                runs: input.runs,
+                chunk,
+              })
+          : undefined,
+        onAssistantMessageEnd: input.verbose && surface
+          ? () =>
+              input.onProgress?.({
+                type: "assistant-message-end",
+                surface,
+                caseId: input.evalCase.id,
+                caseNumber: input.caseIndex + 1,
+                totalCases: input.totalCases,
+                attempt,
+                runs: input.runs,
+              })
+          : undefined,
+      });
+      const checks: BenchmarkCheck[] = [ // the runner's own checks are recorded whether or not the run succeeded
+        buildCheck("run succeeded", run.success, run.error),
+        ...input.modeRunner.validate({
+          evalCase: input.evalCase,
+          prompt: input.evalCase.prompt,
+          initial,
+          expected,
+          actual: run.actual,
+          run,
+        }),
+      ];
+
+      let judgeScore: number | null = null;
+      let judgeSummary: string | null = null;
+
+      if (run.success) { // the judge is consulted only for runs that reported success
+        const judge = await judgeOutput({
+          mode: input.modeRunner.mode,
+          prompt: input.evalCase.prompt,
+          checklist: input.evalCase.judgeChecklist,
+          initial,
+          expected: input.modeRunner.mode === "cli" ? undefined : expected, // cli runs never show the expected state to the judge
+          actual: run.actual,
+          model: input.judgeModel,
+        });
+
+        judgeScore = judge.success ? judge.score : null;
+        judgeSummary = judge.summary;
+        checks.push(buildCheck("judge succeeded", judge.success, judge.error));
+        checks.push(
+          buildCheck(
+            `judge score >= ${input.judgeThreshold}`,
+            (judgeScore ?? 0) >= input.judgeThreshold, // a failed judge call counts as score 0
+            judge.success ? `score=${judgeScore}` : judge.error
+          )
+        );
+      }
+
+      const artifactFiles = input.modeRunner.buildArtifacts?.(run.actual) ?? [];
+      const attemptResult: BenchmarkAttemptResult = {
+        attempt,
+        passed: checks.every((check) => check.passed), // every check must pass, judge checks included
+        durationMs: Date.now() - startedAt,
+        assistantMessageCount: run.assistantMessageCount,
+        toolCallCount: run.toolCallCount,
+        toolsUsed: uniqueStrings(run.toolsUsed),
+        skillsInvoked: uniqueStrings(run.skillsInvoked),
+        checks,
+        judgeScore,
+        judgeSummary,
+        error: run.error ?? null,
+        tokenUsage: run.tokenUsage ?? null,
+        artifactsPath: null, // no directory is recorded here; writeRunArtifacts assigns this field later
+        artifactFiles,
+      };
+
+      if (surface) {
+        input.onProgress?.({
+          type: "attempt-finish",
+          surface,
+          caseId: input.evalCase.id,
+          caseNumber: input.caseIndex + 1,
+          totalCases: input.totalCases,
+          attempt,
+          runs: input.runs,
+          passed: attemptResult.passed,
+          durationMs: attemptResult.durationMs,
+          judgeScore: attemptResult.judgeScore,
+          error: attemptResult.error,
+        });
+      }
+
+      attempts.push(attemptResult);
+    } catch (error) { // record the crash as a failed attempt and carry on with the next one
+      const message = error instanceof Error ? error.message : String(error);
+      const failedAttempt: BenchmarkAttemptResult = {
+        attempt,
+        passed: false,
+        durationMs: Date.now() - startedAt,
+        assistantMessageCount: 0,
+        toolCallCount: 0,
+        toolsUsed: [],
+        skillsInvoked: [],
+        checks: [buildCheck("run crashed", false, message)],
+        judgeScore: null,
+        judgeSummary: null,
+        error: message,
+        tokenUsage: null,
+      };
+      if (surface) {
+        input.onProgress?.({
+          type: "attempt-finish",
+          surface,
+          caseId: input.evalCase.id,
+          caseNumber: input.caseIndex + 1,
+          totalCases: input.totalCases,
+          attempt,
+          runs: input.runs,
+          passed: false,
+          durationMs: failedAttempt.durationMs,
+          judgeScore: null,
+          error: message,
+        });
+      }
+      attempts.push(failedAttempt);
+    }
+  }
+
+  return attempts;
+}
+
+function buildCheck(name: string, passed: boolean, details?: string): BenchmarkCheck { // the `details` key is omitted entirely when details is empty or undefined
+  return details ? { name, passed, details } : { name, passed };
+}
+
+/** Removes duplicate strings,
+ *  keeping the first occurrence of each in its original position. */
+function uniqueStrings(values: string[]): string[] { return Array.from(new Set(values)); }
diff --git a/ai_evals/core/types.ts b/ai_evals/core/types.ts
new file mode 100644
index 0000000000..a8ed0baa28
--- /dev/null
+++ b/ai_evals/core/types.ts
@@ -0,0 +1,198 @@
+export const EVAL_MODES = ["cli", "flow", "script", "app"] as const; // the benchmark modes, as a readonly tuple
+// Union of the string literals listed in EVAL_MODES.
+export type EvalMode = (typeof EVAL_MODES)[number];
+
+export interface FlowValidationSpec { // optional extra requirements checked against a generated flow
+  schemaRequiredPaths?: string[]; // dotted paths (e.g. "a.b") that must each exist in the flow schema
+  schemaAnyOf?: Array<{ // alternatives: at least one variant must have all of its paths present
+    requiredPaths: string[];
+  }>;
+  resolveResultsRefs?: boolean; // when true, `results.<id>` references are checked for resolvability
+  requireSpecialModules?: Array<"preprocessor_module" | "failure_module">; // special modules that must exist
+  requireSuspendSteps?: Array<{ // steps that must exist and carry a suspend config
+    id: string; // step id, searched among special, top-level and nested modules
+    requiredEvents?: number; // must equal the value read by getSuspendRequiredEvents
+    resumeRequiredStringFieldAnyOf?: string[]; // at least one must be among getSuspendResumeStringFields(step)
+  }>;
+}
+
+export interface EvalCase { // one benchmark case
+  id: string; // reported as `caseId` in progress events
+  prompt: string;
+  initialPath?: string; // presumably the argument for ModeRunner.loadInitial — confirm in the runner
+  expectedPath?: string; // presumably the argument for ModeRunner.loadExpected — confirm in the runner
+  validate?: FlowValidationSpec; // extra flow requirements (see FlowValidationSpec)
+  judgeChecklist?: string[]; // NOTE(review): looks like criteria handed to the judge — confirm
+}
+
+export interface BenchmarkCheck { // outcome of a single named validation
+  name: string;
+  passed: boolean;
+  details?: string; // optional explanation; the validators' `check` helper attaches it to failures only
+}
+
+export interface JudgeResult { // NOTE(review): produced by the judge step, which is not visible here — confirm field semantics
+  success: boolean;
+  score: number;
+  summary: string;
+  error?: string;
+}
+
+export interface BenchmarkArtifactFile { // element type of ModeRunner.buildArtifacts and BenchmarkAttemptResult.artifactFiles
+  path: string;
+  content: string;
+}
+
+export interface BenchmarkTokenUsage { // token counts; used per attempt and as run-level total/average
+  prompt: number;
+  completion: number;
+  total: number; // presumably prompt + completion — confirm where it is computed
+}
+
+export interface ModeRunOutput<TActual> { // what ModeRunner.run resolves to
+  success: boolean;
+  actual: TActual; // same type that ModeRunner.validate and buildArtifacts receive
+  error?: string;
+  assistantMessageCount: number;
+  toolCallCount: number;
+  toolsUsed: string[];
+  skillsInvoked: string[];
+  tokenUsage?: BenchmarkTokenUsage | null; // may be omitted or null
+}
+
+export interface ModeRunContext { // third argument of ModeRunner.run
+  caseId: string;
+  caseNumber: number;
+  totalCases: number;
+  attempt: number;
+  runs: number;
+  verbose: boolean;
+  onAssistantMessageStart?: () => void; // optional callbacks; names mirror the assistant-* progress events
+  onAssistantChunk?: (chunk: string) => void;
+  onAssistantMessageEnd?: () => void;
+}
+
+export interface ModeRunner<TInitial, TExpected, TActual> { // contract for loading, running and validating one eval mode
+  mode: EvalMode;
+  concurrency: number;
+  judgeThreshold?: number;
+  loadInitial(path?: string): Promise<TInitial | undefined>; // may resolve to undefined
+  loadExpected(path?: string): Promise<TExpected | undefined>; // may resolve to undefined
+  run(
+    prompt: string,
+    initial: TInitial | undefined,
+    context: ModeRunContext
+  ): Promise<ModeRunOutput<TActual>>;
+  validate(input: { // synchronous; returns the checks for one run
+    evalCase: EvalCase;
+    prompt: string;
+    initial: TInitial | undefined;
+    expected: TExpected | undefined;
+    actual: TActual;
+    run: ModeRunOutput<TActual>;
+  }): BenchmarkCheck[];
+  buildArtifacts?(actual: TActual): BenchmarkArtifactFile[]; // optional hook
+}
+
+export interface BenchmarkAttemptResult { // recorded outcome of one attempt
+  attempt: number;
+  passed: boolean;
+  durationMs: number;
+  assistantMessageCount: number;
+  toolCallCount: number;
+  toolsUsed: string[];
+  skillsInvoked: string[];
+  checks: BenchmarkCheck[]; // a crashed run is recorded as a single failed "run crashed" check
+  judgeScore: number | null; // null when no score exists, e.g. for a crashed run
+  judgeSummary: string | null;
+  error: string | null; // error message, or null
+  tokenUsage?: BenchmarkTokenUsage | null;
+  artifactsPath?: string | null;
+  artifactFiles?: BenchmarkArtifactFile[];
+}
+
+export interface BenchmarkCaseResult { // one case together with its attempt results
+  id: string;
+  prompt: string;
+  initialPath?: string;
+  expectedPath?: string;
+  attempts: BenchmarkAttemptResult[];
+}
+
+export interface BenchmarkRunResult { // summary of a whole benchmark run
+  version: 1; // literal type: must be exactly 1
+  mode: EvalMode;
+  createdAt: string;
+  gitSha: string | null;
+  runs: number;
+  runModel: string | null;
+  judgeModel: string | null;
+  caseCount: number;
+  attemptCount: number;
+  passedAttempts: number;
+  passRate: number; // presumably passedAttempts / attemptCount — confirm where it is computed
+  averageDurationMs: number;
+  totalTokenUsage?: BenchmarkTokenUsage | null;
+  averageTokenUsagePerAttempt?: BenchmarkTokenUsage | null;
+  artifactsPath?: string | null;
+  cases: BenchmarkCaseResult[];
+}
+
+export type FrontendBenchmarkProgressEvent = // discriminated union keyed on `type`
+  | {
+      type: "run-start";
+      surface: Exclude<EvalMode, "cli">; // every mode except "cli"
+      totalCases: number;
+      runs: number;
+      concurrency: number;
+    }
+  | {
+      type: "attempt-start";
+      surface: Exclude<EvalMode, "cli">;
+      caseId: string;
+      caseNumber: number;
+      totalCases: number;
+      attempt: number;
+      runs: number;
+    }
+  | {
+      type: "attempt-finish";
+      surface: Exclude<EvalMode, "cli">;
+      caseId: string;
+      caseNumber: number; // emitted as caseIndex + 1, i.e. 1-based
+      totalCases: number;
+      attempt: number;
+      runs: number;
+      passed: boolean;
+      durationMs: number;
+      judgeScore: number | null; // null e.g. when the attempt crashed
+      error: string | null; // error message, or null
+    }
+  | {
+      type: "assistant-message-start";
+      surface: Exclude<EvalMode, "cli">;
+      caseId: string;
+      caseNumber: number;
+      totalCases: number;
+      attempt: number;
+      runs: number;
+    }
+  | {
+      type: "assistant-chunk";
+      surface: Exclude<EvalMode, "cli">;
+      caseId: string;
+      caseNumber: number;
+      totalCases: number;
+      attempt: number;
+      runs: number;
+      chunk: string; // the only payload this variant adds to the shared case/attempt fields
+    }
+  | {
+      type: "assistant-message-end";
+      surface: Exclude<EvalMode, "cli">;
+      caseId: string;
+      caseNumber: number;
+      totalCases: number;
+      attempt: number;
+      runs: number;
+    };
diff --git a/ai_evals/core/validators.test.ts b/ai_evals/core/validators.test.ts
new file mode 100644
index 0000000000..93578f6c5f
--- /dev/null
+++ b/ai_evals/core/validators.test.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it } from "bun:test";
+import { validateScriptState } from "./validators";
+
+describe("validateScriptState", () => {
+  it("accepts semantically equivalent script implementations", () => {
+    const checks = validateScriptState({
+      actual: {
+        path: "f/evals/greet_user.ts",
+        lang: "bun",
+        code: "export async function main(name: string): Promise<string> {\n  return `Hello, ${name}!`;\n}\n",
+      },
+      expected: {
+        path: "f/evals/greet_user.ts",
+        lang: "bun",
+        code: "export async function main(name: string) {\n\treturn `Hello, ${name}!`\n}\n", // no return type, tab indent, no semicolon
+      },
+    });
+    // Only path and language are compared with `expected`; the code text itself is not.
+    expect(checks.every((check) => check.passed)).toBe(true);
+  });
+
+  it("still requires an exported main entrypoint", () => {
+    const checks = validateScriptState({
+      actual: {
+        path: "f/evals/greet_user.ts",
+        lang: "bun",
+        code: "async function main(name: string) {\n  return `Hello, ${name}!`;\n}\n", // `main` is declared but not exported
+      },
+    });
+    // The exact-shape match relies on this check carrying only `name` and `passed` (no details).
+    expect(checks).toContainEqual({
+      name: "script exports entrypoint",
+      passed: false,
+    });
+  });
+});
diff --git a/ai_evals/core/validators.ts b/ai_evals/core/validators.ts
new file mode 100644
index 0000000000..86ddc70566
--- /dev/null
+++ b/ai_evals/core/validators.ts
@@ -0,0 +1,997 @@
+import path from "node:path";
+import ts from "typescript";
+import type { BenchmarkCheck, FlowValidationSpec } from "./types";
+
+export interface ScriptState { // a script as seen by validateScriptState
+  path: string;
+  lang: string; // only ids listed in TS_LIKE_LANGUAGES get a syntax check
+  args?: Record<string, unknown>;
+  code: string;
+}
+
+export interface FlowState { // a flow definition as seen by validateFlowState
+  summary?: string;
+  value?: {
+    preprocessor_module?: Record<string, unknown>; // special module, kept outside `modules`
+    failure_module?: Record<string, unknown>; // special module, kept outside `modules`
+    modules?: Array<Record<string, unknown>>; // top-level steps, in order
+    [key: string]: unknown;
+  };
+  schema?: Record<string, unknown>; // walked through nested `properties` maps by hasSchemaPath
+}
+
+export interface AppFilesState { // an app's files as seen by validateAppState
+  frontend: Record<string, string>; // file path -> content; "/index.tsx" is the required entrypoint
+  backend: Record<string, AppRunnableState>; // runnable key -> definition; keys are matched against `backend.<key>(` calls
+}
+
+export interface AppRunnableState { // one backend runnable of an app
+  type?: string; // only "inline" runnables have their inlineScript checked
+  name?: string;
+  path?: string;
+  inlineScript?: {
+    language?: string; // treated as "" when missing
+    content?: string; // treated as "" when missing
+  };
+}
+
+const TS_LIKE_LANGUAGES = new Set(["bun", "deno", "nativets", "bunnative", "ts", "typescript"]); // language ids that get a TypeScript syntax check
+const CONTROL_FLOW_MODULE_TYPES = new Set(["branchone", "branchall", "forloopflow", "whileloopflow"]); // module types inspected for nested results references
+
+/**
+ * Static checks for a generated script: entrypoint, syntax, path/language parity with
+ * `expected`, and a change relative to `initial`. Code is never compared with `expected`.
+ */
+export function validateScriptState(input: {
+  actual: ScriptState;
+  initial?: ScriptState;
+  expected?: ScriptState;
+}): BenchmarkCheck[] {
+  const { actual, initial, expected } = input;
+  // Keep the diagnostics so a failing syntax check explains itself, as the app checks do.
+  const syntaxErrors = getScriptSyntaxErrors(actual.code, actual.lang);
+  const checks: BenchmarkCheck[] = [
+    check("script exports entrypoint", hasSupportedEntrypoint(actual.code)),
+    check("script has no syntax errors", syntaxErrors.length === 0, summarizeProblems(syntaxErrors)),
+  ];
+
+  if (expected) {
+    checks.push(
+      check(
+        "script path matches expected",
+        actual.path === expected.path,
+        `expected ${expected.path}, got ${actual.path}`
+      ),
+      check(
+        "script language matches expected",
+        actual.lang === expected.lang,
+        `expected ${expected.lang}, got ${actual.lang}`
+      )
+    );
+  }
+
+  if (initial) {
+    checks.push(check("script differs from initial", normalizeText(actual.code) !== normalizeText(initial.code)));
+  }
+
+  return checks;
+}
+
+/** Runs the structural flow checks; `initial`, `expected` and `validate` each add optional checks. */
+export function validateFlowState(input: {
+  actual: FlowState;
+  initial?: FlowState;
+  expected?: FlowState;
+  validate?: FlowValidationSpec;
+}): BenchmarkCheck[] {
+  const { actual, initial, expected, validate } = input;
+  const moduleCount = getFlowModules(actual).length;
+  const placeholderIds = getInlineScriptPlaceholderModuleIds(actual);
+  const hasPlaceholders = placeholderIds.length > 0;
+
+  const checks: BenchmarkCheck[] = [];
+  checks.push(check("flow has modules", moduleCount > 0));
+  checks.push(
+    check(
+      "flow has no inline placeholder code",
+      !hasPlaceholders,
+      hasPlaceholders ? `placeholder content in: ${placeholderIds.join(", ")}` : undefined
+    )
+  );
+
+  // Optional comparisons, appended in a fixed order: initial, expected, validate.
+  if (initial) {
+    const changed = normalizeJson(actual) !== normalizeJson(initial);
+    checks.push(check("flow differs from initial", changed));
+  }
+  if (expected) {
+    for (const structureCheck of validateFlowExpectedStructure(actual, expected)) {
+      checks.push(structureCheck);
+    }
+  }
+  if (validate) {
+    checks.push(...validateFlowRequirements(actual, validate));
+  }
+
+  return checks;
+}
+
+/**
+ * Static checks for a generated app.
+ *
+ * Always checked: the "/index.tsx" entrypoint, non-empty frontend content, syntax of frontend
+ * files and inline backend scripts, inline entrypoints, and `backend.<key>()` call resolution.
+ * `initial` adds a "differs from initial" check; `expected` adds per-file and per-runnable
+ * comparisons. A missing `frontend` or `backend` map is treated as empty instead of throwing.
+ */
+export function validateAppState(input: {
+  actual: AppFilesState;
+  initial?: AppFilesState;
+  expected?: AppFilesState;
+}): BenchmarkCheck[] {
+  // Default both maps once so every check below tolerates a partially populated state
+  // (previously only the entry listings were guarded, so the helpers still threw).
+  const frontend = input.actual.frontend ?? {};
+  const backend = input.actual.backend ?? {};
+  const frontendSyntaxProblems = getAppFrontendSyntaxProblems(frontend);
+  const backendSyntaxProblems = getAppBackendSyntaxProblems(backend);
+  const unresolvedBackendRefs = getUnresolvedBackendReferences(frontend, backend);
+
+  const checks: BenchmarkCheck[] = [
+    check("app has frontend entrypoint", Boolean(frontend["/index.tsx"])),
+    check(
+      "app has non-empty frontend files",
+      Object.values(frontend).some((content) => content.trim().length > 0)
+    ),
+    check(
+      "frontend files have no syntax errors",
+      frontendSyntaxProblems.length === 0,
+      summarizeProblems(frontendSyntaxProblems)
+    ),
+    check(
+      "backend inline scripts have entrypoints",
+      // Only inline runnables carry code; every other runnable type passes trivially.
+      Object.values(backend).every(
+        (runnable) =>
+          runnable.type !== "inline" || hasSupportedEntrypoint(runnable.inlineScript?.content ?? "")
+      )
+    ),
+    check(
+      "backend inline scripts have no syntax errors",
+      backendSyntaxProblems.length === 0,
+      summarizeProblems(backendSyntaxProblems)
+    ),
+    check(
+      "frontend backend references resolve",
+      unresolvedBackendRefs.length === 0,
+      summarizeProblems(unresolvedBackendRefs)
+    ),
+  ];
+
+  if (input.initial) {
+    checks.push(check("app differs from initial", !appStatesEqual(input.actual, input.initial)));
+  }
+
+  if (input.expected) {
+    // Expected frontend files must match after line-ending and outer-whitespace normalization.
+    for (const [filePath, content] of Object.entries(input.expected.frontend ?? {})) {
+      checks.push(
+        check(
+          `frontend includes ${filePath}`,
+          normalizeText(frontend[filePath] ?? "") === normalizeText(content)
+        )
+      );
+    }
+    // Expected runnables must exist; their code is compared only when the expectation has code.
+    for (const [runnableName, runnable] of Object.entries(input.expected.backend ?? {})) {
+      const actualRunnable = backend[runnableName];
+      checks.push(check(`backend includes ${runnableName}`, Boolean(actualRunnable)));
+      if (actualRunnable && runnable.inlineScript?.content) {
+        checks.push(
+          check(
+            `${runnableName} code matches expected`,
+            normalizeText(actualRunnable.inlineScript?.content ?? "") ===
+              normalizeText(runnable.inlineScript.content)
+          )
+        );
+      }
+    }
+  }
+
+  return checks;
+}
+
+/**
+ * Checks a CLI workspace snapshot against the expected files (presence, content snippets,
+ * no extra files) and, when `initialFiles` is given, that something changed.
+ */
+export function validateCliWorkspace(input: {
+  actualFiles: Record<string, string>;
+  expectedFiles?: Record<string, string>;
+  initialFiles?: Record<string, string>;
+}): BenchmarkCheck[] {
+  const { actualFiles, expectedFiles, initialFiles } = input;
+  const checks: BenchmarkCheck[] = [];
+
+  if (expectedFiles) {
+    const expectedPaths = Object.keys(expectedFiles);
+    for (const filePath of expectedPaths) {
+      const actualContent = actualFiles[filePath];
+      const exists = actualContent !== undefined;
+      checks.push(check(`creates ${filePath}`, exists));
+      if (exists) {
+        const matches = cliFileContainsExpectedContent(actualContent, expectedFiles[filePath]);
+        checks.push(check(`${filePath} contains expected content`, matches));
+      }
+    }
+
+    // Anything present in the workspace but absent from the expectation is reported.
+    const allowed = new Set(expectedPaths);
+    const unexpectedPaths = Object.keys(actualFiles).filter((filePath) => !allowed.has(filePath));
+    checks.push(
+      check("workspace contains no unexpected files", unexpectedPaths.length === 0, summarizeProblems(unexpectedPaths))
+    );
+  }
+
+  if (initialFiles) {
+    checks.push(check("workspace differs from initial", !fileMapsEqual(actualFiles, initialFiles)));
+  }
+
+  return checks;
+}
+
+/** True when every non-blank, trimmed line of `expectedContent` occurs somewhere in `actualContent`. */
+function cliFileContainsExpectedContent(actualContent: string, expectedContent: string): boolean {
+  const haystack = actualContent.replace(/\r\n/g, "\n");
+  // Lines are matched as independent substrings: their order and indentation are not compared.
+  for (const rawLine of expectedContent.replace(/\r\n/g, "\n").split("\n")) {
+    const snippet = rawLine.trim();
+    if (snippet.length === 0) {
+      continue;
+    }
+    if (!haystack.includes(snippet)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+function check(name: string, passed: boolean, details?: string): BenchmarkCheck {
+  return passed || !details ? { name, passed } : { name, passed, details }; // details are kept for failures only
+}
+
+function normalizeText(value: string): string {
+  return value.split("\r\n").join("\n").trim(); // CRLF -> LF, then strip outer whitespace
+}
+
+function normalizeJson(value: unknown): string { // object keys are sorted at every depth, so key order never affects equality
+  return JSON.stringify(value, (_key, v) => (isObjectRecord(v) ? Object.fromEntries(Object.keys(v).sort().map((k) => [k, v[k]])) : v));
+}
+
+/** Joins problems with "; ", truncating after `limit` entries; returns undefined for an empty list. */
+function summarizeProblems(problems: string[], limit = 5): string | undefined {
+  const total = problems.length;
+  if (total === 0) {
+    return undefined;
+  }
+
+  const shown = total <= limit ? problems : problems.slice(0, limit);
+  const joined = shown.join("; ");
+  return total <= limit ? joined : `${joined}; ...and ${total - limit} more`;
+}
+
+function hasSupportedEntrypoint(code: string): boolean {
+  // Accepted shapes: `export [async] function main(` or an anonymous `export default [async] function (`.
+  const namedMain = /export\s+(async\s+)?function\s+main\s*\(/;
+  const anonymousDefault = /export\s+default\s+(async\s+)?function\s*\(/;
+  return [namedMain, anonymousDefault].some((pattern) => pattern.test(code));
+}
+
+/**
+ * Syntax diagnostics for a script, or [] when `lang` is not in TS_LIKE_LANGUAGES
+ * (other languages are not parsed here, so they never report syntax errors).
+ */
+function getScriptSyntaxErrors(code: string, lang: string): string[] {
+  return TS_LIKE_LANGUAGES.has(lang) ? getTypeScriptSyntaxErrors(code, "eval.ts") : [];
+}
+
+/** Transpiles `code` in isolation and returns the flattened message of every diagnostic reported. */
+function getTypeScriptSyntaxErrors(code: string, fileName: string): string[] {
+  const compilerOptions: ts.CompilerOptions = {
+    target: ts.ScriptTarget.ES2022,
+    module: ts.ModuleKind.ESNext,
+    jsx: ts.JsxEmit.ReactJSX,
+  };
+  // NOTE(review): the extension of `fileName` presumably selects the parser (e.g. JSX for .tsx) — confirm.
+  const { diagnostics } = ts.transpileModule(code, { compilerOptions, reportDiagnostics: true, fileName });
+  const messages: string[] = [];
+  for (const diagnostic of diagnostics ?? []) {
+    messages.push(ts.flattenDiagnosticMessageText(diagnostic.messageText, "\n"));
+  }
+  return messages;
+}
+
+/**
+ * Collects one "<file>: <message>" entry per syntax diagnostic found in the frontend files.
+ * Files rejected by isFrontendCodeFile are not parsed at all.
+ * Entries keep the iteration order of `frontend`.
+ */
+function getAppFrontendSyntaxProblems(frontend: Record<string, string>): string[] {
+  const codeFiles = Object.entries(frontend).filter(([filePath]) =>
+    isFrontendCodeFile(filePath)
+  );
+
+  return codeFiles.flatMap(([filePath, content]) => {
+    // The file's own path is passed along as the file name used for parsing.
+    const messages = getTypeScriptSyntaxErrors(content, filePath);
+    return messages.map((message) => `${filePath}: ${message}`);
+  });
+}
+
+/**
+ * Collects one "<runnable key>: <message>" entry per syntax diagnostic in inline backend scripts.
+ * Runnables whose type is not "inline" are skipped; a missing language or content counts as "".
+ * Entries keep the iteration order of `backend`.
+ */
+function getAppBackendSyntaxProblems(backend: Record<string, AppRunnableState>): string[] {
+  return Object.entries(backend).flatMap(([key, runnable]) => {
+    if (runnable.type !== "inline") {
+      return [];
+    }
+
+    // An empty language is not in TS_LIKE_LANGUAGES, so such scripts yield no diagnostics.
+    const script = runnable.inlineScript;
+    const errors = getScriptSyntaxErrors(script?.content ?? "", script?.language ?? "");
+    return errors.map((error) => `${key}: ${error}`);
+  });
+}
+
+function isFrontendCodeFile(filePath: string): boolean {
+  // Case-insensitive extension match against the four extensions that get a syntax check.
+  return [".ts", ".tsx", ".js", ".jsx"].includes(path.extname(filePath).toLowerCase());
+}
+
+/**
+ * Lists "<file> references missing backend.<key>()" for each `backend.<key>(` call in the
+ * frontend sources whose key has no entry in `backend`. Duplicate messages are collapsed.
+ */
+function getUnresolvedBackendReferences(
+  frontend: Record<string, string>,
+  backend: Record<string, AppRunnableState>
+): string[] {
+  const knownKeys = new Set(Object.keys(backend));
+  const messages = Object.entries(frontend).flatMap(([filePath, content]) =>
+    extractBackendCallKeys(content)
+      .filter((key) => !knownKeys.has(key))
+      .map((key) => `${filePath} references missing backend.${key}()`)
+  );
+
+  return [...new Set(messages)];
+}
+
+function extractBackendCallKeys(content: string): string[] {
+  const callPattern = /\bbackend\.([A-Za-z_][A-Za-z0-9_]*)\s*\(/g; // captures <key> in `backend.<key>(`
+  return Array.from(new Set(Array.from(content.matchAll(callPattern), (match) => match[1])));
+}
+
+function getFlowModules(flow: FlowState): Array<Record<string, unknown>> {
+  return flow.value && Array.isArray(flow.value.modules) ? flow.value.modules : []; // top-level steps, or [] when absent
+}
+
+function validateFlowExpectedStructure(
+  actual: FlowState,
+  expected: FlowState
+): BenchmarkCheck[] {
+  const checks: BenchmarkCheck[] = [];
+  const expectedTopLevelModules = getFlowModules(expected);
+  const actualTopLevelModules = getFlowModules(actual);
+  // Schema: every top-level property of the expected schema must also exist in the actual schema.
+  const expectedSchemaFields = getTopLevelSchemaFields(expected.schema);
+  if (expectedSchemaFields.length > 0) {
+    checks.push(
+      check(
+        "flow schema includes expected top-level fields",
+        expectedSchemaFields.every((field) => hasSchemaPath(actual.schema, field)),
+        `missing one of: ${expectedSchemaFields.join(", ")}`
+      )
+    );
+  }
+  // Steps: top-level modules are compared by id; nested modules are not inspected here.
+  if (expectedTopLevelModules.length > 0) {
+    const actualIds = actualTopLevelModules
+      .map((module) => (typeof module.id === "string" ? module.id : null))
+      .filter((id): id is string => Boolean(id));
+    const expectedIds = expectedTopLevelModules
+      .map((module) => (typeof module.id === "string" ? module.id : null))
+      .filter((id): id is string => Boolean(id));
+
+    checks.push(
+      check(
+        "flow includes expected top-level step ids",
+        expectedIds.every((id) => actualIds.includes(id)),
+        `expected ids: ${expectedIds.join(", ")}; actual ids: ${actualIds.join(", ")}`
+      )
+    );
+
+    checks.push(
+      check(
+        "flow preserves expected top-level step order",
+        preservesRelativeOrder(actualIds, expectedIds),
+        `expected order: ${expectedIds.join(" -> ")}; actual ids: ${actualIds.join(" -> ")}`
+      )
+    );
+    // Per-step comparison, only for expected steps that have a string id and exist in the actual flow.
+    for (const expectedModule of expectedTopLevelModules) {
+      const moduleId = typeof expectedModule.id === "string" ? expectedModule.id : null;
+      if (!moduleId) {
+        continue;
+      }
+
+      const actualModule = actualTopLevelModules.find((module) => module.id === moduleId);
+      if (!actualModule) {
+        continue;
+      }
+      // The type comparison is skipped when either side has a suspend config.
+      const expectedType = getModuleType(expectedModule);
+      if (expectedType && !(hasSuspendConfig(expectedModule) || hasSuspendConfig(actualModule))) {
+        checks.push(
+          check(
+            `${moduleId} type matches expected`,
+            getModuleType(actualModule) === expectedType,
+            `expected ${expectedType}, got ${getModuleType(actualModule) ?? "(missing)"}`
+          )
+        );
+      }
+
+      const expectedPath = getModulePath(expectedModule);
+      if (expectedPath) {
+        checks.push(
+          check(
+            `${moduleId} path matches expected`,
+            getModulePath(actualModule) === expectedPath,
+            `expected ${expectedPath}, got ${getModulePath(actualModule) ?? "(missing)"}`
+          )
+        );
+      }
+    }
+  }
+  // Special modules are only checked when the expected flow defines them.
+  for (const specialModuleKey of ["preprocessor_module", "failure_module"] as const) {
+    const expectedSpecialModule = getSpecialFlowModule(expected, specialModuleKey);
+    if (!expectedSpecialModule) {
+      continue;
+    }
+
+    const actualSpecialModule = getSpecialFlowModule(actual, specialModuleKey);
+    checks.push(check(`${specialModuleKey} matches expected presence`, Boolean(actualSpecialModule)));
+
+    if (!actualSpecialModule) {
+      continue;
+    }
+
+    const expectedType = getModuleType(expectedSpecialModule);
+    if (expectedType) {
+      checks.push(
+        check(
+          `${specialModuleKey} type matches expected`,
+          getModuleType(actualSpecialModule) === expectedType,
+          `expected ${expectedType}, got ${getModuleType(actualSpecialModule) ?? "(missing)"}`
+        )
+      );
+    }
+  }
+
+  return checks;
+}
+
+function validateFlowRequirements(
+  flow: FlowState,
+  validate: FlowValidationSpec
+): BenchmarkCheck[] {
+  const checks: BenchmarkCheck[] = [];
+  // Every listed schema path gets its own check.
+  for (const requiredPath of validate.schemaRequiredPaths ?? []) {
+    checks.push(
+      check(
+        `schema includes ${requiredPath}`,
+        hasSchemaPath(flow.schema, requiredPath),
+        `missing schema path ${requiredPath}`
+      )
+    );
+  }
+  // Alternative shapes: passes when at least one variant has all of its paths present.
+  if (validate.schemaAnyOf && validate.schemaAnyOf.length > 0) {
+    const matchingVariant = validate.schemaAnyOf.find((variant) =>
+      variant.requiredPaths.every((requiredPath) => hasSchemaPath(flow.schema, requiredPath))
+    );
+
+    checks.push(
+      check(
+        "schema matches one accepted input shape",
+        Boolean(matchingVariant),
+        matchingVariant
+          ? undefined
+          : `expected one of: ${validate.schemaAnyOf
+              .map((variant) => `[${variant.requiredPaths.join(", ")}]`)
+              .join(" or ")}`
+      )
+    );
+  }
+  // Opt-in: report `results.<id>` references that collectUnresolvedResultsRefs could not resolve.
+  if (validate.resolveResultsRefs) {
+    const unresolved = collectUnresolvedResultsRefs(flow);
+    checks.push(
+      check(
+        "results references resolve",
+        unresolved.length === 0,
+        unresolved.length > 0 ? unresolved.join("; ") : undefined
+      )
+    );
+  }
+
+  for (const specialModule of validate.requireSpecialModules ?? []) {
+    checks.push(
+      check(
+        `${specialModule} exists`,
+        Boolean(getSpecialFlowModule(flow, specialModule))
+      )
+    );
+  }
+  // Suspend steps: the remaining checks for a step are skipped once existence or suspend config fails.
+  for (const suspendStep of validate.requireSuspendSteps ?? []) {
+    const module = findFlowModuleById(flow, suspendStep.id);
+    checks.push(check(`${suspendStep.id} step exists`, Boolean(module)));
+    if (!module) {
+      continue;
+    }
+
+    checks.push(check(`${suspendStep.id} includes suspend config`, hasSuspendConfig(module)));
+    if (!hasSuspendConfig(module)) {
+      continue;
+    }
+
+    if (suspendStep.requiredEvents !== undefined) {
+      checks.push(
+        check(
+          `${suspendStep.id} requires ${suspendStep.requiredEvents} approval event${suspendStep.requiredEvents === 1 ? "" : "s"}`,
+          getSuspendRequiredEvents(module) === suspendStep.requiredEvents,
+          `expected ${suspendStep.requiredEvents}, got ${getSuspendRequiredEvents(module) ?? "(missing)"}`
+        )
+      );
+    }
+
+    if (
+      suspendStep.resumeRequiredStringFieldAnyOf &&
+      suspendStep.resumeRequiredStringFieldAnyOf.length > 0
+    ) {
+      const stringFields = getSuspendResumeStringFields(module);
+      checks.push(
+        check(
+          `${suspendStep.id} resume form includes one accepted comment field`,
+          suspendStep.resumeRequiredStringFieldAnyOf.some((field) =>
+            stringFields.includes(field)
+          ),
+          `expected one of [${suspendStep.resumeRequiredStringFieldAnyOf.join(", ")}], got [${stringFields.join(", ")}]`
+        )
+      );
+    }
+  }
+
+  return checks;
+}
+
+/**
+ * True when `dottedPath` (e.g. "a.b") can be followed through nested `properties` maps of
+ * `schema`, with every step landing on an object. Empty segments are ignored; empty paths fail.
+ */
+function hasSchemaPath(schema: Record<string, unknown> | undefined, dottedPath: string): boolean {
+  const isObject = (candidate: unknown): candidate is Record<string, unknown> =>
+    Boolean(candidate) && typeof candidate === "object";
+  if (!isObject(schema)) {
+    return false;
+  }
+  const segments = dottedPath.split(".").filter((segment) => segment.length > 0);
+  if (segments.length === 0) {
+    return false;
+  }
+
+  let node: Record<string, unknown> = schema;
+  for (const segment of segments) {
+    const properties = node.properties;
+    const child = isObject(properties) ? properties[segment] : undefined;
+    if (!isObject(child)) {
+      return false;
+    }
+    node = child;
+  }
+  return true;
+}
+
+/**
+ * Names of the keys under `schema.properties`, excluding the empty string.
+ * Returns [] when the schema or its `properties` value is missing or not an object.
+ */
+function getTopLevelSchemaFields(schema: Record<string, unknown> | undefined): string[] {
+  const properties = schema && typeof schema === "object" ? schema.properties : undefined;
+  if (!properties || typeof properties !== "object") {
+    return [];
+  }
+
+  return Object.keys(properties).filter((key) => key !== "");
+}
+
+/**
+ * True when `expectedIds` occurs within `actualIds` as an in-order subsequence (other ids
+ * may be interleaved). An empty expectation is trivially satisfied.
+ */
+function preservesRelativeOrder(actualIds: string[], expectedIds: string[]): boolean {
+  let searchFrom = 0;
+
+  for (const expectedId of expectedIds) {
+    const foundAt = actualIds.indexOf(expectedId, searchFrom);
+    if (foundAt === -1) {
+      return false;
+    }
+    searchFrom = foundAt + 1; // the next expected id must come strictly after this match
+  }
+
+  return true;
+}
+
+function collectUnresolvedResultsRefs(flow: FlowState): string[] {
+  const problems = new Set<string>(); // distinct messages, in discovery order
+  validateModuleSequence(getFlowModules(flow), new Map(), problems); // the top level starts with no visible steps
+  return Array.from(problems);
+}
+
+function validateModuleSequence(
+  modules: Array<Record<string, unknown>>,
+  parentVisibleModules: Map<string, Record<string, unknown>>,
+  unresolved: Set<string>
+): void {
+  const visibleModules = new Map(parentVisibleModules);
+  // Working on a copy keeps ids registered below out of the caller's map.
+  for (const module of modules) {
+    validateResultsRefsInRecord(module, visibleModules, unresolved);
+    validateNestedModuleResultsRefs(module, visibleModules, unresolved);
+    // Registered only after its own checks, so a step never counts as visible to itself or its nested steps.
+    if (typeof module.id === "string" && module.id.length > 0) {
+      visibleModules.set(module.id, module);
+    }
+  }
+}
+
+function validateNestedModuleResultsRefs(
+  module: Record<string, unknown>,
+  visibleModules: Map<string, Record<string, unknown>>,
+  unresolved: Set<string>
+): void {
+  const value = isObjectRecord(module.value) ? module.value : null;
+  if (!value) {
+    return;
+  }
+  // Nested sequences are gathered first and validated together at the end.
+  const nestedSequences: Array<Array<Record<string, unknown>>> = [];
+
+  if (Array.isArray(value.modules)) {
+    nestedSequences.push(asModuleArray(value.modules));
+  }
+
+  if (Array.isArray(value.default)) {
+    nestedSequences.push(asModuleArray(value.default));
+  }
+  // Branch conditions are checked as expressions; branch bodies are queued like the other sequences.
+  if (Array.isArray(value.branches)) {
+    for (const branch of value.branches) {
+      if (!isObjectRecord(branch)) {
+        continue;
+      }
+      if (typeof branch.expr === "string") {
+        validateResultsRefsInExpression(
+          branch.expr,
+          `branch ${module.id ?? "(unnamed)"}`,
+          visibleModules,
+          unresolved
+        );
+      }
+      if (Array.isArray(branch.modules)) {
+        nestedSequences.push(asModuleArray(branch.modules));
+      }
+    }
+  }
+  // validateModuleSequence copies the map, so ids from one sequence are not visible to its siblings.
+  for (const sequence of nestedSequences) {
+    validateModuleSequence(sequence, visibleModules, unresolved);
+  }
+}
+
+function validateResultsRefsInRecord(
+  value: unknown,
+  visibleModules: Map<string, Record<string, unknown>>,
+  unresolved: Set<string>,
+  context = "expression"
+): void {
+  if (typeof value === "string") {
+    validateResultsRefsInExpression(value, context, visibleModules, unresolved);
+    return;
+  }
+  // Array elements inherit the current context label.
+  if (Array.isArray(value)) {
+    for (const entry of value) {
+      validateResultsRefsInRecord(entry, visibleModules, unresolved, context);
+    }
+    return;
+  }
+
+  if (!isObjectRecord(value)) {
+    return;
+  }
+  // These four keys are not descended into here; every other entry is scanned with its key as the context.
+  for (const [key, entry] of Object.entries(value)) {
+    if (key === "content" || key === "modules" || key === "branches" || key === "default") {
+      continue;
+    }
+    validateResultsRefsInRecord(entry, visibleModules, unresolved, key);
+  }
+}
+
+function validateResultsRefsInExpression(
+  expression: string,
+  context: string,
+  visibleModules: Map<string, Record<string, unknown>>,
+  unresolved: Set<string>
+): void {
+  for (const { root, path: segments } of extractResultsRefs(expression)) {
+    const target = visibleModules.get(root);
+    if (target) { // known step: only its nested-path usage still needs checking
+      validateNestedResultsRefPath(root, segments, target, context, unresolved);
+    } else { // unknown step id: record it, labelled with where the expression came from
+      unresolved.add(`${context} references missing results.${root}`);
+    }
+  }
+}
+
+/**
+ * Finds every `results.<root>[.<segment>...]` reference in an expression and returns the
+ * distinct (root, path) pairs in first-seen order.
+ */
+function extractResultsRefs(
+  expression: string
+): Array<{ root: string; path: string[] }> {
+  const pattern = /\bresults\.([A-Za-z0-9_-]+)((?:\.[A-Za-z0-9_-]+)*)/g;
+  const byKey = new Map<string, { root: string; path: string[] }>();
+
+  for (const [, root, trailing] of expression.matchAll(pattern)) {
+    const path = trailing.split(".").filter((segment) => segment !== "");
+    byKey.set(`${root}:${path.join(".")}`, { root, path });
+  }
+
+  return Array.from(byKey.values());
+}
+
+function validateNestedResultsRefPath(
+  rootId: string,
+  path: string[],
+  module: Record<string, unknown>,
+  context: string,
+  unresolved: Set<string>
+): void {
+  if (path.length === 0) {
+    return;
+  }
+  // Only modules whose type is in CONTROL_FLOW_MODULE_TYPES are inspected further.
+  const moduleType = getModuleType(module);
+  if (!moduleType || !CONTROL_FLOW_MODULE_TYPES.has(moduleType)) {
+    return;
+  }
+  // A first path segment that names a directly nested step is recorded as unresolved.
+  const nestedIds = new Set(getImmediateNestedModuleIds(module));
+  const [firstSegment] = path;
+  if (nestedIds.has(firstSegment)) {
+    unresolved.add(
+      `${context} references nested results.${rootId}.${firstSegment} inside ${moduleType} ${rootId}`
+    );
+  }
+}
+
+/**
+ * Flattens the flow into one list: each special module (preprocessor, then failure) followed
+ * by its nested modules, then every top-level module followed by its nested modules.
+ */
+function getAllFlowModules(flow: FlowState): Array<Record<string, unknown>> {
+  // Pairs a module with everything nested beneath it.
+  const withDescendants = (module: Record<string, unknown>): Array<Record<string, unknown>> => [
+    module,
+    ...collectNestedModules(module),
+  ];
+
+  const specialModules = (["preprocessor_module", "failure_module"] as const).flatMap((key) => {
+    const specialModule = getSpecialFlowModule(flow, key);
+    return specialModule ? withDescendants(specialModule) : [];
+  });
+  const regularModules = getFlowModules(flow).flatMap((module) => withDescendants(module));
+
+  return [...specialModules, ...regularModules];
+}
+
+/**
+ * Depth-first list of every module nested under `module.value`: `modules`, then `default`,
+ * then each branch's `modules`. Every child is immediately followed by its own descendants.
+ * Returns [] when `module.value` is not an object.
+ */
+function collectNestedModules(module: Record<string, unknown>): Array<Record<string, unknown>> {
+  const value = isObjectRecord(module.value) ? module.value : null;
+  if (!value) {
+    return [];
+  }
+
+  // Gather the child sequences first, in the order they must appear in the output.
+  const sequences: Array<Array<Record<string, unknown>>> = [];
+  for (const key of ["modules", "default"] as const) {
+    if (Array.isArray(value[key])) {
+      sequences.push(asModuleArray(value[key]));
+    }
+  }
+  // Branch entries that are not objects, or that lack a `modules` array, contribute nothing.
+  if (Array.isArray(value.branches)) {
+    for (const branch of value.branches) {
+      if (isObjectRecord(branch) && Array.isArray(branch.modules)) {
+        sequences.push(asModuleArray(branch.modules));
+      }
+    }
+  }
+
+  // Expand each child in place so its descendants directly follow it.
+  return sequences.flatMap((sequence) =>
+    sequence.flatMap((child) => [child, ...collectNestedModules(child)])
+  );
+}
+
+/**
+ * Returns the first module whose `id` strictly equals `id`, or null when there is none.
+ * The search covers everything getAllFlowModules yields: special modules, top-level modules
+ * and their nested modules.
+ */
+function findFlowModuleById(flow: FlowState, id: string): Record<string, unknown> | null {
+  return getAllFlowModules(flow).find((module) => module.id === id) ?? null;
+}
+
+/**
+ * Lists ids of modules whose trimmed `value.content` is exactly `inline_script.<name>`;
+ * a matching module without a non-empty string id is reported as "(unnamed)".
+ */
+function getInlineScriptPlaceholderModuleIds(flow: FlowState): string[] {
+  const ids: string[] = [];
+  for (const module of getAllFlowModules(flow)) {
+    const code = getModuleCode(module)?.trim();
+    if (code && /^inline_script\.[A-Za-z0-9_-]+$/.test(code)) {
+      ids.push(typeof module.id === "string" && module.id.length > 0 ? module.id : "(unnamed)");
+    }
+  }
+  return ids;
+}
+
+/**
+ * Returns the string ids of the modules directly nested in `module` (one level only).
+ *
+ * Looks at `value.modules`, `value.default` and the `modules` array of each entry in
+ * `value.branches`, in that order; children without a string `id` are skipped.
+ */
+function getImmediateNestedModuleIds(module: Record<string, unknown>): string[] {
+  const container = module.value;
+  if (!isObjectRecord(container)) {
+    return [];
+  }
+
+  const children: Array<Record<string, unknown>> = [];
+  for (const list of [container.modules, container.default]) {
+    if (Array.isArray(list)) {
+      children.push(...asModuleArray(list));
+    }
+  }
+  if (Array.isArray(container.branches)) {
+    for (const branch of container.branches) {
+      if (isObjectRecord(branch) && Array.isArray(branch.modules)) {
+        children.push(...asModuleArray(branch.modules));
+      }
+    }
+  }
+
+  return children.flatMap((child) => (typeof child.id === "string" ? [child.id] : []));
+}
+
+/** Returns `module.value.content` when it is a string, otherwise null. */
+function getModuleCode(module: Record<string, unknown>): string | null {
+  return isObjectRecord(module.value) && typeof module.value.content === "string" ? module.value.content : null;
+}
+
+function asModuleArray(value: unknown[]): Array<Record<string, unknown>> { // keeps only plain-object entries
+  return value.filter((entry): entry is Record<string, unknown> => isObjectRecord(entry));
+}
+
+function isObjectRecord(value: unknown): value is Record<string, any> { // non-null object that is not an array
+  return value !== null && typeof value === "object" && !Array.isArray(value);
+}
+
+/** Reads `flow.value[key]`; returns it only when it is a plain (non-array) object, else null. */
+function getSpecialFlowModule(
+  flow: FlowState,
+  key: "preprocessor_module" | "failure_module"
+): Record<string, unknown> | null {
+  // Use the same guard as the other helpers so an array is never mistaken for a module.
+  const container: unknown = flow.value;
+  const candidate: unknown = isObjectRecord(container) ? container[key] : null;
+  return isObjectRecord(candidate) ? candidate : null;
+}
+
+/** Returns `module.value.type` when it is a string; null for anything else. */
+function getModuleType(module: Record<string, unknown>): string | null {
+  const value = module.value;
+  if (typeof value !== "object" || value === null) {
+    return null;
+  }
+  const { type } = value as Record<string, unknown>;
+  return typeof type === "string" ? type : null;
+}
+
+/** Returns `module.value.path` when it is a string; null for anything else. */
+function getModulePath(module: Record<string, unknown>): string | null {
+  const value = module.value;
+  if (typeof value !== "object" || value === null) {
+    return null;
+  }
+
+  const { path } = value as Record<string, unknown>;
+  return typeof path === "string" ? path : null;
+}
+
+function hasSuspendConfig(module: Record<string, unknown>): boolean { // any non-null object counts, arrays included
+  return module.suspend !== null && typeof module.suspend === "object";
+}
+
+function getSuspendRequiredEvents(module: Record<string, unknown>): number | null { // null unless a number is set
+  const requiredEvents: unknown = isObjectRecord(module.suspend) ? module.suspend.required_events : undefined;
+  return typeof requiredEvents === "number" ? requiredEvents : null;
+}
+
+/**
+ * Lists the field names under `suspend.resume_form.schema.properties` whose definition
+ * is an object with `type: "string"`; returns [] if any level of that path is missing.
+ */
+function getSuspendResumeStringFields(module: Record<string, unknown>): string[] {
+  let node: unknown = module.suspend;
+  for (const key of ["resume_form", "schema", "properties"]) {
+    node = isObjectRecord(node) ? node[key] : undefined;
+  }
+  if (!isObjectRecord(node)) {
+    return [];
+  }
+  return Object.entries(node)
+    .filter(([, property]) => isObjectRecord(property) && property.type === "string")
+    .map(([field]) => field);
+}
+
+function appStatesEqual(left: AppFilesState, right: AppFilesState): boolean { // frontend files first, then backend runnables as JSON text
+  return fileMapsEqual(left.frontend, right.frontend) && fileMapsEqual(stringifyBackend(left.backend), stringifyBackend(right.backend));
+}
+
+/** Builds a new map with the same keys as `backend`, each value replaced by its JSON.stringify text. */
+function stringifyBackend(backend: Record<string, AppRunnableState>): Record<string, string> {
+  const serialized: Record<string, string> = {};
+  Object.keys(backend).forEach((name) => {
+    serialized[name] = JSON.stringify(backend[name]);
+  });
+  return serialized;
+}
+
+/** True when both maps have the same keys and, per key, equal content after normalizeText. */
+function fileMapsEqual(left: Record<string, string>, right: Record<string, string>): boolean {
+  const leftKeys = Object.keys(left);
+  if (leftKeys.length !== Object.keys(right).length) {
+    return false;
+  }
+  // Exact key lookup: no dependence on locale sort order (or its ties), and no sorting cost.
+  return leftKeys.every(
+    (key) => Object.prototype.hasOwnProperty.call(right, key) && normalizeText(left[key]) === normalizeText(right[key])
+  );
+}
diff --git a/ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml b/ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts b/ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/flow.yaml b/ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/flow.yaml
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/hello.ts b/ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/hello.ts
new file mode 100644
index 0000000000..8ce89048c9
--- /dev/null
+++ b/ai_evals/fixtures/cli/expected/bun-hello-flow/f/evals/hello__flow/hello.ts
@@ -0,0 +1,2 @@
+main(name: string)
+greeting: `Hello, ${name}!`
diff --git a/ai_evals/fixtures/cli/expected/bun-hello-script-uppercase/f/evals/hello.ts b/ai_evals/fixtures/cli/expected/bun-hello-script-uppercase/f/evals/hello.ts
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/ai_evals/fixtures/cli/expected/bun-hello-script/f/evals/hello.ts b/ai_evals/fixtures/cli/expected/bun-hello-script/f/evals/hello.ts
new file mode 100644
index 0000000000..e489a7b3fc
--- /dev/null
+++ b/ai_evals/fixtures/cli/expected/bun-hello-script/f/evals/hello.ts
@@ -0,0 +1,3 @@
+export async function main(name: string) {
+  return { greeting: `Hello, ${name}!` };
+}
diff --git a/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/evals/reuse_greeting__flow/flow.yaml b/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/evals/reuse_greeting__flow/flow.yaml
new file mode 100644
index 0000000000..b24c7ba77a
--- /dev/null
+++ b/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/evals/reuse_greeting__flow/flow.yaml
@@ -0,0 +1,2 @@
+type: script
+path: f/lib/format_greeting
diff --git a/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/lib/format_greeting.ts b/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/lib/format_greeting.ts
new file mode 100644
index 0000000000..e489a7b3fc
--- /dev/null
+++ b/ai_evals/fixtures/cli/expected/flow-reuse-existing-script/f/lib/format_greeting.ts
@@ -0,0 +1,3 @@
+export async function main(name: string) {
+  return { greeting: `Hello, ${name}!` };
+}
diff --git a/ai_evals/fixtures/cli/expected/python-add-numbers-script/f/evals/add_numbers.py b/ai_evals/fixtures/cli/expected/python-add-numbers-script/f/evals/add_numbers.py
new file mode 100644
index 0000000000..cbf4ed11cb
--- /dev/null
+++ b/ai_evals/fixtures/cli/expected/python-add-numbers-script/f/evals/add_numbers.py
@@ -0,0 +1,2 @@
+def main(
+return {"total": a + b}
diff --git a/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml b/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml
new file mode 100644
index 0000000000..65a93ca42a
--- /dev/null
+++ b/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/flow.yaml
@@ -0,0 +1,20 @@
+summary: Simple greeting flow
+schema:
+  type: object
+  properties:
+    name:
+      type: string
+      description: Name to greet
+  required:
+    - name
+value:
+  modules:
+    - id: hello_step
+      value:
+        type: rawscript
+        language: bun
+        content: !inline hello.ts
+        input_transforms:
+          name:
+            type: javascript
+            expr: flow_input.name
diff --git a/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts b/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts
new file mode 100644
index 0000000000..e489a7b3fc
--- /dev/null
+++ b/ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation/f/evals/hello__flow/hello.ts
@@ -0,0 +1,3 @@
+export async function main(name: string) {
+  return { greeting: `Hello, ${name}!` };
+}
diff --git a/ai_evals/fixtures/cli/initial/bun-hello-script-uppercase/f/evals/hello.ts b/ai_evals/fixtures/cli/initial/bun-hello-script-uppercase/f/evals/hello.ts
new file mode 100644
index 0000000000..e489a7b3fc
--- /dev/null
+++ b/ai_evals/fixtures/cli/initial/bun-hello-script-uppercase/f/evals/hello.ts
@@ -0,0 +1,3 @@
+export async function main(name: string) {
+  return { greeting: `Hello, ${name}!` };
+}
diff --git a/ai_evals/fixtures/cli/initial/flow-reuse-existing-script/f/lib/format_greeting.ts b/ai_evals/fixtures/cli/initial/flow-reuse-existing-script/f/lib/format_greeting.ts
new file mode 100644
index 0000000000..e489a7b3fc
--- /dev/null
+++ b/ai_evals/fixtures/cli/initial/flow-reuse-existing-script/f/lib/format_greeting.ts
@@ -0,0 +1,3 @@
+export async function main(name: string) {
+  return { greeting: `Hello, ${name}!` };
+}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/createFolder/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/createFolder/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/createFolder/main.ts
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/createFolder/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/createFolder/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/createFolder/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/createFolder/meta.json
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/createFolder/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/deleteItem/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/deleteItem/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/deleteItem/main.ts
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/deleteItem/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/deleteItem/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/deleteItem/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/deleteItem/meta.json
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/deleteItem/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFiles/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFiles/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFiles/main.ts
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFiles/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFiles/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFiles/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFiles/meta.json
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFiles/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFolders/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFolders/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFolders/main.ts
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFolders/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFolders/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFolders/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/listFolders/meta.json
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/listFolders/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/moveItem/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/moveItem/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/moveItem/main.ts
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/moveItem/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/moveItem/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/moveItem/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/moveItem/meta.json
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/moveItem/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/renameItem/main.ts b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/renameItem/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/renameItem/main.ts
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/renameItem/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/renameItem/meta.json b/ai_evals/fixtures/frontend/app/initial/file_manager/backend/renameItem/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/backend/renameItem/meta.json
rename to ai_evals/fixtures/frontend/app/initial/file_manager/backend/renameItem/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/Breadcrumb.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/Breadcrumb.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/Breadcrumb.tsx
rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/Breadcrumb.tsx
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FileItem.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FileItem.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FileItem.tsx
rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FileItem.tsx
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FileList.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FileList.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FileList.tsx
rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FileList.tsx
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FolderTree.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FolderTree.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/FolderTree.tsx
rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/FolderTree.tsx
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/Toolbar.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/Toolbar.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/components/Toolbar.tsx
rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/components/Toolbar.tsx
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/index.tsx b/ai_evals/fixtures/frontend/app/initial/file_manager/frontend/index.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/file_manager/frontend/index.tsx
rename to ai_evals/fixtures/frontend/app/initial/file_manager/frontend/index.tsx
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/addToCart/main.ts b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/addToCart/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/addToCart/main.ts
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/addToCart/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/addToCart/meta.json b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/addToCart/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/addToCart/meta.json
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/addToCart/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/calculateTotal/main.ts b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/calculateTotal/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/calculateTotal/main.ts
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/calculateTotal/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/calculateTotal/meta.json b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/calculateTotal/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/calculateTotal/meta.json
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/calculateTotal/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/getProducts/main.ts b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/getProducts/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/getProducts/main.ts
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/getProducts/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/getProducts/meta.json b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/getProducts/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/getProducts/meta.json
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/getProducts/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/removeFromCart/main.ts b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/removeFromCart/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/removeFromCart/main.ts
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/removeFromCart/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/removeFromCart/meta.json b/ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/removeFromCart/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/backend/removeFromCart/meta.json
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/backend/removeFromCart/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/Cart.tsx b/ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/Cart.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/Cart.tsx
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/Cart.tsx
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/ProductCard.tsx b/ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/ProductCard.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/ProductCard.tsx
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/ProductCard.tsx
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/ProductList.tsx b/ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/ProductList.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/components/ProductList.tsx
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/components/ProductList.tsx
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/index.tsx b/ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/index.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/shopping_cart/frontend/index.tsx
rename to ai_evals/fixtures/frontend/app/initial/shopping_cart/frontend/index.tsx
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/decrementCounter/main.ts b/ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/decrementCounter/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/decrementCounter/main.ts
rename to ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/decrementCounter/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/decrementCounter/meta.json b/ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/decrementCounter/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/decrementCounter/meta.json
rename to ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/decrementCounter/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/incrementCounter/main.ts b/ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/incrementCounter/main.ts
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/incrementCounter/main.ts
rename to ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/incrementCounter/main.ts
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/incrementCounter/meta.json b/ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/incrementCounter/meta.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/backend/incrementCounter/meta.json
rename to ai_evals/fixtures/frontend/app/initial/test1_counter_app/backend/incrementCounter/meta.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/frontend/index.tsx b/ai_evals/fixtures/frontend/app/initial/test1_counter_app/frontend/index.tsx
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/app/initial/test1_counter_app/frontend/index.tsx
rename to ai_evals/fixtures/frontend/app/initial/test1_counter_app/frontend/index.tsx
diff --git a/ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json b/ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json
new file mode 100644
index 0000000000..30b2133004
--- /dev/null
+++ b/ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json
@@ -0,0 +1,31 @@
+{
+  "summary": "",
+  "value": {
+    "modules": [
+      {
+        "id": "sum_numbers",
+        "value": {
+          "type": "rawscript",
+          "language": "bun",
+          "content": "export async function main(a: number, b: number) {\n  return a + b;\n}",
+          "input_transforms": {
+            "a": {
+              "type": "javascript",
+              "expr": "flow_input.a"
+            },
+            "b": {
+              "type": "javascript",
+              "expr": "flow_input.b"
+            }
+          }
+        }
+      }
+    ]
+  },
+  "schema": {
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "properties": {},
+    "required": [],
+    "type": "object"
+  }
+}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test1.json b/ai_evals/fixtures/frontend/flow/expected/test1.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test1.json
rename to ai_evals/fixtures/frontend/flow/expected/test1.json
diff --git a/ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json b/ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json
new file mode 100644
index 0000000000..328cf34651
--- /dev/null
+++ b/ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json
@@ -0,0 +1,30 @@
+{
+  "value": {
+    "modules": [
+      {
+        "id": "count_until_target",
+        "value": {
+          "type": "whileloopflow"
+        }
+      },
+      {
+        "id": "return_final_count",
+        "value": {
+          "type": "rawscript"
+        }
+      }
+    ]
+  },
+  "schema": {
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "type": "object",
+    "properties": {
+      "target": {
+        "type": "number"
+      }
+    },
+    "required": [
+      "target"
+    ]
+  }
+}
diff --git a/ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json b/ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json
new file mode 100644
index 0000000000..c1a79f269d
--- /dev/null
+++ b/ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json
@@ -0,0 +1,36 @@
+{
+  "value": {
+    "preprocessor_module": {
+      "id": "preprocessor",
+      "value": {
+        "type": "rawscript"
+      }
+    },
+    "failure_module": {
+      "id": "failure",
+      "value": {
+        "type": "rawscript"
+      }
+    },
+    "modules": [
+      {
+        "id": "process_event",
+        "value": {
+          "type": "rawscript"
+        }
+      }
+    ]
+  },
+  "schema": {
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "type": "object",
+    "properties": {
+      "payload": {
+        "type": "string"
+      }
+    },
+    "required": [
+      "payload"
+    ]
+  }
+}
diff --git a/ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json b/ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json
new file mode 100644
index 0000000000..5970c3ef1c
--- /dev/null
+++ b/ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json
@@ -0,0 +1,44 @@
+{
+  "value": {
+    "modules": [
+      {
+        "id": "request_approval",
+        "suspend": {
+          "required_events": 1,
+          "resume_form": {
+            "schema": {
+              "approver_comment": {
+                "type": "string"
+              }
+            }
+          }
+        },
+        "value": {
+          "type": "rawscript"
+        }
+      },
+      {
+        "id": "finalize_purchase",
+        "value": {
+          "type": "rawscript"
+        }
+      }
+    ]
+  },
+  "schema": {
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "type": "object",
+    "properties": {
+      "requester_email": {
+        "type": "string"
+      },
+      "amount": {
+        "type": "number"
+      }
+    },
+    "required": [
+      "requester_email",
+      "amount"
+    ]
+  }
+}
diff --git a/ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json b/ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json
new file mode 100644
index 0000000000..f5ab58c476
--- /dev/null
+++ b/ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json
@@ -0,0 +1,39 @@
+{
+  "value": {
+    "modules": [
+      {
+        "id": "sum_numbers",
+        "value": {
+          "type": "script",
+          "path": "f/evals/add_two_numbers.ts",
+          "input_transforms": {
+            "a": {
+              "type": "javascript",
+              "expr": "flow_input.a"
+            },
+            "b": {
+              "type": "javascript",
+              "expr": "flow_input.b"
+            }
+          }
+        }
+      }
+    ]
+  },
+  "schema": {
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "type": "object",
+    "properties": {
+      "a": {
+        "type": "number"
+      },
+      "b": {
+        "type": "number"
+      }
+    },
+    "required": [
+      "a",
+      "b"
+    ]
+  }
+}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test2.json b/ai_evals/fixtures/frontend/flow/expected/test2.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test2.json
rename to ai_evals/fixtures/frontend/flow/expected/test2.json
diff --git a/ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json b/ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json
new file mode 100644
index 0000000000..33021252a9
--- /dev/null
+++ b/ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json
@@ -0,0 +1,39 @@
+{
+  "value": {
+    "modules": [
+      {
+        "id": "call_add_numbers",
+        "value": {
+          "type": "flow",
+          "path": "f/evals/add_numbers_flow",
+          "input_transforms": {
+            "a": {
+              "type": "javascript",
+              "expr": "flow_input.a"
+            },
+            "b": {
+              "type": "javascript",
+              "expr": "flow_input.b"
+            }
+          }
+        }
+      }
+    ]
+  },
+  "schema": {
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "type": "object",
+    "properties": {
+      "a": {
+        "type": "number"
+      },
+      "b": {
+        "type": "number"
+      }
+    },
+    "required": [
+      "a",
+      "b"
+    ]
+  }
+}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test3.json b/ai_evals/fixtures/frontend/flow/expected/test3.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test3.json
rename to ai_evals/fixtures/frontend/flow/expected/test3.json
diff --git a/ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json b/ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json
new file mode 100644
index 0000000000..912919a435
--- /dev/null
+++ b/ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json
@@ -0,0 +1,24 @@
+{
+  "value": {
+    "modules": [
+      {
+        "id": "route_by_tier",
+        "value": {
+          "type": "branchone"
+        }
+      }
+    ]
+  },
+  "schema": {
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "type": "object",
+    "properties": {
+      "tier": {
+        "type": "string"
+      }
+    },
+    "required": [
+      "tier"
+    ]
+  }
+}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test4.json b/ai_evals/fixtures/frontend/flow/expected/test4.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test4.json
rename to ai_evals/fixtures/frontend/flow/expected/test4.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test5_modify_simple.json b/ai_evals/fixtures/frontend/flow/expected/test5_modify_simple.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test5_modify_simple.json
rename to ai_evals/fixtures/frontend/flow/expected/test5_modify_simple.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test6_modify_medium.json b/ai_evals/fixtures/frontend/flow/expected/test6_modify_medium.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test6_modify_medium.json
rename to ai_evals/fixtures/frontend/flow/expected/test6_modify_medium.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test7_modify_complex.json b/ai_evals/fixtures/frontend/flow/expected/test7_modify_complex.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/flow/expected/test7_modify_complex.json
rename to ai_evals/fixtures/frontend/flow/expected/test7_modify_complex.json
diff --git a/ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json b/ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json
new file mode 100644
index 0000000000..6540a36c26
--- /dev/null
+++ b/ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json
@@ -0,0 +1,29 @@
+{
+  "workspace": {
+    "scripts": [
+      {
+        "path": "f/evals/add_two_numbers.ts",
+        "summary": "Add two numbers",
+        "description": "Returns the sum of two numeric inputs.",
+        "language": "bun",
+        "schema": {
+          "$schema": "https://json-schema.org/draft/2020-12/schema",
+          "type": "object",
+          "properties": {
+            "a": {
+              "type": "number"
+            },
+            "b": {
+              "type": "number"
+            }
+          },
+          "required": [
+            "a",
+            "b"
+          ]
+        },
+        "content": "export async function main(a: number, b: number) {\n  return a + b;\n}\n"
+      }
+    ]
+  }
+}
diff --git a/ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json b/ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json
new file mode 100644
index 0000000000..03a29c28e4
--- /dev/null
+++ b/ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json
@@ -0,0 +1,49 @@
+{
+  "workspace": {
+    "flows": [
+      {
+        "path": "f/evals/add_numbers_flow",
+        "summary": "Add two numbers in a subflow",
+        "description": "Takes two numeric inputs and returns their sum.",
+        "schema": {
+          "$schema": "https://json-schema.org/draft/2020-12/schema",
+          "type": "object",
+          "properties": {
+            "a": {
+              "type": "number"
+            },
+            "b": {
+              "type": "number"
+            }
+          },
+          "required": [
+            "a",
+            "b"
+          ]
+        },
+        "value": {
+          "modules": [
+            {
+              "id": "sum_numbers",
+              "value": {
+                "type": "rawscript",
+                "language": "bun",
+                "content": "export async function main(a: number, b: number) {\n  return a + b;\n}",
+                "input_transforms": {
+                  "a": {
+                    "type": "javascript",
+                    "expr": "flow_input.a"
+                  },
+                  "b": {
+                    "type": "javascript",
+                    "expr": "flow_input.b"
+                  }
+                }
+              }
+            }
+          ]
+        }
+      }
+    ]
+  }
+}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test5_initial.json b/ai_evals/fixtures/frontend/flow/initial/test5_initial.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test5_initial.json
rename to ai_evals/fixtures/frontend/flow/initial/test5_initial.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test6_initial.json b/ai_evals/fixtures/frontend/flow/initial/test6_initial.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test6_initial.json
rename to ai_evals/fixtures/frontend/flow/initial/test6_initial.json
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test7_initial.json b/ai_evals/fixtures/frontend/flow/initial/test7_initial.json
similarity index 100%
rename from frontend/src/lib/components/copilot/chat/__tests__/flow/initial/test7_initial.json
rename to ai_evals/fixtures/frontend/flow/initial/test7_initial.json
diff --git a/ai_evals/fixtures/frontend/script/expected/test1_greet_user.json b/ai_evals/fixtures/frontend/script/expected/test1_greet_user.json
new file mode 100644
index 0000000000..b6cd4c7395
--- /dev/null
+++ b/ai_evals/fixtures/frontend/script/expected/test1_greet_user.json
@@ -0,0 +1,8 @@
+{
+  "path": "f/evals/greet_user.ts",
+  "lang": "bun",
+  "args": {
+    "name": "Alice"
+  },
+  "code": "export async function main(name: string) {\n\treturn `Hello, ${name}!`\n}\n"
+}
diff --git a/ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json b/ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json
new file mode 100644
index 0000000000..f1e1e90df7
--- /dev/null
+++ b/ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json
@@ -0,0 +1,8 @@
+{
+  "path": "f/evals/greet_user.ts",
+  "lang": "bun",
+  "args": {
+    "name": "Alice"
+  },
+  "code": "export async function main(name: string) {\n\treturn ''\n}\n"
+}
diff --git a/ai_evals/history/app.jsonl b/ai_evals/history/app.jsonl
new file mode 100644
index 0000000000..3f174ec671
--- /dev/null
+++ b/ai_evals/history/app.jsonl
@@ -0,0 +1,3 @@
+{"createdAt":"2026-04-10T14:24:42.248Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"app","runs":1,"runModel":"anthropic:claude-haiku-4-5-20251001","judgeModel":"claude-sonnet-4-6","caseCount":9,"attemptCount":9,"passedAttempts":7,"passRate":0.7777777777777778,"averageDurationMs":25680.777777777777,"averageJudgeScore":76.55555555555556,"averageTokenUsagePerAttempt":{"prompt":53989.22222222222,"completion":2629.222222222222,"total":56618.444444444445},"failedCaseIds":["app-test8-inventory-tracker-create","app-test9-recipe-book-create"],"cases":[{"id":"app-test1-counter-create","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":11071,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":17912,"completion":1079,"total":18991}},{"id":"app-test2-counter-reset","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":12121,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":19088,"completion":833,"total":19921}},{"id":"app-test3-shopping-cart-quantity","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":25852,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":58834,"completion":2446,"total":61280}},{"id":"app-test4-shopping-cart-discount","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":42350,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":90882,"completion":4984,"total":95866}},{"id":"app-test5-file-manager-search","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29129,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":81980,"completion":2817,"total":84797}},{"id":"app-test6-file-manager-inline-rename","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":51576,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":110023,"completion":6328,"total":116351}},{"id":"app-test7-file-manager-select-all","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":39256,"averageJudgeSc
ore":97,"averageTokenUsagePerAttempt":{"prompt":72006,"completion":4188,"total":76194}},{"id":"app-test8-inventory-tracker-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":10514,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":17600,"completion":511,"total":18111}},{"id":"app-test9-recipe-book-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":9258,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":17578,"completion":477,"total":18055}}]}
+{"createdAt":"2026-04-10T14:27:49.271Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"app","runs":1,"runModel":"anthropic:claude-opus-4-6","judgeModel":"claude-sonnet-4-6","caseCount":9,"attemptCount":9,"passedAttempts":6,"passRate":0.6666666666666666,"averageDurationMs":57285.666666666664,"averageJudgeScore":82.55555555555556,"averageTokenUsagePerAttempt":{"prompt":54435.77777777778,"completion":3668.6666666666665,"total":58104.444444444445},"failedCaseIds":["app-test7-file-manager-select-all","app-test8-inventory-tracker-create","app-test9-recipe-book-create"],"cases":[{"id":"app-test1-counter-create","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":17930,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":17620,"completion":743,"total":18363}},{"id":"app-test2-counter-reset","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":17852,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":18887,"completion":701,"total":19588}},{"id":"app-test3-shopping-cart-quantity","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":43501,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":38855,"completion":2692,"total":41547}},{"id":"app-test4-shopping-cart-discount","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":60820,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":61707,"completion":3420,"total":65127}},{"id":"app-test5-file-manager-search","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":45253,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":67244,"completion":3031,"total":70275}},{"id":"app-test6-file-manager-inline-rename","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":104837,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":116979,"completion":6834,"total":123813}},{"id":"app-test7-file-manager-select-all","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurati
onMs":73325,"averageJudgeScore":78,"averageTokenUsagePerAttempt":{"prompt":76351,"completion":5239,"total":81590}},{"id":"app-test8-inventory-tracker-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":133705,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":68546,"completion":9881,"total":78427}},{"id":"app-test9-recipe-book-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":18348,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":23733,"completion":477,"total":24210}}]}
+{"createdAt":"2026-04-10T14:29:28.396Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"app","runs":1,"runModel":"openai:gpt-4o","judgeModel":"claude-sonnet-4-6","caseCount":9,"attemptCount":9,"passedAttempts":5,"passRate":0.5555555555555556,"averageDurationMs":31682.555555555555,"averageJudgeScore":73.11111111111111,"averageTokenUsagePerAttempt":{"prompt":27221.222222222223,"completion":1564.6666666666667,"total":28785.88888888889},"failedCaseIds":["app-test6-file-manager-inline-rename","app-test7-file-manager-select-all","app-test8-inventory-tracker-create","app-test9-recipe-book-create"],"cases":[{"id":"app-test1-counter-create","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":9911,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":8116,"completion":525,"total":8641}},{"id":"app-test2-counter-reset","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15146,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":13096,"completion":576,"total":13672}},{"id":"app-test3-shopping-cart-quantity","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":31146,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":42424,"completion":1691,"total":44115}},{"id":"app-test4-shopping-cart-discount","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":49382,"averageJudgeScore":92,"averageTokenUsagePerAttempt":{"prompt":35785,"completion":3345,"total":39130}},{"id":"app-test5-file-manager-search","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":62963,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":46902,"completion":3590,"total":50492}},{"id":"app-test6-file-manager-inline-rename","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":24203,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":33121,"completion":498,"total":33619}},{"id":"app-test7-file-manager-select-all","attemptCount":1,"passedAttempts":0,"passRa
te":0,"averageDurationMs":74058,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":46026,"completion":3591,"total":49617}},{"id":"app-test8-inventory-tracker-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":6757,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":7770,"completion":165,"total":7935}},{"id":"app-test9-recipe-book-create","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":11577,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":11751,"completion":101,"total":11852}}]}
diff --git a/ai_evals/history/cli.jsonl b/ai_evals/history/cli.jsonl
new file mode 100644
index 0000000000..45be5669fa
--- /dev/null
+++ b/ai_evals/history/cli.jsonl
@@ -0,0 +1,2 @@
+{"createdAt":"2026-04-10T14:25:39.106Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"cli","runs":1,"runModel":"anthropic:haiku","judgeModel":"claude-sonnet-4-6","caseCount":6,"attemptCount":6,"passedAttempts":6,"passRate":1,"averageDurationMs":21746,"averageJudgeScore":99.16666666666667,"averageTokenUsagePerAttempt":null,"failedCaseIds":[],"cases":[{"id":"bun-hello-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16588,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-flow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":27642,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"python-add-numbers-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":23640,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-script-uppercase","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":19379,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-flow-punctuation","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21993,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"flow-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21234,"averageJudgeScore":95,"averageTokenUsagePerAttempt":null}]}
+{"createdAt":"2026-04-10T14:28:09.045Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"cli","runs":1,"runModel":"anthropic:opus","judgeModel":"claude-sonnet-4-6","caseCount":6,"attemptCount":6,"passedAttempts":6,"passRate":1,"averageDurationMs":24988.833333333332,"averageJudgeScore":99.66666666666667,"averageTokenUsagePerAttempt":null,"failedCaseIds":[],"cases":[{"id":"bun-hello-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":22034,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-flow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":28030,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"python-add-numbers-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16668,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-script-uppercase","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21269,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"bun-hello-flow-punctuation","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":30126,"averageJudgeScore":100,"averageTokenUsagePerAttempt":null},{"id":"flow-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":31806,"averageJudgeScore":98,"averageTokenUsagePerAttempt":null}]}
diff --git a/ai_evals/history/flow.jsonl b/ai_evals/history/flow.jsonl
new file mode 100644
index 0000000000..9bdb9a6c77
--- /dev/null
+++ b/ai_evals/history/flow.jsonl
@@ -0,0 +1,3 @@
+{"createdAt":"2026-04-10T14:25:16.664Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"flow","runs":1,"runModel":"anthropic:claude-haiku-4-5-20251001","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":8,"passRate":0.6153846153846154,"averageDurationMs":33424.692307692305,"averageJudgeScore":82.61538461538461,"averageTokenUsagePerAttempt":{"prompt":131901,"completion":3121.230769230769,"total":135022.23076923078},"failedCaseIds":["flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16943,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":126615,"completion":839,"total":127454}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15220,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":75614,"completion":805,"total":76419}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15699,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":76182,"completion":887,"total":77069}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21605,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":62230,"completion":1509,"total":63739}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":47228,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":143511,"completion":5443,"total":148954}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":81870,"averageJudgeScore":92,"averageTokenUsagePerAttempt":{"prompt":194542,"completion":12409,"total":206951}},{"id":"flow-test6-ai-agent-
tools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":51878,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":142071,"completion":5720,"total":147791}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":49113,"averageJudgeScore":42,"averageTokenUsagePerAttempt":{"prompt":318525,"completion":2702,"total":321227}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":18244,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":78441,"completion":979,"total":79420}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":49485,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":135237,"completion":5467,"total":140704}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":21210,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":127844,"completion":1179,"total":129023}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":25142,"averageJudgeScore":42,"averageTokenUsagePerAttempt":{"prompt":128648,"completion":1337,"total":129985}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":20884,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":105253,"completion":1300,"total":106553}}]}
+{"createdAt":"2026-04-10T14:57:17.513Z","gitSha":"2a58402cfc5c320748839e92b51a1291b937bf26","mode":"flow","runs":1,"runModel":"anthropic:claude-opus-4-6","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":8,"passRate":0.6153846153846154,"averageDurationMs":58074.53846153846,"averageJudgeScore":87.53846153846153,"averageTokenUsagePerAttempt":{"prompt":125452.76923076923,"completion":2957.769230769231,"total":128410.53846153847},"failedCaseIds":["flow-test4-order-processing-loop","flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":26967,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":103796,"completion":634,"total":104430}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29009,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":75507,"completion":743,"total":76250}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":26828,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":76172,"completion":807,"total":76979}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":44418,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":130440,"completion":1787,"total":132227}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":82185,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":116133,"completion":4905,"total":121038}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":110344,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":122092,"completion":6980,"total":129072}},{"id":"flow-test6-a
i-agent-tools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":119901,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":154916,"completion":8908,"total":163824}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":44333,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":109935,"completion":1536,"total":111471}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":54247,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":136872,"completion":2638,"total":139510}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":63274,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":137794,"completion":3686,"total":141480}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":38813,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":105075,"completion":1157,"total":106232}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":77267,"averageJudgeScore":52,"averageTokenUsagePerAttempt":{"prompt":256547,"completion":3398,"total":259945}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":37383,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":105607,"completion":1272,"total":106879}}]}
+{"createdAt":"2026-04-10T14:29:52.249Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"flow","runs":1,"runModel":"openai:gpt-4o","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":6,"passRate":0.46153846153846156,"averageDurationMs":29841.53846153846,"averageJudgeScore":68.46153846153847,"averageTokenUsagePerAttempt":{"prompt":72815.92307692308,"completion":770.7692307692307,"total":73586.69230769231},"failedCaseIds":["flow-test5-parallel-data-pipeline","flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler","flow-test12-approval-step"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":20059,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":64091,"completion":265,"total":64356}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":20728,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":46594,"completion":270,"total":46864}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21533,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":46859,"completion":232,"total":47091}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29004,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":64593,"completion":568,"total":65161}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":36250,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":66346,"completion":1259,"total":67605}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":46151,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":104676,"completion":169
8,"total":106374}},{"id":"flow-test6-ai-agent-tools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":72403,"averageJudgeScore":62,"averageTokenUsagePerAttempt":{"prompt":105280,"completion":2216,"total":107496}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":41599,"averageJudgeScore":20,"averageTokenUsagePerAttempt":{"prompt":103053,"completion":707,"total":103760}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":23352,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":97955,"completion":468,"total":98423}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":19341,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":12254,"completion":1057,"total":13311}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":16143,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":64480,"completion":445,"total":64925}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":24231,"averageJudgeScore":52,"averageTokenUsagePerAttempt":{"prompt":106068,"completion":472,"total":106540}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":17146,"averageJudgeScore":30,"averageTokenUsagePerAttempt":{"prompt":64358,"completion":363,"total":64721}}]}
diff --git a/ai_evals/history/script.jsonl b/ai_evals/history/script.jsonl
new file mode 100644
index 0000000000..779ddeda4c
--- /dev/null
+++ b/ai_evals/history/script.jsonl
@@ -0,0 +1,3 @@
+{"createdAt":"2026-04-10T14:23:51.580Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"script","runs":1,"runModel":"anthropic:claude-haiku-4-5-20251001","judgeModel":"claude-sonnet-4-6","caseCount":1,"attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":12112,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":48134,"completion":452,"total":48586},"failedCaseIds":[],"cases":[{"id":"script-test1-greet-user","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":12112,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":48134,"completion":452,"total":48586}}]}
+{"createdAt":"2026-04-10T14:24:18.129Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"script","runs":1,"runModel":"anthropic:claude-opus-4-6","judgeModel":"claude-sonnet-4-6","caseCount":1,"attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16595,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":38264,"completion":254,"total":38518},"failedCaseIds":[],"cases":[{"id":"script-test1-greet-user","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16595,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":38264,"completion":254,"total":38518}}]}
+{"createdAt":"2026-04-10T14:24:41.534Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"script","runs":1,"runModel":"openai:gpt-4o","judgeModel":"claude-sonnet-4-6","caseCount":1,"attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":13643,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":28961,"completion":137,"total":29098},"failedCaseIds":[],"cases":[{"id":"script-test1-greet-user","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":13643,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":28961,"completion":137,"total":29098}}]}
diff --git a/ai_evals/modes/app.ts b/ai_evals/modes/app.ts
new file mode 100644
index 0000000000..66bf139c44
--- /dev/null
+++ b/ai_evals/modes/app.ts
@@ -0,0 +1,79 @@
+import { loadAppFixture } from "../adapters/frontend/core/app/appFixtureLoader";
+import type { AppFiles } from "../../frontend/src/lib/components/copilot/chat/app/core";
+import type { FrontendEvalModelConfig } from "../core/models";
+import { validateAppState, type AppFilesState } from "../core/validators";
+import type { BenchmarkArtifactFile, ModeRunner } from "../core/types";
+import { runAppEval } from "../adapters/frontend/core/app/appEvalRunner";
+import { DEFAULT_FRONTEND_EVAL_MODEL, getFrontendApiKey } from "./frontendCommon";
+
+/**
+ * Creates the `app` mode runner: loads app fixtures, runs the frontend app eval with the
+ * configured provider/model, validates the resulting state and exports artifact files.
+ */
+export function createAppModeRunner(
+  modelConfig: FrontendEvalModelConfig = DEFAULT_FRONTEND_EVAL_MODEL
+): ModeRunner<AppFilesState, AppFilesState, AppFilesState> {
+  return {
+    mode: "app",
+    concurrency: 5,
+    judgeThreshold: 80,
+    // Fixture paths are optional; without one there is nothing to load.
+    async loadInitial(path) {
+      if (!path) return undefined;
+      return await loadAppFixture(path);
+    },
+    async loadExpected(path) {
+      if (!path) return undefined;
+      return await loadAppFixture(path);
+    },
+    async run(prompt, initial, context) {
+      const apiKey = getFrontendApiKey(modelConfig.provider);
+      const outcome = await runAppEval(prompt, apiKey, {
+        initialFrontend: initial?.frontend,
+        initialBackend: initial?.backend as AppFiles["backend"] | undefined,
+        provider: modelConfig.provider,
+        model: modelConfig.model,
+        runContext: context,
+      });
+
+      return {
+        success: outcome.success,
+        actual: outcome.files as AppFilesState,
+        error: outcome.error,
+        assistantMessageCount: outcome.assistantMessageCount,
+        toolCallCount: outcome.toolCallCount,
+        toolsUsed: outcome.toolsUsed,
+        skillsInvoked: [], // this mode always reports an empty list
+        tokenUsage: outcome.tokenUsage,
+      };
+    },
+    validate(input) {
+      const { actual, initial, expected } = input;
+      return validateAppState({ actual, initial, expected });
+    },
+    buildArtifacts(actual): BenchmarkArtifactFile[] {
+      const toJson = (value: unknown): string => JSON.stringify(value, null, 2) + "\n";
+      // The full state snapshot comes first, then one entry per frontend file and runnable.
+      const artifacts: BenchmarkArtifactFile[] = [{ path: "app.json", content: toJson(actual) }];
+
+      Object.entries(actual.frontend).forEach(([filePath, content]) => {
+        // Add a leading slash when the file path lacks one.
+        const suffix = filePath.startsWith("/") ? filePath : `/${filePath}`;
+        artifacts.push({ path: `frontend${suffix}`, content });
+      });
+
+      Object.entries(actual.backend).forEach(([key, runnable]) => {
+        artifacts.push({ path: `backend/${key}/meta.json`, content: toJson(runnable) });
+
+        const script = runnable.inlineScript;
+        const source = script?.content;
+        if (!source) return;
+        // python3 is written as `.py`; any other language falls back to `.ts`.
+        const extension = script?.language === "python3" ? "py" : "ts";
+        artifacts.push({ path: `backend/${key}/main.${extension}`, content: source });
+      });
+
+      return artifacts;
+    },
+  };
+}
diff --git a/ai_evals/modes/cli.ts b/ai_evals/modes/cli.ts
new file mode 100644
index 0000000000..718983f2c3
--- /dev/null
+++ b/ai_evals/modes/cli.ts
@@ -0,0 +1,162 @@
+import { mkdtemp, rm, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import path from "node:path";
+import { join } from "node:path";
+import { readFile } from "node:fs/promises";
+import { writeAiGuidanceFiles } from "../../cli/src/guidance/writer.ts";
+import type { CliEvalModelConfig } from "../core/models";
+import {
+  DEFAULT_CLI_EVAL_MODEL,
+  formatCliRunModelLabel,
+  getGeneratedSkillsSource,
+  runPromptAndCapture,
+} from "../adapters/cli/runtime";
+import { copyDirectory, readDirectoryFiles } from "../core/files";
+import { validateCliWorkspace } from "../core/validators";
+import type { BenchmarkArtifactFile, ModeRunner } from "../core/types";
+
+const IGNORE_WORKSPACE_FILES = new Set([".claude", "AGENTS.md", "CLAUDE.md", "rt.d.ts"]);
+
+interface CliWorkspaceFixture {
+  sourceDir: string;
+  files: Record<string, string>;
+}
+
+interface CliRunActual {
+  assistantOutput: string;
+  workspaceFiles: Record<string, string>;
+}
+
+const CLAUDE_PROJECT_PREAMBLE = [
+  "Follow the project instructions from AGENTS.md exactly.",
+  "Before creating or modifying any Windmill entity, you MUST invoke the relevant Skill tool and follow it.",
+  "Use the skill guidance for file layout, implementation details, and the exact next commands to tell the user.",
+  "Do not skip the Skill step.",
+  "You are running inside an automated benchmark harness, not an interactive user session.",
+  "Act autonomously and complete the requested file changes directly in the workspace.",
+  "Do not ask for confirmation, do not ask the user to save or create files manually, and do not wait for approval.",
+  "Do not respond with a plan when you can make the change directly.",
+  "Only describe what was done after you have written the files.",
+].join(" ");
+
+/**
+ * Creates the `cli` mode runner. Every run gets a fresh temp workspace seeded
+ * from the initial fixture, the AI guidance files and a stub `rt.d.ts`; the
+ * workspace is deleted again whether the run succeeds or throws.
+ */
+export function createCliModeRunner(
+  modelConfig: CliEvalModelConfig = DEFAULT_CLI_EVAL_MODEL
+): ModeRunner<CliWorkspaceFixture, CliWorkspaceFixture, CliRunActual> {
+  return {
+    mode: "cli",
+    concurrency: 1,
+    judgeThreshold: 80,
+    /** Snapshots the initial fixture directory when the case provides one. */
+    async loadInitial(path) {
+      if (!path) {
+        return undefined;
+      }
+      return { sourceDir: path, files: await readDirectoryFiles(path) };
+    },
+    /** Snapshots the expected fixture directory when the case provides one. */
+    async loadExpected(path) {
+      if (!path) {
+        return undefined;
+      }
+      return { sourceDir: path, files: await readDirectoryFiles(path) };
+    },
+    /** Runs one prompt in an isolated temp workspace and reports the outcome. */
+    async run(prompt, initial, _context) {
+      const tempRoot = await mkdtemp(join(tmpdir(), "wmill-cli-benchmark-"));
+      try {
+        if (initial) {
+          await copyDirectory(initial.sourceDir, tempRoot);
+        }
+        // Written after the fixture copy, with overwriteProjectGuidance enabled.
+        await writeAiGuidanceFiles({
+          targetDir: tempRoot,
+          nonDottedPaths: true,
+          overwriteProjectGuidance: true,
+          skillsSourcePath: getGeneratedSkillsSource(),
+        });
+        await writeFile(join(tempRoot, "rt.d.ts"), "export namespace RT {}\n", "utf8");
+
+        const captured = await runPromptAndCapture(
+          await renderPrompt(prompt, tempRoot),
+          tempRoot,
+          6,
+          modelConfig
+        );
+        // IGNORE_WORKSPACE_FILES keeps the harness-written files out of the snapshot.
+        const snapshot = await readDirectoryFiles(tempRoot, { ignore: IGNORE_WORKSPACE_FILES });
+
+        return {
+          success: true,
+          actual: { assistantOutput: captured.output, workspaceFiles: snapshot },
+          assistantMessageCount: captured.assistantMessageCount,
+          toolCallCount: captured.toolsUsed.length,
+          toolsUsed: captured.toolsUsed.map((entry) => entry.tool),
+          skillsInvoked: captured.skillsInvoked,
+          tokenUsage: captured.tokenUsage ?? null,
+        };
+      } catch (error) {
+        // Failures are reported as an unsuccessful result instead of being rethrown.
+        return {
+          success: false,
+          actual: { assistantOutput: "", workspaceFiles: {} },
+          error: error instanceof Error ? error.message : String(error),
+          assistantMessageCount: 0,
+          toolCallCount: 0,
+          toolsUsed: [],
+          skillsInvoked: [],
+          tokenUsage: null,
+        };
+      } finally {
+        await rm(tempRoot, { recursive: true, force: true });
+      }
+    },
+    /** Checks the captured workspace against the expected and initial fixtures. */
+    validate({ actual, initial, expected }) {
+      return validateCliWorkspace({
+        actualFiles: actual.workspaceFiles,
+        expectedFiles: expected?.files,
+        initialFiles: initial?.files,
+      });
+    },
+    /** Emits the assistant transcript followed by one artifact per workspace file. */
+    buildArtifacts(actual): BenchmarkArtifactFile[] {
+      const transcript: BenchmarkArtifactFile = {
+        path: "assistant-output.txt",
+        content: `${actual.assistantOutput}\n`,
+      };
+
+      const workspaceArtifacts = Object.entries(actual.workspaceFiles).map(
+        ([filePath, content]): BenchmarkArtifactFile => ({ path: filePath, content })
+      );
+
+      return [transcript, ...workspaceArtifacts];
+    },
+  };
+}
+
+export function getCliRunModelLabel(
+  modelConfig: CliEvalModelConfig = DEFAULT_CLI_EVAL_MODEL
+): string {
+  return formatCliRunModelLabel(modelConfig);
+}
+
+/**
+ * Assembles the prompt sent to the agent: the workspace's AGENTS.md, the
+ * harness preamble, then the user request with `{{workspace_root}}` resolved.
+ */
+async function renderPrompt(prompt: string, workspaceDir: string): Promise<string> {
+  const agentsInstructions = await readFile(path.join(workspaceDir, "AGENTS.md"), "utf8");
+  // A replacer function keeps `$&`-style sequences in the path from being expanded.
+  const userRequest = prompt.replaceAll("{{workspace_root}}", () => workspaceDir);
+  const sections = [
+    `# Project Instructions\n${agentsInstructions.trim()}`,
+    `# Benchmark Harness\n${CLAUDE_PROJECT_PREAMBLE}`,
+    `# User Request\n${userRequest}`,
+  ];
+  return sections.join("\n\n");
+}
diff --git a/ai_evals/modes/flow.ts b/ai_evals/modes/flow.ts
new file mode 100644
index 0000000000..36dee80658
--- /dev/null
+++ b/ai_evals/modes/flow.ts
@@ -0,0 +1,104 @@
+import { readJsonFile } from "../core/files";
+import type { FrontendEvalModelConfig } from "../core/models";
+import { validateFlowState, type FlowState } from "../core/validators";
+import type { BenchmarkArtifactFile, ModeRunner } from "../core/types";
+import {
+  runFlowEval,
+  type FlowFixture,
+} from "../adapters/frontend/core/flow/flowEvalRunner";
+import type { FlowWorkspaceFixtures } from "../adapters/frontend/core/flow/fileHelpers";
+import { DEFAULT_FRONTEND_EVAL_MODEL, getFrontendApiKey } from "./frontendCommon";
+
+interface FlowInitialFixture {
+  flow?: FlowFixture;
+  workspace?: FlowWorkspaceFixtures;
+}
+
+/**
+ * Creates the `flow` mode runner, which drives the frontend flow eval with the
+ * configured provider/model and validates the flow state it produces.
+ */
+export function createFlowModeRunner(
+  modelConfig: FrontendEvalModelConfig = DEFAULT_FRONTEND_EVAL_MODEL
+): ModeRunner<FlowInitialFixture, FlowState, FlowState> {
+  return {
+    mode: "flow",
+    concurrency: 5,
+    judgeThreshold: 80,
+    async loadInitial(path) {
+      return path ? normalizeFlowInitialFixture(await readJsonFile<unknown>(path)) : undefined;
+    },
+    async loadExpected(path) {
+      return path ? normalizeFlowStateFixture(await readJsonFile<unknown>(path)) : undefined;
+    },
+    async run(prompt, initial, context) {
+      const apiKey = getFrontendApiKey(modelConfig.provider);
+      const outcome = await runFlowEval(prompt, apiKey, {
+        initialFlow: initial?.flow,
+        workspaceFixtures: initial?.workspace,
+        provider: modelConfig.provider,
+        model: modelConfig.model,
+        runContext: context,
+      });
+
+      return {
+        success: outcome.success,
+        // Passed through the same normalizer as fixtures, which unwraps a `{ flow }` envelope.
+        actual: normalizeFlowStateFixture(outcome.flow),
+        error: outcome.error,
+        assistantMessageCount: outcome.assistantMessageCount,
+        toolCallCount: outcome.toolCallCount,
+        toolsUsed: outcome.toolsUsed,
+        skillsInvoked: [],
+        tokenUsage: outcome.tokenUsage,
+      };
+    },
+    validate({ evalCase, actual, initial, expected }) {
+      return validateFlowState({
+        actual,
+        initial: initial?.flow,
+        expected,
+        validate: evalCase.validate,
+      });
+    },
+    /** Emits the final flow state as a single pretty-printed `flow.json`. */
+    buildArtifacts(actual): BenchmarkArtifactFile[] {
+      const flowJson: BenchmarkArtifactFile = {
+        path: "flow.json",
+        content: `${JSON.stringify(actual, null, 2)}\n`,
+      };
+      return [flowJson];
+    },
+  };
+}
+
+/**
+ * Accepts either a `{ flow, workspace }` wrapper or a bare flow object and
+ * returns the wrapper form; a bare value is treated as the flow itself.
+ */
+function normalizeFlowInitialFixture(value: unknown): FlowInitialFixture {
+  const isWrapped = isObject(value) && ("flow" in value || "workspace" in value);
+  if (!isWrapped) {
+    return { flow: normalizeFlowStateFixture(value) };
+  }
+
+  const { flow, workspace } = value as {
+    flow?: FlowFixture;
+    workspace?: FlowWorkspaceFixtures;
+  };
+  return { flow, workspace };
+}
+
+// Unwraps a `{ flow: {...} }` envelope when its `flow` is an object; any other
+// object is returned as-is and non-objects become an empty flow state.
+function normalizeFlowStateFixture(value: unknown): FlowState {
+  if (!isObject(value)) {
+    return {};
+  }
+  const hasInnerFlow = "flow" in value && isObject(value["flow"]);
+  return (hasInnerFlow ? value["flow"] : value) as FlowState;
+}
+
+function isObject(value: unknown): value is Record<string, unknown> {
+  return typeof value === "object" && value !== null && !Array.isArray(value);
+}
diff --git a/ai_evals/modes/frontendCommon.test.ts b/ai_evals/modes/frontendCommon.test.ts
new file mode 100644
index 0000000000..cac10ffcab
--- /dev/null
+++ b/ai_evals/modes/frontendCommon.test.ts
@@ -0,0 +1,35 @@
+import { afterEach, describe, expect, it } from "bun:test";
+import { getFrontendApiKey } from "./frontendCommon";
+
+// Provider keys as they were before any test ran; `undefined` means "was unset".
+const ORIGINAL_ENV = {
+  ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY,
+  OPENAI_API_KEY: process.env.OPENAI_API_KEY,
+  GEMINI_API_KEY: process.env.GEMINI_API_KEY,
+};
+
+afterEach(() => {
+  for (const [name, value] of Object.entries(ORIGINAL_ENV)) {
+    // process.env stringifies assigned values, so assigning `undefined` would
+    // leave the string "undefined" behind; originally-unset keys are deleted.
+    if (value === undefined) {
+      delete process.env[name];
+    } else {
+      process.env[name] = value;
+    }
+  }
+});
+
+describe("getFrontendApiKey", () => {
+  it("reads the Gemini API key for googleai models", () => {
+    process.env.GEMINI_API_KEY = "gemini-test-key";
+    expect(getFrontendApiKey("googleai")).toBe("gemini-test-key");
+  });
+
+  it("throws a provider-specific error when the key is missing", () => {
+    delete process.env.GEMINI_API_KEY;
+    expect(() => getFrontendApiKey("googleai")).toThrow(
+      "GEMINI_API_KEY is required for frontend evals"
+    );
+  });
+});
diff --git a/ai_evals/modes/frontendCommon.ts b/ai_evals/modes/frontendCommon.ts
new file mode 100644
index 0000000000..2619d21821
--- /dev/null
+++ b/ai_evals/modes/frontendCommon.ts
@@ -0,0 +1,23 @@
+import {
+  getFrontendEvalModel,
+  resolveEvalModel,
+  type FrontendEvalModelConfig,
+} from "../core/models";
+
+export const DEFAULT_FRONTEND_EVAL_MODEL: FrontendEvalModelConfig = getFrontendEvalModel(
+  resolveEvalModel("flow")
+);
+
+// Returns the API key for `provider` from its environment variable: anthropic
+// and googleai have their own variables, every other provider uses
+// OPENAI_API_KEY. Throws when the selected variable is unset or empty.
+export function getFrontendApiKey(provider: FrontendEvalModelConfig["provider"]): string {
+  const envNames = new Map<string, string>([["anthropic", "ANTHROPIC_API_KEY"], ["googleai", "GEMINI_API_KEY"]]);
+  const envName = envNames.get(provider) ?? "OPENAI_API_KEY";
+
+  const apiKey = process.env[envName];
+  if (apiKey) {
+    return apiKey;
+  }
+  throw new Error(`${envName} is required for frontend evals`);
+}
diff --git a/ai_evals/modes/script.ts b/ai_evals/modes/script.ts
new file mode 100644
index 0000000000..f3ab232cc3
--- /dev/null
+++ b/ai_evals/modes/script.ts
@@ -0,0 +1,61 @@
+import { readJsonFile } from "../core/files";
+import type { FrontendEvalModelConfig } from "../core/models";
+import { validateScriptState } from "../core/validators";
+import type { BenchmarkArtifactFile, ModeRunner } from "../core/types";
+import { runScriptEval } from "../adapters/frontend/core/script/scriptEvalRunner";
+import type { ScriptEvalState } from "../adapters/frontend/core/script/fileHelpers";
+import { DEFAULT_FRONTEND_EVAL_MODEL, getFrontendApiKey } from "./frontendCommon";
+
+/**
+ * Creates the `script` mode runner, which asks the frontend script eval to
+ * edit an existing script; a case without an initial fixture is rejected.
+ */
+export function createScriptModeRunner(
+  modelConfig: FrontendEvalModelConfig = DEFAULT_FRONTEND_EVAL_MODEL
+): ModeRunner<ScriptEvalState, ScriptEvalState, ScriptEvalState> {
+  return {
+    mode: "script",
+    concurrency: 5,
+    judgeThreshold: 80,
+    async loadInitial(path) {
+      return path ? await readJsonFile<ScriptEvalState>(path) : undefined;
+    },
+    async loadExpected(path) {
+      return path ? await readJsonFile<ScriptEvalState>(path) : undefined;
+    },
+    async run(prompt, initial, context) {
+      if (!initial) {
+        throw new Error("Script evals require an initial script fixture");
+      }
+
+      const apiKey = getFrontendApiKey(modelConfig.provider);
+      const outcome = await runScriptEval(prompt, apiKey, {
+        initialScript: initial,
+        provider: modelConfig.provider,
+        model: modelConfig.model,
+        runContext: context,
+      });
+      return {
+        success: outcome.success,
+        actual: outcome.script,
+        error: outcome.error,
+        assistantMessageCount: outcome.assistantMessageCount,
+        toolCallCount: outcome.toolCallCount,
+        toolsUsed: outcome.toolsUsed,
+        skillsInvoked: [],
+        tokenUsage: outcome.tokenUsage,
+      };
+    },
+    validate({ actual, initial, expected }) {
+      return validateScriptState({ actual, initial, expected });
+    },
+    buildArtifacts(actual): BenchmarkArtifactFile[] {
+      // script.json holds the full state; the code is also emitted at the script's own path.
+      const stateJson = `${JSON.stringify(actual, null, 2)}\n`;
+      return [
+        { path: "script.json", content: stateJson },
+        { path: actual.path, content: actual.code },
+      ];
+    },
+  };
+}
diff --git a/ai_evals/package.json b/ai_evals/package.json
new file mode 100644
index 0000000000..569562ad3c
--- /dev/null
+++ b/ai_evals/package.json
@@ -0,0 +1,19 @@
+{
+  "name": "windmill-ai-evals",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "cli": "bun cli/index.ts"
+  },
+  "dependencies": {
+    "@anthropic-ai/claude-agent-sdk": "^0.2.25",
+    "@anthropic-ai/sdk": "^0.39.0",
+    "commander": "^14.0.3",
+    "openai": "^6.9.1",
+    "yaml": "^2.8.3"
+  },
+  "devDependencies": {
+    "@types/bun": "latest",
+    "typescript": "^5.0.0"
+  }
+}
diff --git a/cli/README.md b/cli/README.md
index ee3b68c35f..478aa2a936 100644
--- a/cli/README.md
+++ b/cli/README.md
@@ -110,6 +110,28 @@ source <(wmill completions zsh)
 
 ## Development
 
+### AI Guidance Variants
+
+`wmill init` can now materialize alternate AI guidance bundles without changing
+the generated defaults in the repo, but this is exposed as internal env-var
+overrides rather than public CLI flags.
+
+Examples:
+
+```bash
+WMILL_INIT_AI_SKILLS_SOURCE=/path/to/custom/skills wmill init --use-default
+WMILL_INIT_AI_SKILLS_SOURCE=/path/to/custom/skills WMILL_INIT_AI_AGENTS_SOURCE=/path/to/AGENTS.md wmill init --use-default
+WMILL_INIT_AI_SKILLS_SOURCE=/path/to/custom/skills WMILL_INIT_AI_CLAUDE_SOURCE=/path/to/CLAUDE.md wmill init --use-default
+```
+
+This is the same guidance-writing path used by the benchmark CLI under
+`ai_evals/`, so the benchmark harness and `wmill init` now generate the same
+project guidance shape:
+
+- `AGENTS.md`
+- `CLAUDE.md`
+- `.claude/skills/*`
+
 ### Testing with a local `windmill-yaml-validator`
 
 To test local changes to the validator before publishing, use `npm link`:
diff --git a/cli/TESTING.md b/cli/TESTING.md
index 542baab368..235a95c4b1 100644
--- a/cli/TESTING.md
+++ b/cli/TESTING.md
@@ -29,6 +29,20 @@ binary and starts a shared backend instance.
 
 Examples: `sync_pull_push`, `dev_server`, `standalone_commands`
 
+## AI Benchmark Caveats
+
+The repo-level benchmark CLI lives under `ai_evals/`, but it currently depends on
+mocked frontend flow execution in a few places. Treat `flow` benchmark passes as
+artifact-shape signal, not full runtime correctness, when either of these applies:
+
+- deterministic flow validation does not currently reject syntactically invalid
+  `rawscript` module bodies
+- frontend benchmark calls to `test_run_flow` and `test_run_step` return mocked
+  completed jobs for `mock-job-id-*` workspaces instead of executing the flow
+
+If a prompt change depends on flow wiring or script runtime behavior, verify it
+with additional validation or a real run before trusting the benchmark result.
+
 ## Environment Variables
 
 | Variable | Purpose | Default |
diff --git a/cli/src/commands/init/init.ts b/cli/src/commands/init/init.ts
index 345eda8e77..519f572394 100644
--- a/cli/src/commands/init/init.ts
+++ b/cli/src/commands/init/init.ts
@@ -1,4 +1,4 @@
-import { stat, writeFile, rm, mkdir } from "node:fs/promises";
+import { stat, writeFile, rm } from "node:fs/promises";
 import { colors } from "@cliffy/ansi/colors";
 import { Command } from "@cliffy/command";
 import { Confirm } from "@cliffy/prompt/confirm";
@@ -16,23 +16,14 @@ import {
   type Workspace,
 } from "../workspace/workspace.ts";
 import { generateRTNamespace } from "../resource-type/resource-type.ts";
-import { SKILLS, SKILL_CONTENT, SCHEMAS, SCHEMA_MAPPINGS } from "../../guidance/skills.ts";
-import { generateAgentsMdContent } from "../../guidance/core.ts";
+import {
+  WMILL_INIT_AI_AGENTS_SOURCE_ENV,
+  WMILL_INIT_AI_CLAUDE_SOURCE_ENV,
+  WMILL_INIT_AI_SKILLS_SOURCE_ENV,
+  writeAiGuidanceFiles,
+} from "../../guidance/writer.ts";
 import { generateCommentedTemplate } from "./template.ts";
 
-/**
- * Format a YAML schema for inclusion in skill markdown files.
- */
-function formatSchemaForMarkdown(schemaYaml: string, schemaName: string, filePattern: string): string {
-  return `## ${schemaName} (\`${filePattern}\`)
-
-Must be a YAML file that adheres to the following schema:
-
-\`\`\`yaml
-${schemaYaml.trim()}
-\`\`\``;
-}
-
 export interface InitOptions {
   useDefault?: boolean;
   useBackend?: boolean;
@@ -255,88 +246,24 @@ async function initAction(opts: InitOptions) {
 
   // Create guidance files (AGENTS.md, CLAUDE.md, and Claude skills)
   try {
-    // Generate skills reference section for AGENTS.md
-    const skills_base_dir = ".claude/skills";
-    const skillsReference = SKILLS.map(
-      (s) => `- \`${skills_base_dir}/${s.name}/SKILL.md\` - ${s.description}`
-    ).join("\n");
+    const guidanceResult = await writeAiGuidanceFiles({
+      targetDir: ".",
+      nonDottedPaths,
+      overwriteProjectGuidance: false,
+      skillsSourcePath: process.env[WMILL_INIT_AI_SKILLS_SOURCE_ENV],
+      agentsSourcePath: process.env[WMILL_INIT_AI_AGENTS_SOURCE_ENV],
+      claudeSourcePath: process.env[WMILL_INIT_AI_CLAUDE_SOURCE_ENV],
+    });
 
-    // Create AGENTS.md file with minimal instructions
-    if (!(await stat("AGENTS.md").catch(() => null))) {
-      await writeFile(
-        "AGENTS.md",
-        generateAgentsMdContent(skillsReference), "utf-8"
-      );
+    if (guidanceResult.agentsWritten) {
       log.info(colors.green("Created AGENTS.md"));
     }
-
-    // Create CLAUDE.md file, referencing AGENTS.md
-    if (!(await stat("CLAUDE.md").catch(() => null))) {
-      await writeFile(
-        "CLAUDE.md",
-        `Instructions are in @AGENTS.md
-`, "utf-8"
-      );
+    if (guidanceResult.claudeWritten) {
       log.info(colors.green("Created CLAUDE.md"));
     }
-
-    // Create .claude/skills/ directory and skill files
-    try {
-      await mkdir(".claude/skills", { recursive: true });
-
-      await Promise.all(
-        SKILLS.map(async (skill) => {
-          const skillDir = `.claude/skills/${skill.name}`;
-          await mkdir(skillDir, { recursive: true });
-
-          let skillContent = SKILL_CONTENT[skill.name];
-          if (skillContent) {
-            // Replace placeholders with actual suffixes based on nonDottedPaths
-            if (nonDottedPaths) {
-              skillContent = skillContent
-                .replaceAll("{{FLOW_SUFFIX}}", "__flow")
-                .replaceAll("{{APP_SUFFIX}}", "__app")
-                .replaceAll("{{RAW_APP_SUFFIX}}", "__raw_app")
-                .replaceAll("{{INLINE_SCRIPT_NAMING}}", "Inline script files should NOT include `.inline_script.` in their names (e.g. use `a.ts`, not `a.inline_script.ts`).");
-            } else {
-              skillContent = skillContent
-                .replaceAll("{{FLOW_SUFFIX}}", ".flow")
-                .replaceAll("{{APP_SUFFIX}}", ".app")
-                .replaceAll("{{RAW_APP_SUFFIX}}", ".raw_app")
-                .replaceAll("{{INLINE_SCRIPT_NAMING}}", "Inline script files use the `.inline_script.` naming convention (e.g. `a.inline_script.ts`).");
-            }
-            // Check if this skill has schemas that need to be appended
-            const schemaMappings = SCHEMA_MAPPINGS[skill.name];
-            if (schemaMappings && schemaMappings.length > 0) {
-              // Combine base content with schemas
-              const schemaDocs = schemaMappings
-                .map((mapping) => {
-                  const schemaYaml = SCHEMAS[mapping.schemaKey];
-                  if (schemaYaml) {
-                    return formatSchemaForMarkdown(schemaYaml, mapping.name, mapping.filePattern);
-                  }
-                  return null;
-                })
-                .filter((doc): doc is string => doc !== null);
-
-              if (schemaDocs.length > 0) {
-                skillContent = skillContent + "\n\n" + schemaDocs.join("\n\n");
-              }
-            }
-
-            await writeFile(`${skillDir}/SKILL.md`, skillContent, "utf-8");
-          }
-        })
-      );
-
-      log.info(colors.green(`Created .claude/skills/ with ${SKILLS.length} skills`));
-    } catch (skillError) {
-      if (skillError instanceof Error) {
-        log.warn(`Could not create skills: ${skillError.message}`);
-      } else {
-        log.warn(`Could not create skills: ${skillError}`);
-      }
-    }
+    log.info(
+      colors.green(`Created .claude/skills/ with ${guidanceResult.skillCount} skills`)
+    );
   } catch (error) {
     if (error instanceof Error) {
       log.warn(`Could not create guidance files: ${error.message}`);
diff --git a/cli/src/guidance/writer.ts b/cli/src/guidance/writer.ts
new file mode 100644
index 0000000000..0acab52784
--- /dev/null
+++ b/cli/src/guidance/writer.ts
@@ -0,0 +1,269 @@
+import { cp, mkdir, readdir, readFile, stat, writeFile } from "node:fs/promises";
+import { join } from "node:path";
+import { generateAgentsMdContent } from "./core.ts";
+import {
+  SCHEMAS,
+  SCHEMA_MAPPINGS,
+  SKILLS,
+  SKILL_CONTENT,
+  type SkillMetadata,
+} from "./skills.ts";
+
+type ResolvedSkillMetadata = SkillMetadata & {
+  directoryName: string;
+};
+
+export interface WriteAiGuidanceOptions {
+  targetDir: string;
+  nonDottedPaths?: boolean;
+  overwriteProjectGuidance?: boolean;
+  skillsSourcePath?: string;
+  agentsSourcePath?: string;
+  claudeSourcePath?: string;
+}
+
+export interface WriteAiGuidanceResult {
+  agentsWritten: boolean;
+  claudeWritten: boolean;
+  skillCount: number;
+}
+
+export const WMILL_INIT_AI_SKILLS_SOURCE_ENV = "WMILL_INIT_AI_SKILLS_SOURCE";
+export const WMILL_INIT_AI_AGENTS_SOURCE_ENV = "WMILL_INIT_AI_AGENTS_SOURCE";
+export const WMILL_INIT_AI_CLAUDE_SOURCE_ENV = "WMILL_INIT_AI_CLAUDE_SOURCE";
+
+const CLAUDE_MD_DEFAULT = "Instructions are in @AGENTS.md\n";
+
+/**
+ * Writes AGENTS.md, CLAUDE.md and `.claude/skills/` into `options.targetDir`.
+ * Skills are copied from `skillsSourcePath` when given, otherwise rendered from
+ * the generated defaults; existing AGENTS.md / CLAUDE.md files are kept unless
+ * `overwriteProjectGuidance` is set. Returns which project files were written
+ * and how many skills were found.
+ */
+export async function writeAiGuidanceFiles(
+  options: WriteAiGuidanceOptions
+): Promise<WriteAiGuidanceResult> {
+  const { targetDir, skillsSourcePath, agentsSourcePath, claudeSourcePath } = options;
+  const overwrite = options.overwriteProjectGuidance ?? false;
+  const skillMetadata = skillsSourcePath
+    ? await readSkillMetadataFromDirectory(skillsSourcePath)
+    : getGeneratedSkillMetadata();
+
+  // An empty override path counts as unset, matching how `skillsSourcePath` is
+  // treated, so a blank value falls back to the default instead of failing to read "".
+  const agentsWritten = await writeProjectGuidanceFile({
+    targetPath: join(targetDir, "AGENTS.md"),
+    overwrite,
+    content: agentsSourcePath
+      ? await readFile(agentsSourcePath, "utf8")
+      : generateAgentsMdContent(buildSkillsReference(skillMetadata)),
+  });
+  const claudeWritten = await writeProjectGuidanceFile({
+    targetPath: join(targetDir, "CLAUDE.md"),
+    overwrite,
+    content: claudeSourcePath ? await readFile(claudeSourcePath, "utf8") : CLAUDE_MD_DEFAULT,
+  });
+
+  if (skillsSourcePath) {
+    await copySkillsFromSource(targetDir, skillsSourcePath);
+  } else {
+    await writeGeneratedSkills(targetDir, options.nonDottedPaths ?? true);
+  }
+  return { agentsWritten, claudeWritten, skillCount: skillMetadata.length };
+}
+
+// Renders one markdown bullet per skill, pointing at its SKILL.md under .claude/skills.
+function buildSkillsReference(
+  skills: Pick<ResolvedSkillMetadata, "directoryName" | "description">[]
+): string {
+  const bullets = skills.map(({ directoryName: dir, description }) => `- \`.claude/skills/${dir}/SKILL.md\` - ${description}`);
+  return bullets.join("\n");
+}
+
+async function copySkillsFromSource(
+  targetDir: string,
+  skillsSourcePath: string
+): Promise<ResolvedSkillMetadata[]> {
+  const skillsDir = await ensureSkillsDirectory(targetDir);
+  await copyDirectoryContents(skillsSourcePath, skillsDir);
+  return await readSkillMetadataFromDirectory(skillsDir);
+}
+
+/**
+ * Renders every built-in skill to `<targetDir>/.claude/skills/<name>/SKILL.md`,
+ * applying the dotted or non-dotted naming scheme, and returns the metadata
+ * of the skills that were written.
+ */
+async function writeGeneratedSkills(
+  targetDir: string,
+  nonDottedPaths: boolean
+): Promise<ResolvedSkillMetadata[]> {
+  const skillsDir = await ensureSkillsDirectory(targetDir);
+
+  // Each skill lives in its own directory, so the writes can run concurrently.
+  const writes = SKILLS.map(async ({ name }) => {
+    const skillDir = join(skillsDir, name);
+    await mkdir(skillDir, { recursive: true });
+    const markdown = renderGeneratedSkillContent(name, nonDottedPaths);
+    await writeFile(join(skillDir, "SKILL.md"), markdown, "utf8");
+  });
+  await Promise.all(writes);
+
+  // Built-in skills use their own name as the directory name.
+  return getGeneratedSkillMetadata();
+}
+
+// Metadata for the built-in skills; each one's directory is named after the skill.
+function getGeneratedSkillMetadata(): ResolvedSkillMetadata[] {
+  return SKILLS.map((skill) => {
+    return { ...skill, directoryName: skill.name };
+  });
+}
+
+async function ensureSkillsDirectory(targetDir: string): Promise<string> {
+  const skillsDir = join(targetDir, ".claude", "skills");
+  await mkdir(skillsDir, { recursive: true });
+  return skillsDir;
+}
+
+/**
+ * Copies every top-level entry of `sourceDir` into `targetDir`, recursing into
+ * directories and overwriting files that already exist at the destination.
+ */
+async function copyDirectoryContents(sourceDir: string, targetDir: string): Promise<void> {
+  const entries = await readdir(sourceDir, { withFileTypes: true });
+
+  const copies = entries.map(({ name }) =>
+    cp(join(sourceDir, name), join(targetDir, name), { recursive: true, force: true })
+  );
+  await Promise.all(copies);
+}
+
+/**
+ * Produces the SKILL.md body for a built-in skill: fills the path-suffix and
+ * inline-script placeholders for the chosen naming scheme, then appends the
+ * YAML schema sections mapped to the skill, if any.
+ *
+ * @throws Error when no generated content exists for `skillName`.
+ */
+function renderGeneratedSkillContent(skillName: string, nonDottedPaths: boolean): string {
+  const template = SKILL_CONTENT[skillName];
+  if (!template) {
+    throw new Error(`Missing generated skill content for ${skillName}`);
+  }
+
+  // Placeholder values for the selected naming scheme.
+  const substitutions: [placeholder: string, value: string][] = nonDottedPaths
+    ? [
+        ["{{FLOW_SUFFIX}}", "__flow"],
+        ["{{APP_SUFFIX}}", "__app"],
+        ["{{RAW_APP_SUFFIX}}", "__raw_app"],
+        [
+          "{{INLINE_SCRIPT_NAMING}}",
+          "Inline script files should NOT include `.inline_script.` in their names (e.g. use `a.ts`, not `a.inline_script.ts`).",
+        ],
+      ]
+    : [
+        ["{{FLOW_SUFFIX}}", ".flow"],
+        ["{{APP_SUFFIX}}", ".app"],
+        ["{{RAW_APP_SUFFIX}}", ".raw_app"],
+        [
+          "{{INLINE_SCRIPT_NAMING}}",
+          "Inline script files use the `.inline_script.` naming convention (e.g. `a.inline_script.ts`).",
+        ],
+      ];
+  const skillContent = substitutions.reduce(
+    (text, [placeholder, value]) => text.replaceAll(placeholder, value),
+    template
+  );
+
+  // Mappings whose schema key has no YAML in SCHEMAS are skipped.
+  const schemaDocs = (SCHEMA_MAPPINGS[skillName] ?? []).flatMap((mapping) => {
+    const schemaYaml = SCHEMAS[mapping.schemaKey];
+    return schemaYaml ? [formatSchemaForMarkdown(schemaYaml, mapping.name, mapping.filePattern)] : [];
+  });
+
+  // Without any schema section the content is returned unchanged.
+  return schemaDocs.length === 0 ? skillContent : `${skillContent}\n\n${schemaDocs.join("\n\n")}`;
+}
+
+/**
+ * Lists the skills found in `skillsDir`: every sub-directory that contains a
+ * SKILL.md, visited in name order, with metadata parsed from that file.
+ */
+async function readSkillMetadataFromDirectory(skillsDir: string): Promise<ResolvedSkillMetadata[]> {
+  const entries = await readdir(skillsDir, { withFileTypes: true });
+  const directories = entries
+    .filter((entry) => entry.isDirectory())
+    .sort((left, right) => left.name.localeCompare(right.name));
+
+  const skills: ResolvedSkillMetadata[] = [];
+  for (const { name } of directories) {
+    const skillPath = join(skillsDir, name, "SKILL.md");
+    // Directories without a SKILL.md are not skills and are skipped.
+    if (await stat(skillPath).catch(() => null)) {
+      skills.push(parseSkillMetadata(await readFile(skillPath, "utf8"), name));
+    }
+  }
+  return skills;
+}
+
+/**
+ * Extracts `name` and `description` from the front matter of a SKILL.md file
+ * (a leading `---` fenced block of `key: value` lines). Fields that are
+ * missing or empty, and files without front matter, fall back to values
+ * derived from `fallbackName`; `directoryName` is always `fallbackName`.
+ */
+function parseSkillMetadata(content: string, fallbackName: string): ResolvedSkillMetadata {
+  const fields = new Map<string, string>();
+  const frontMatter = /^---\s*\n([\s\S]*?)\n---/.exec(content);
+
+  // Without front matter there are no lines to scan and every fallback applies.
+  for (const line of frontMatter ? frontMatter[1].split("\n") : []) {
+    const colon = line.indexOf(":");
+    if (colon === -1) {
+      continue;
+    }
+
+    const value = line.slice(colon + 1).trim();
+    // Empty values are dropped so they never override a fallback; for repeated
+    // keys the last non-empty value wins.
+    if (value) {
+      fields.set(line.slice(0, colon).trim(), value);
+    }
+  }
+
+  return {
+    name: fields.get("name") ?? fallbackName,
+    description: fields.get("description") ?? `Skill loaded from ${fallbackName}`,
+    directoryName: fallbackName,
+  };
+}
+
+// Writes the file unless it exists and `overwrite` is off; resolves to whether it wrote.
+async function writeProjectGuidanceFile(options: {
+  targetPath: string;
+  content: string;
+  overwrite: boolean;
+}): Promise<boolean> {
+  const keep = !options.overwrite && (await stat(options.targetPath).catch(() => null)) !== null;
+  if (!keep) {
+    await writeFile(options.targetPath, options.content, "utf8");
+  }
+  return !keep;
+}
+
+// Formats one schema as a markdown section: a heading naming the schema and
+// its file pattern, followed by the trimmed YAML in a fenced code block.
+function formatSchemaForMarkdown(
+  schemaYaml: string,
+  schemaName: string,
+  filePattern: string
+): string {
+  const heading = `## ${schemaName} (\`${filePattern}\`)`;
+  const intro = "Must be a YAML file that adheres to the following schema:";
+  const fence = "`".repeat(3);
+  const codeBlock = [`${fence}yaml`, schemaYaml.trim(), fence].join("\n");
+  return [heading, intro, codeBlock].join("\n\n");
+}
diff --git a/cli/test-skills/README.md b/cli/test-skills/README.md
deleted file mode 100644
index 7ca6599215..0000000000
--- a/cli/test-skills/README.md
+++ /dev/null
@@ -1,103 +0,0 @@
-# Windmill Skill Invocation Tests
-
-Test suite for verifying that Claude Code correctly invokes Windmill auto-generated skills based on user prompts.
-
-## Overview
-
-This framework tests skill invocation behavior by sending prompts through the Claude Agent SDK and verifying that the expected skills are invoked. Users must provide their own `.claude/skills` folder containing auto-generated Windmill skills.
-
-## Prerequisites
-
-- [Bun](https://bun.sh/) installed
-- `ANTHROPIC_API_KEY` environment variable set
-- Auto-generated Windmill skills placed in `.claude/skills/`
-
-## User Setup
-
-1. Create a `test-folder` directory inside `cli/test-skills/` and copy your auto-generated Windmill skills into it:
-
-```
-cli/test-skills/
-└── test-folder/
-    └── .claude/
-        └── skills/
-            ├── write-flow/
-            │   └── SKILL.md
-            ├── write-script-python3/
-            │   └── SKILL.md
-            ├── write-script-bun/
-            │   └── SKILL.md
-            ├── schedules/
-            │   └── SKILL.md
-            └── triggers/
-                └── SKILL.md
-```
-
-2. Set your API key:
-```bash
-export ANTHROPIC_API_KEY=your-key-here
-```
-
-3. Install dependencies and run tests:
-```bash
-cd cli/test-skills
-bun install
-bun test
-```
-
-## Expected Skills
-
-The tests expect the following auto-generated skills to be present:
-
-| Skill Name | Purpose |
-|------------|---------|
-| `write-flow` | Creating Windmill flows/workflows |
-| `write-script-python3` | Creating Python scripts |
-| `write-script-bun` | Creating TypeScript/Bun scripts |
-| `schedules` | Configuring schedules and cron jobs |
-| `triggers` | Setting up triggers (webhook, Kafka, etc.) |
-
-## Test Matrix
-
-| Prompt | Expected Skill |
-|--------|----------------|
-| "Create a flow to process user data" | `write-flow` |
-| "Build a workflow that fetches and transforms data" | `write-flow` |
-| "Write a Python script to fetch API data" | `write-script-python3` |
-| "Create a Python function to process CSV files" | `write-script-python3` |
-| "Write a TypeScript script using Bun" | `write-script-bun` |
-| "Create a Bun script to handle webhooks" | `write-script-bun` |
-| "Set up a schedule to run this daily at midnight" | `schedules` |
-| "Configure a cron job to run every hour" | `schedules` |
-| "Set up a webhook trigger for this flow" | `triggers` |
-| "Configure a Kafka trigger" | `triggers` |
-
-## Running Tests
-
-Run all tests:
-```bash
-bun test
-```
-
-Run only skill invocation tests:
-```bash
-bun test:skills
-```
-
-## Test Utilities
-
-The `src/test-utils.ts` module provides:
-
-- `runPromptAndCapture(prompt, cwd?, maxTurns)` - Runs a prompt and captures tool invocations
-- `wasToolUsed(result, toolName)` - Checks if a specific tool was used
-- `wasSkillInvoked(result, skillName)` - Checks if a specific skill was invoked
-- `getToolInputs(result, toolName)` - Gets all inputs for a specific tool
-- `getTestSkillsDir()` - Returns the test-skills directory path
-
-## Notes
-
-- Tests have extended timeouts (120 seconds) due to API latency
-- Tests run against the actual Claude API, so they consume API credits
-- Tests verify skill invocation, not skill execution
-- The working directory for tests is `test-folder/` (where `.claude/skills` should be placed)
-- Tests will fail with a clear error if `test-folder/` or `test-folder/.claude/skills/` don't exist
diff --git a/cli/test-skills/bun.lock b/cli/test-skills/bun.lock
deleted file mode 100644
index 9e4443cca8..0000000000
--- a/cli/test-skills/bun.lock
+++ /dev/null
@@ -1,61 +0,0 @@
-{
-  "lockfileVersion": 1,
-  "configVersion": 1,
-  "workspaces": {
-    "": {
-      "name": "claude-code-skill-tests",
-      "dependencies": {
-        "@anthropic-ai/claude-agent-sdk": "^0.2.25",
-      },
-      "devDependencies": {
-        "@types/bun": "latest",
-        "typescript": "^5.0.0",
-      },
-    },
-  },
-  "packages": {
-    "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.25", "", { "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.33.5", "@img/sharp-darwin-x64": "^0.33.5", "@img/sharp-linux-arm": "^0.33.5", "@img/sharp-linux-arm64": "^0.33.5", "@img/sharp-linux-x64": "^0.33.5", "@img/sharp-linuxmusl-arm64": "^0.33.5", "@img/sharp-linuxmusl-x64": "^0.33.5", "@img/sharp-win32-x64": "^0.33.5" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-YIP3I40+XSkC3zE1Z8KRQY02VA7UfofFamF1cFrLe7FbtCnjpslyDl9coGBh2DAi9xj2yQcKZZf751jEWpB+dQ=="],
-
-    "@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.0.4" }, "os": "darwin", "cpu": "arm64" }, "sha512-UT4p+iz/2H4twwAoLCqfA9UH5pI6DggwKEGuaPy7nCVQ8ZsiY5PIcrRvD1DzuY3qYL07NtIQcWnBSY/heikIFQ=="],
-
-    "@img/sharp-darwin-x64": ["@img/sharp-darwin-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-x64": "1.0.4" }, "os": "darwin", "cpu": "x64" }, "sha512-fyHac4jIc1ANYGRDxtiqelIbdWkIuQaI84Mv45KvGRRxSAa7o7d1ZKAOBaYbnepLC1WqxfpimdeWfvqqSGwR2Q=="],
-
-    "@img/sharp-libvips-darwin-arm64": ["@img/sharp-libvips-darwin-arm64@1.0.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-XblONe153h0O2zuFfTAbQYAX2JhYmDHeWikp1LM9Hul9gVPjFY427k6dFEcOL72O01QxQsWi761svJ/ev9xEDg=="],
-
-    "@img/sharp-libvips-darwin-x64": ["@img/sharp-libvips-darwin-x64@1.0.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-xnGR8YuZYfJGmWPvmlunFaWJsb9T/AO2ykoP3Fz/0X5XV2aoYBPkX6xqCQvUTKKiLddarLaxpzNe+b1hjeWHAQ=="],
-
-    "@img/sharp-libvips-linux-arm": ["@img/sharp-libvips-linux-arm@1.0.5", "", { "os": "linux", "cpu": "arm" }, "sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g=="],
-
-    "@img/sharp-libvips-linux-arm64": ["@img/sharp-libvips-linux-arm64@1.0.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA=="],
-
-    "@img/sharp-libvips-linux-x64": ["@img/sharp-libvips-linux-x64@1.0.4", "", { "os": "linux", "cpu": "x64" }, "sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw=="],
-
-    "@img/sharp-libvips-linuxmusl-arm64": ["@img/sharp-libvips-linuxmusl-arm64@1.0.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA=="],
-
-    "@img/sharp-libvips-linuxmusl-x64": ["@img/sharp-libvips-linuxmusl-x64@1.0.4", "", { "os": "linux", "cpu": "x64" }, "sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw=="],
-
-    "@img/sharp-linux-arm": ["@img/sharp-linux-arm@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm": "1.0.5" }, "os": "linux", "cpu": "arm" }, "sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ=="],
-
-    "@img/sharp-linux-arm64": ["@img/sharp-linux-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm64": "1.0.4" }, "os": "linux", "cpu": "arm64" }, "sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA=="],
-
-    "@img/sharp-linux-x64": ["@img/sharp-linux-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-x64": "1.0.4" }, "os": "linux", "cpu": "x64" }, "sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA=="],
-
-    "@img/sharp-linuxmusl-arm64": ["@img/sharp-linuxmusl-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-arm64": "1.0.4" }, "os": "linux", "cpu": "arm64" }, "sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g=="],
-
-    "@img/sharp-linuxmusl-x64": ["@img/sharp-linuxmusl-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-x64": "1.0.4" }, "os": "linux", "cpu": "x64" }, "sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw=="],
-
-    "@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.33.5", "", { "os": "win32", "cpu": "x64" }, "sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg=="],
-
-    "@types/bun": ["@types/bun@1.3.8", "", { "dependencies": { "bun-types": "1.3.8" } }, "sha512-3LvWJ2q5GerAXYxO2mffLTqOzEu5qnhEAlh48Vnu8WQfnmSwbgagjGZV6BoHKJztENYEDn6QmVd949W4uESRJA=="],
-
-    "@types/node": ["@types/node@25.1.0", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-t7frlewr6+cbx+9Ohpl0NOTKXZNV9xHRmNOvql47BFJKcEG1CxtxlPEEe+gR9uhVWM4DwhnvTF110mIL4yP9RA=="],
-
-    "bun-types": ["bun-types@1.3.8", "", { "dependencies": { "@types/node": "*" } }, "sha512-fL99nxdOWvV4LqjmC+8Q9kW3M4QTtTR1eePs94v5ctGqU8OeceWrSUaRw3JYb7tU3FkMIAjkueehrHPPPGKi5Q=="],
-
-    "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
-
-    "undici-types": ["undici-types@7.16.0", "", {}, "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="],
-
-    "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="],
-  }
-}
diff --git a/cli/test-skills/package.json b/cli/test-skills/package.json
deleted file mode 100644
index 1d839ab23c..0000000000
--- a/cli/test-skills/package.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "name": "claude-code-skill-tests",
-  "version": "1.0.0",
-  "type": "module",
-  "scripts": {
-    "test": "bun test",
-    "test:skills": "bun test src/skill-invocation.test.ts"
-  },
-  "dependencies": {
-    "@anthropic-ai/claude-agent-sdk": "^0.2.25"
-  },
-  "devDependencies": {
-    "@types/bun": "latest",
-    "typescript": "^5.0.0"
-  }
-}
diff --git a/cli/test-skills/src/skill-invocation.test.ts b/cli/test-skills/src/skill-invocation.test.ts
deleted file mode 100644
index 9633eb2ef5..0000000000
--- a/cli/test-skills/src/skill-invocation.test.ts
+++ /dev/null
@@ -1,91 +0,0 @@
-import { describe, test, expect, beforeAll } from "bun:test";
-import { runPromptAndCapture, wasSkillInvoked, wasToolUsed, validateTestFolder } from "./test-utils";
-
-describe("Windmill Skill Invocation", () => {
-  beforeAll(() => {
-    if (!process.env.ANTHROPIC_API_KEY) {
-      throw new Error("ANTHROPIC_API_KEY environment variable is required");
-    }
-    validateTestFolder();
-  });
-
-  describe("Flow Creation", () => {
-    test("'Create a Windmill flow' should invoke write-flow skill", async () => {
-      const result = await runPromptAndCapture(
-        "Create a Windmill flow that fetches data from an API and transforms it. Use placeholder URLs.",
-        undefined,
-        3
-      );
-
-      console.log("Tools used:", result.toolsUsed.map(t => t.tool));
-      console.log("Skills invoked:", result.skillsInvoked);
-
-      expect(wasToolUsed(result, "Skill")).toBe(true);
-      expect(wasSkillInvoked(result, "write-flow")).toBe(true);
-    }, { timeout: 120000 });
-  });
-
-  describe("Python Script Creation", () => {
-    test("'Write a Windmill Python script' should invoke write-script-python3 skill", async () => {
-      const result = await runPromptAndCapture(
-        "Write a Windmill Python script that fetches data from https://api.example.com/users",
-        undefined,
-        3
-      );
-
-      console.log("Tools used:", result.toolsUsed.map(t => t.tool));
-      console.log("Skills invoked:", result.skillsInvoked);
-
-      expect(wasToolUsed(result, "Skill")).toBe(true);
-      expect(wasSkillInvoked(result, "write-script-python3")).toBe(true);
-    }, { timeout: 120000 });
-  });
-
-  describe("Bun Script Creation", () => {
-    test("'Write a Windmill Bun/TypeScript script' should invoke write-script-bun skill", async () => {
-      const result = await runPromptAndCapture(
-        "Write a Windmill Bun script that processes JSON data",
-        undefined,
-        3
-      );
-
-      console.log("Tools used:", result.toolsUsed.map(t => t.tool));
-      console.log("Skills invoked:", result.skillsInvoked);
-
-      expect(wasToolUsed(result, "Skill")).toBe(true);
-      expect(wasSkillInvoked(result, "write-script-bun")).toBe(true);
-    }, { timeout: 120000 });
-  });
-
-  describe("Schedule Configuration", () => {
-    test("'Create a Windmill schedule' should invoke schedules skill", async () => {
-      const result = await runPromptAndCapture(
-        "Create a Windmill schedule that runs a script daily at midnight",
-        undefined,
-        3
-      );
-
-      console.log("Tools used:", result.toolsUsed.map(t => t.tool));
-      console.log("Skills invoked:", result.skillsInvoked);
-
-      expect(wasToolUsed(result, "Skill")).toBe(true);
-      expect(wasSkillInvoked(result, "schedules")).toBe(true);
-    }, { timeout: 120000 });
-  });
-
-  describe("Trigger Configuration", () => {
-    test("'Set up a Windmill webhook trigger' should invoke triggers skill", async () => {
-      const result = await runPromptAndCapture(
-        "Set up a Windmill HTTP trigger for a flow at /api/webhook",
-        undefined,
-        3
-      );
-
-      console.log("Tools used:", result.toolsUsed.map(t => t.tool));
-      console.log("Skills invoked:", result.skillsInvoked);
-
-      expect(wasToolUsed(result, "Skill")).toBe(true);
-      expect(wasSkillInvoked(result, "triggers")).toBe(true);
-    }, { timeout: 120000 });
-  });
-});
diff --git a/cli/test-skills/src/test-utils.ts b/cli/test-skills/src/test-utils.ts
deleted file mode 100644
index dbf2ea4be9..0000000000
--- a/cli/test-skills/src/test-utils.ts
+++ /dev/null
@@ -1,137 +0,0 @@
-import { query, type Options } from "@anthropic-ai/claude-agent-sdk";
-import { existsSync } from "fs";
-import { join } from "path";
-
-export interface ToolInvocation {
-  tool: string;
-  input: Record<string, unknown>;
-  timestamp: number;
-}
-
-export interface TestResult {
-  toolsUsed: ToolInvocation[];
-  skillsInvoked: string[];
-  output: string;
-}
-
-/**
- * Get the test-skills directory path
- */
-export function getTestSkillsDir(): string {
-  return new URL("..", import.meta.url).pathname;
-}
-
-/**
- * Get the test-folder directory path (where user places .claude/skills)
- */
-export function getTestFolder(): string {
-  return join(getTestSkillsDir(), "test-folder");
-}
-
-/**
- * Validate that test-folder exists and has .claude/skills
- * Throws an error if validation fails
- */
-export function validateTestFolder(): void {
-  const testFolder = getTestFolder();
-  const skillsFolder = join(testFolder, ".claude", "skills");
-
-  if (!existsSync(testFolder)) {
-    throw new Error(
-      `test-folder does not exist at: ${testFolder}\n` +
-      `Please create it and add your .claude/skills directory inside.`
-    );
-  }
-
-  if (!existsSync(skillsFolder)) {
-    throw new Error(
-      `.claude/skills directory not found in test-folder at: ${skillsFolder}\n` +
-      `Please add your auto-generated Windmill skills to test-folder/.claude/skills/`
-    );
-  }
-}
-
-/**
- * Runs a prompt through the Claude Agent SDK and captures tool invocations
- * Uses test-folder as cwd where user-provided skills are located
- */
-export async function runPromptAndCapture(
-  prompt: string,
-  cwd?: string,
-  maxTurns: number = 3
-): Promise<TestResult> {
-  const workingDir = cwd ?? getTestFolder();
-  const toolsUsed: ToolInvocation[] = [];
-  const skillsInvoked: string[] = [];
-  let output = "";
-
-  const options: Options = {
-    cwd: workingDir,
-    model: "haiku",
-    maxTurns,
-    settingSources: ["project"],  // Required to load Skills from filesystem
-    allowedTools: ["Skill", "Read", "Glob", "Grep", "Bash", "Write", "Edit"],
-  };
-
-  for await (const message of query({ prompt, options })) {
-    if (message.type === "assistant") {
-      // The assistant message has a BetaMessage which contains content blocks
-      const content = message.message?.content;
-      if (Array.isArray(content)) {
-        for (const block of content) {
-          if (block.type === "tool_use") {
-            const toolInvocation: ToolInvocation = {
-              tool: block.name,
-              input: block.input as Record<string, unknown>,
-              timestamp: Date.now(),
-            };
-            toolsUsed.push(toolInvocation);
-
-            // Check if this is a Skill tool invocation
-            if (block.name === "Skill" && typeof block.input === "object" && block.input !== null) {
-              const skillInput = block.input as { skill?: string };
-              if (skillInput.skill) {
-                skillsInvoked.push(skillInput.skill);
-              }
-            }
-          } else if (block.type === "text") {
-            output += block.text;
-          }
-        }
-      }
-    } else if (message.type === "result") {
-      // Capture final result if available
-      const resultMessage = message as { result?: string };
-      if (typeof resultMessage.result === "string") {
-        output += resultMessage.result;
-      }
-    }
-  }
-
-  return {
-    toolsUsed,
-    skillsInvoked,
-    output,
-  };
-}
-
-/**
- * Helper to check if a specific tool was used
- */
-export function wasToolUsed(result: TestResult, toolName: string): boolean {
-  return result.toolsUsed.some((t) => t.tool === toolName);
-}
-
-/**
- * Helper to check if a specific skill was invoked
- */
-export function wasSkillInvoked(result: TestResult, skillName: string): boolean {
-  return result.skillsInvoked.some((s) => s === skillName || s.includes(skillName));
-}
-
-/**
- * Helper to get all tool inputs for a specific tool
- */
-export function getToolInputs(result: TestResult, toolName: string): Record<string, unknown>[] {
-  return result.toolsUsed.filter((t) => t.tool === toolName).map((t) => t.input);
-}
diff --git a/cli/test-skills/tsconfig.json b/cli/test-skills/tsconfig.json
deleted file mode 100644
index 45f0069307..0000000000
--- a/cli/test-skills/tsconfig.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "compilerOptions": {
-    "target": "ES2022",
-    "module": "ESNext",
-    "moduleResolution": "bundler",
-    "strict": true,
-    "esModuleInterop": true,
-    "skipLibCheck": true,
-    "forceConsistentCasingInFileNames": true,
-    "outDir": "./dist",
-    "rootDir": "./src",
-    "declaration": true,
-    "types": ["bun-types"]
-  },
-  "include": ["src/**/*"],
-  "exclude": ["node_modules", "dist"]
-}
diff --git a/cli/test/guidance_writer_unit.test.ts b/cli/test/guidance_writer_unit.test.ts
new file mode 100644
index 0000000000..dbe541e65a
--- /dev/null
+++ b/cli/test/guidance_writer_unit.test.ts
@@ -0,0 +1,148 @@
+import { describe, expect, test } from "bun:test";
+import { mkdtemp, mkdir, readFile, rm, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { writeAiGuidanceFiles } from "../src/guidance/writer.ts";
+
+async function withTempDir(fn: (tempDir: string) => Promise<void>): Promise<void> {
+  const tempDir = await mkdtemp(join(tmpdir(), "wmill_guidance_writer_"));
+  try {
+    await fn(tempDir);
+  } finally {
+    await rm(tempDir, { recursive: true, force: true });
+  }
+}
+
+async function writeSkill(
+  rootDir: string,
+  skillName: string,
+  content: string
+): Promise<string> {
+  const skillPath = join(rootDir, skillName, "SKILL.md");
+  await mkdir(join(rootDir, skillName), { recursive: true });
+  await writeFile(skillPath, content, "utf8");
+  return skillPath;
+}
+
+describe("writeAiGuidanceFiles", () => {
+  test("preserves custom skills when refreshing generated guidance", async () => {
+    await withTempDir(async (tempDir) => {
+      const skillsDir = join(tempDir, ".claude", "skills");
+      const customSkillContent = `---
+name: custom-skill
+description: Custom skill
+---
+
+Preserve me.
+`;
+      const staleGeneratedContent = "stale generated skill";
+
+      const customSkillPath = await writeSkill(skillsDir, "custom-skill", customSkillContent);
+      const generatedSkillPath = await writeSkill(skillsDir, "write-flow", staleGeneratedContent);
+
+      await writeAiGuidanceFiles({
+        targetDir: tempDir,
+        overwriteProjectGuidance: false,
+      });
+
+      expect(await readFile(customSkillPath, "utf8")).toBe(customSkillContent);
+
+      const generatedSkillContent = await readFile(generatedSkillPath, "utf8");
+      expect(generatedSkillContent).not.toBe(staleGeneratedContent);
+      expect(generatedSkillContent).toContain("name: write-flow");
+    });
+  });
+
+  test("preserves custom skills when copying a skill bundle from source", async () => {
+    await withTempDir(async (tempDir) => {
+      const skillsDir = join(tempDir, ".claude", "skills");
+      const customSkillContent = `---
+name: custom-skill
+description: Custom skill
+---
+
+Keep this local skill.
+`;
+      const sourceSkillContent = `---
+name: write-flow
+description: Replacement flow skill
+---
+
+Copied from source.
+`;
+      const bundleOnlySkillContent = `---
+name: bundle-only
+description: Bundle only skill
+---
+
+Copied from source bundle.
+`;
+
+      const customSkillPath = await writeSkill(skillsDir, "custom-skill", customSkillContent);
+      const existingGeneratedSkillPath = await writeSkill(skillsDir, "write-flow", "old content");
+      const sourceSkillsDir = join(tempDir, "source-skills");
+
+      await writeSkill(sourceSkillsDir, "write-flow", sourceSkillContent);
+      const bundleOnlySkillPath = await writeSkill(
+        sourceSkillsDir,
+        "bundle-only",
+        bundleOnlySkillContent
+      );
+
+      await writeAiGuidanceFiles({
+        targetDir: tempDir,
+        overwriteProjectGuidance: false,
+        skillsSourcePath: sourceSkillsDir,
+      });
+
+      expect(await readFile(customSkillPath, "utf8")).toBe(customSkillContent);
+      expect(await readFile(existingGeneratedSkillPath, "utf8")).toBe(sourceSkillContent);
+      expect(await readFile(bundleOnlySkillPath.replace(sourceSkillsDir, skillsDir), "utf8")).toBe(
+        bundleOnlySkillContent
+      );
+    });
+  });
+
+  test("builds AGENTS skill references from copied directory names", async () => {
+    await withTempDir(async (tempDir) => {
+      const sourceSkillsDir = join(tempDir, "source-skills");
+      await writeSkill(
+        sourceSkillsDir,
+        "custom-folder",
+        `---
+name: write-flow
+description: Custom bundle skill
+---
+
+Copied from source bundle.
+`
+      );
+
+      await writeAiGuidanceFiles({
+        targetDir: tempDir,
+        overwriteProjectGuidance: false,
+        skillsSourcePath: sourceSkillsDir,
+      });
+
+      const agentsMd = await readFile(join(tempDir, "AGENTS.md"), "utf8");
+      expect(agentsMd).toContain(".claude/skills/custom-folder/SKILL.md");
+      expect(agentsMd).not.toContain(".claude/skills/write-flow/SKILL.md");
+    });
+  });
+
+  test("writes AGENTS.md and CLAUDE.md even if skills creation fails", async () => {
+    await withTempDir(async (tempDir) => {
+      await writeFile(join(tempDir, ".claude"), "not a directory\n", "utf8");
+
+      await expect(
+        writeAiGuidanceFiles({
+          targetDir: tempDir,
+          overwriteProjectGuidance: false,
+        })
+      ).rejects.toThrow();
+
+      expect(await readFile(join(tempDir, "AGENTS.md"), "utf8")).toContain(".claude/skills/");
+      expect(await readFile(join(tempDir, "CLAUDE.md"), "utf8")).toContain("@AGENTS.md");
+    });
+  });
+});
diff --git a/docs/failing-tests.md b/docs/failing-tests.md
new file mode 100644
index 0000000000..d0ae44f109
--- /dev/null
+++ b/docs/failing-tests.md
@@ -0,0 +1,33 @@
+# Failing Tests
+
+This file tracks benchmark cases that still fail or need follow-up validation.
+
+## Flow
+
+- `flow-test6-ai-agent-tools`
+  - Latest failing run: `ai_evals/results/2026-04-09T11-25-24.107Z__flow`
+  - Issues:
+    - final output does not include the actions or tool-result details the prompt asks for
+    - `open_support_ticket` contains a syntax bug
+
+- `flow-test7-simple-modification`
+  - Latest failing run: `ai_evals/results/2026-04-09T11-25-24.107Z__flow`
+  - Issues:
+    - `validate_data` was added, but the failure behavior still does not match the requested contract
+    - `save_results` throws instead of returning a graceful structured result
+
+- `flow-test11-preprocessor-and-failure-handler`
+  - Latest failing run: `ai_evals/results/2026-04-09T11-25-24.107Z__flow`
+  - Issues:
+    - the model creates regular `preprocessor` and `failure` modules
+    - it does not use Windmill's special top-level `preprocessor_module` and `failure_module`
+
+## Needs Reconfirmation
+
+- `flow-test4-order-processing-loop`
+  - Full-suite failing run: `ai_evals/results/2026-04-09T11-25-24.107Z__flow`
+  - Follow-up passing run after prompt improvement: `ai_evals/results/2026-04-09T13-29-15.877Z__flow`
+  - Note:
+    - this case failed on invalid `branchone` downstream result access
+    - it passed after adding explicit branch-output guidance to the flow prompt
+    - rerun the full flow suite to confirm the fix holds in the broader benchmark
diff --git a/docs/system-prompt-testing-plan.md b/docs/system-prompt-testing-plan.md
new file mode 100644
index 0000000000..9b12f1c5e0
--- /dev/null
+++ b/docs/system-prompt-testing-plan.md
@@ -0,0 +1,1000 @@
+# System Prompt And Skill Output Testing Plan
+
+Historical note:
+
+- This file is a planning document and no longer matches the current benchmark CLI in every detail.
+- The current source of truth is [ai_evals/README.md](../ai_evals/README.md) and [system-prompt-testing-status.md](system-prompt-testing-status.md).
+- In particular, the current tool no longer has the old variants, compare, or history workflow described below.
+
+## Goal
+
+Build a single testing strategy that answers one question reliably:
+
+> Given a user task, how good is the artifact produced by our AI system?
+
+This plan is intentionally focused on **black-box output evaluation**, not on unit testing frontend or CLI internals.
+
+The intended end state is a **new repo-level benchmark CLI** that runs a shared
+eval suite across multiple surfaces.
+
+That benchmark CLI should be the main entrypoint for:
+
+- running one case
+- running a benchmark set
+- comparing baseline vs candidate variants
+- writing benchmark history snapshots
+
+Frontend and Windmill CLI are not meant to become separate testing products.
+They should be implemented as adapters behind this shared benchmark CLI.
+
+The system under test is:
+
+- Frontend AI Chat in `script`, `flow`, and `app` modes
+- CLI local development experience driven by generated guidance and skills
+
+The artifact under test is:
+
+- Script code
+- Flow JSON / module structure
+- Raw app files and backend runnables
+- Files and project artifacts produced in a local CLI workspace
+
+## Non-Goals
+
+This plan does **not** treat the following as the main testing target:
+
+- Unit testing helper functions, stores, or tool wrapper internals
+- UI rendering behavior, DOM interactions, or component-level correctness
+- `wmill init` correctness as a standalone product area
+- Backend route correctness except where it affects prompt delivery or AI configuration
+
+Those may still need lightweight tests, but they are not the core of prompt reliability evaluation.
+
+## Core Principles
+
+### 1. Black-box evaluation only
+
+The runner should provide an input task to the real system setup, let it run, collect the final artifact, and score the result.
+
+In practice, this runner should be exposed through the new repo-level benchmark
+CLI rather than through separate ad hoc test commands for each surface.
+
+### 2. Headless execution
+
+Frontend evaluation must be fully decoupled from the browser UI. It should exercise prompt assembly, tool selection, and tool execution logic without mounting Svelte components or clicking through the app.
+
+### 3. Real prompt environment
+
+All evals must use the same prompt-building path, tool definitions, and skill content that production uses, or a clearly defined variant of them.
+
+### 4. Artifact-first scoring
+
+The main score is based on the produced artifact, not on intermediate transcripts.
+
+### 5. Reliability over one-off success
+
+A prompt is not "good" because it passed once. Reliability means pass rate across repeated runs and across a representative case set.
+
+### 6. Track benchmark history over time
+
+The suite must not only evaluate the current output. It must also produce a
+git-tracked benchmark history so the team can see whether the system is
+improving over time.
+
+This history should focus on official benchmark snapshots, not on every local
+experiment.
+
+### 7. Shared corpus, separate adapters
+
+Frontend and CLI should share the same evaluation corpus format when possible, but each surface should have its own execution adapter.
+
+### 8. CLI first, frontend second
+
+The CLI should be the first surface brought to a high-confidence benchmark
+state.
+
+It is the cleanest foundation for the suite because it produces direct files in
+an isolated workspace, has less ambiguity than the frontend, and is easier to
+score deterministically.
+
+Frontend should reuse the benchmark model proven on the CLI rather than define
+a parallel testing philosophy.
+
+### 9. UI comes last
+
+The testing suite must exist and be trustworthy before building a studio UI on top of it.
+
+## Current State
+
+## Shared Prompt Source Of Truth
+
+The repo already has the right content split:
+
+- `system_prompts/` is the shared source of truth for core Windmill prompt content
+- frontend adds chat-specific tool instructions on top
+- CLI materializes guidance and skill content from generated outputs
+
+This is a strong foundation for a shared eval suite.
+
+## Execution Priority
+
+Even though the repo already has useful frontend eval scaffolding, the
+implementation priority should be:
+
+1. build the repo-level benchmark CLI and use the Windmill CLI adapter as the
+   first implementation behind it
+2. make the CLI artifact-evaluation path excellent
+3. stabilize shared scoring, reporting, and benchmark history around that path
+4. bring frontend onto the same benchmark model through the same benchmark CLI
+5. build the UI only after the underlying suite is trustworthy
+
+This keeps the hardest product question focused on artifact quality rather than
+on UI workflow.
+
+## Benchmark CLI As The Main Product
+
+The testing suite should have one primary interface:
+
+- a new repo-level benchmark CLI
+
+The benchmark CLI should be able to run:
+
+- Windmill CLI evals
+- frontend evals
+- shared reporting and comparison commands
+
+Illustrative command shape:
+
+```bash
+ai-evals run --surface cli --case bun-hello-script
+ai-evals run --surface frontend-flow --case support-flow
+ai-evals compare --surface cli --variant baseline --variant candidate-a
+ai-evals history latest
+```
+
+The exact binary name can change, but the architecture should not:
+
+- one benchmark CLI
+- shared case loader
+- shared scoring
+- shared history writer
+- separate surface adapters underneath
+
+## Temporary Bootstrap Code
+
+This bootstrap phase is now complete for frontend `flow`, `app`, and `script`.
+
+Frontend AI benchmark ownership has moved into `ai_evals/`, and the frontend
+source tree no longer owns a separate AI benchmark suite under
+`frontend/.../__tests__/...`.
+
+Benchmark authors should only need the repo-level benchmark CLI to run the
+long-term suite.
+
+The only temporary frontend-specific piece that remains is a thin Vitest/Vite
+loader bridge so the benchmark runner can import the production chat modules in
+the same module/runtime environment they already expect.
+
+## Frontend: What Exists Today
+
+The current frontend benchmark path is **decoupled from the UI** and now owned
+by `ai_evals`.
+
+The frontend evals currently:
+
+- run through the shared headless chat loop
+- use production prompt builders
+- use production tool definitions
+- use benchmark-owned helper adapters that write to temp workspaces on disk
+- execute through the frontend module/runtime environment only as a loader bridge
+
+This means the current frontend evals are now a proper benchmark adapter,
+not a frontend test suite.
+
+That is the correct direction.
+
+### Frontend Architecture Notes
+
+There are three categories of code involved:
+
+- shared production logic:
+  - production system prompt builders
+  - production tool definitions
+  - production `runChatLoop`
+- benchmark-only infrastructure:
+  - case loading
+  - variant loading
+  - judge scoring
+  - benchmark result shaping
+  - history/reporting integration
+- alternate helper adapters:
+  - production helpers mutate UI/editor state
+  - benchmark helpers mutate temp-workspace files
+
+This is important because the benchmark suite is **not** meant to duplicate the
+frontend chat logic. It is meant to reuse the production chat loop and tool
+definitions while swapping the execution backend from UI state to filesystem
+state.
+
+## Frontend: What Is Missing
+
+### Coverage gaps
+
+- `script` is now exposed through the shared benchmark CLI, but it only has initial case coverage.
+- Existing frontend coverage is still too small relative to the target benchmark corpus.
+
+### Reliability gaps
+
+- Frontend flow and app can already run with pass/fail results and repeated runs through the shared benchmark CLI.
+- The remaining gap is turning that into stronger routine reliability gating with better deterministic validators and broader routine case coverage.
+- Frontend reliability reporting is still less mature than the intended end state for official CI tiers and richer failure triage.
+
+### Prompt-iteration gaps
+
+- Frontend prompt variants are file-backed now, but the repo only ships baseline manifests by default.
+- Creating and curating meaningful frontend candidate variants is still a mostly manual workflow compared with the CLI snapshot flow.
+- Frontend prompt comparison exists through the shared `compare` command, but it still needs broader routine use and better variant coverage.
+
+### Artifact-validation gaps
+
+- The current flow and app helpers are file-backed now, but several effects are still lightweight and should become more realistic over time.
+- Linting and runnable validation are currently too lightweight in the eval path.
+- Datatable interactions are mocked rather than validated as output constraints.
+- The suite does not yet enforce a strong deterministic validator layer before using an LLM judge.
+
+### Corpus gaps
+
+- Frontend surfaces already use shared case manifests under `ai_evals/cases/frontend/`.
+- The remaining gap is breadth and representativeness, not the absence of a shared corpus.
+- Cases still need richer metadata, stronger deterministic constraints, and a larger regression library built from real failures.
+
+### Reporting gaps
+
+- Frontend runs already emit the shared benchmark result shape and can write official history snapshots through the shared benchmark CLI.
+- There is still no rich leaderboard or trend-oriented debugging workflow for frontend surfaces specifically.
+- There is still no strong "worst failures first" report for debugging regressions.
+
+## Frontend: Perfect Testing Logic
+
+The perfect frontend testing logic is described below.
+
+Frontend should not be the place where the benchmark philosophy is invented.
+
+It should consume the shared case format, validator model, reporting format,
+and history format already proven through the CLI path.
+
+### 1. Stay fully headless
+
+Do not mount the chat UI.
+
+Do not click through the frontend.
+
+Do not use Playwright for prompt evaluation.
+
+The runner should directly invoke:
+
+- the production system message builder
+- the production user message builder
+- the production tool list
+- the production chat loop
+
+It is acceptable for the benchmark adapter to use the frontend Vitest/Vite
+runtime as a thin loader bridge when production chat modules still depend on
+that environment, as long as:
+
+- the benchmark entrypoint remains the shared benchmark CLI
+- the benchmark logic and fixtures live under `ai_evals`
+- the frontend source tree does not own a separate benchmark suite
+
+This keeps the suite decorrelated from the frontend UI while still testing the real AI logic.
+
+### 2. Test the three frontend AI surfaces separately
+
+#### Script mode
+
+Input:
+
+- user prompt
+- optional initial script
+- optional context such as selected workspace runnables or DB references
+
+Output:
+
+- final script code
+
+Scoring:
+
+- deterministic validators first
+- LLM judge second
+
+Deterministic validators should include:
+
+- expected entrypoint present
+- syntax / parse validity
+- language-appropriate compile or lint check where feasible
+- required behaviors or structures present
+- forbidden patterns absent
+
+#### Flow mode
+
+Input:
+
+- user prompt
+- optional initial flow
+- optional schema
+- optional workspace context
+
+Output:
+
+- final flow definition
+
+Scoring:
+
+- flow JSON is structurally valid
+- expected module types exist
+- expected branches / loops / tools exist
+- schema shape matches required inputs
+- required data flow connections are present
+- LLM judge scores completeness and overall quality
+
+#### App mode
+
+Input:
+
+- user prompt
+- optional initial app
+- optional workspace context
+
+Output:
+
+- final frontend files
+- final backend runnables
+
+Scoring:
+
+- expected files and runnables exist
+- file structure is coherent
+- app bundle / lint checks pass where feasible in headless mode
+- required UI/backend behaviors are represented in the artifact
+- LLM judge scores completeness and product quality
+
+### 3. Use repeated runs, not single runs
+
+Each case should run more than once.
+
+Recommended starting point:
+
+- PR smoke run: 2 runs per case on a small curated subset
+- nightly reliability run: 5 to 10 runs per case on the full benchmark set
+
+Primary metric:
+
+- pass rate
+
+Secondary metrics:
+
+- average deterministic score
+- average judge score
+- worst-case judge score
+- latency
+- total tool calls
+
+### 4. Keep tool traces as diagnostics only
+
+Tool usage matters for debugging, but it should not be the primary score.
+
+The suite should record:
+
+- tool names
+- tool arguments
+- iteration count
+- model/provider
+
+But the main question remains:
+
+> Was the final artifact good?
+
+### 5. Make prompt variants easy to test
+
+Prompt candidates should not require editing test code.
+
+The suite should support a file-based prompt variant workflow.
+
+Example direction:
+
+- `ai_evals/variants/frontend/script/baseline.md`
+- `ai_evals/variants/frontend/script/candidate-a.md`
+- `ai_evals/variants/frontend/flow/baseline.md`
+- `ai_evals/variants/frontend/app/baseline.md`
+
+Each variant should be runnable side by side against the same case set.
+
+### 6. Separate benchmark cases from test code
+
+Benchmark cases should live in data files, not inline in test files.
+
+Each case should define:
+
+- surface
+- user prompt
+- initial artifact if any
+- required constraints
+- forbidden constraints
+- judge rubric
+- tags
+
+This makes the benchmark editable by prompt authors without changing runner logic.
+
+## CLI: What Exists Today
+
+The current CLI tests prove only one narrow property:
+
+> Given a prompt, does the model invoke the expected skill?
+
+That is useful as a smoke signal, but it is far from sufficient for output evaluation.
+
+The current CLI setup also depends on manual preparation of a `.claude/skills` folder, which makes repeated benchmarking and prompt iteration much harder than necessary.
+
+## CLI: What Is Missing
+
+### Output-evaluation gap
+
+- The current suite does not score the artifact produced by the CLI workflow.
+- It only checks whether a skill was invoked.
+- It does not verify that the resulting files are good.
+
+### Automation gap
+
+- The current setup requires manual copying of generated skills into a test folder.
+- That makes the suite too fragile and too manual for rapid prompt iteration.
+
+### Reliability gap
+
+- There is no repeated-run measurement.
+- There is no pass-rate metric.
+- There is no baseline vs candidate comparison workflow.
+
+### Prompt-variant gap
+
+- There is no first-class way to test alternate skill bundles or alternate generated guidance.
+- There is no clean candidate flow for "I changed skill content, show me whether reliability improved."
+
+### Corpus gap
+
+- CLI cases are not aligned with frontend benchmark cases.
+- There is no shared benchmark language describing the task, initial state, and expected artifact.
+
+### Reporting gap
+
+- There is no stable output report for artifact comparison.
+- There is no failure clustering by skill bundle, task family, or model.
+
+## CLI: Perfect Testing Logic
+
+The ideal CLI testing logic follows the principles below.
+
+The CLI path should be the reference implementation for the suite.
+
+### 1. Evaluate the final artifact, not the skill invocation
+
+Skill invocation should be kept as diagnostic metadata only.
+
+The primary output should be the files produced in a temporary workspace.
+
+Example CLI artifacts:
+
+- generated script files
+- generated flow files
+- raw app project files
+- schedule / trigger config files
+- AGENTS / guidance files only when they are directly relevant to the task
+
+### 2. Create the workspace automatically
+
+The runner should create a fresh temporary project for every case.
+
+It should seed that workspace with:
+
+- initial files for the benchmark case
+- the current generated CLI guidance and skills
+- any fixture data required by the task
+
+It should never depend on a manually maintained test folder.
+
+### 3. Materialize the exact skill bundle under test
+
+The runner should be able to test:
+
+- the current production skill bundle
+- a candidate skill bundle built from prompt changes
+
+For CLI, a "prompt variant" is effectively a skill-bundle variant.
+
+That means the suite should support alternate generated skill content without requiring ad hoc manual copies.
+
+### 4. Score the final workspace
+
+The scoring approach should match the frontend philosophy:
+
+- deterministic validators first
+- LLM judge second
+
+Deterministic validators for CLI should include:
+
+- expected files created
+- expected file names and locations
+- required content patterns present
+- expected artifact type produced
+- optional parse / lint / compile validation where feasible
+
+### 5. Run repeated benchmarks
+
+The CLI should use the same reliability logic as frontend:
+
+- benchmark set
+- repeated runs
+- pass rate
+- baseline vs candidate comparison
+
+### 6. Keep skill traces as diagnostics
+
+Record:
+
+- invoked skills
+- order of invocation
+- turns
+- file changes
+
+But do not let that replace artifact evaluation.
+
+## Perfect Shared Benchmark Model
+
+The frontend and CLI should share the same benchmark concept.
+
+Each evaluation case should define:
+
+- `id`
+- `surface`
+- `user_prompt`
+- `initial_state`
+- `workspace_context`
+- `artifact_checks`
+- `judge_rubric`
+- `tags`
+
+The same task should be runnable on multiple surfaces when it makes sense.
+
+This gives direct comparability between:
+
+- frontend script vs CLI script
+- frontend flow vs CLI flow
+- frontend app vs CLI app
+
+## Recommended Benchmark Categories
+
+The first benchmark set should be broad, but not huge.
+
+Recommended initial size:
+
+- 20 to 30 core cases
+
+Recommended categories:
+
+- from-scratch script creation
+- script modification
+- from-scratch flow creation
+- flow modification
+- from-scratch raw app creation
+- raw app modification
+- reuse of workspace assets
+- tasks requiring datatable awareness
+- tasks requiring constraints or edge-case handling
+- known regressions from real failures
+
+Every category should contain both:
+
+- "easy success" cases
+- "high ambiguity" cases
+
+This is essential for measuring reliability rather than only measuring best-case demos.
+
+## Scoring Model
+
+The suite should use three layers.
+
+## Layer 1: Deterministic Validators
+
+This is the hard gate.
+
+Examples:
+
+- parse succeeds
+- artifact shape is valid
+- required entrypoint exists
+- expected files exist
+- required module types exist
+- expected inputs / schema fields exist
+- forbidden patterns are absent
+
+If layer 1 fails, the run is a failure.
+
+## Layer 2: Task-Specific Validators
+
+These are stronger artifact checks derived from the benchmark case.
+
+Examples:
+
+- flow contains a loop and a conditional branch
+- app includes a reset button path and backend wiring
+- script performs the requested transformation
+
+These should still be deterministic whenever possible.
+
+## Layer 3: LLM Judge
+
+Use an LLM judge only after deterministic validation.
+
+The judge should answer:
+
+- Did the artifact satisfy the request?
+- Is it complete?
+- Is it coherent for Windmill?
+- How close is it to the intended solution?
+
+The judge score is valuable, but it should not be the only oracle.
+
+## Benchmark History
+
+The suite should persist official benchmark summaries in a git-tracked history
+layer so improvements and regressions can be reviewed over time.
+
+## What Should Be Git-Tracked
+
+Only official benchmark outputs should be committed:
+
+- post-merge benchmark snapshots on `main`
+- scheduled nightly benchmark snapshots
+- manually promoted benchmark snapshots when the team wants to record a result
+
+Each official snapshot should produce:
+
+- one detailed run JSON
+- one entry in an append-only summary file
+- regenerated rollups for trend views
+
+## What Should Not Be Git-Tracked
+
+The following should remain local or external by default:
+
+- raw transcripts
+- full model messages
+- large generated artifact bundles
+- ad hoc local experiments
+- temporary comparison runs
+
+This keeps git history focused on stable benchmark signals instead of noisy
+debug output.
+
+## Reliability Metrics
+
+Every prompt or skill candidate should be reported with:
+
+- total cases
+- passes
+- pass rate
+- average judge score
+- median judge score
+- worst-case judge score
+- average latency
+- average turns
+
+Per-case results should also be retained.
+
+This is the minimum needed to compare:
+
+- baseline vs candidate
+- provider vs provider
+- frontend vs CLI
+
+## Benchmark Metrics
+
+The history layer should track metrics in four groups.
+
+## Quality Metrics
+
+- `pass_rate`
+- `deterministic_pass_rate`
+- `judge_score_mean`
+- `judge_score_median`
+- `judge_score_p10`
+- `category_pass_rate`
+
+## Reliability Metrics
+
+- `runs_per_case`
+- `flake_rate`
+- `path_consistency`
+
+## Efficiency Metrics
+
+- `latency_ms_mean`
+- `latency_ms_median`
+- `tokens_prompt_mean`
+- `tokens_completion_mean`
+- `tokens_total_mean`
+- `tool_calls_mean`
+- `iterations_mean`
+- `estimated_cost_mean`
+- `cost_per_success`
+- `latency_per_success`
+
+## Provenance Metrics
+
+- `timestamp`
+- `git_sha`
+- `suite_version`
+- `scoring_version`
+- `surface`
+- `variant_name`
+- `provider`
+- `model`
+- `judge_model`
+
+The provenance metrics are essential. Without them, a trend line can mix prompt
+changes with upstream model drift and become hard to interpret.
+
+## Composite Scores
+
+The suite should not collapse everything into one number.
+
+It should track at least three top-level composite scores:
+
+- `quality_score`
+- `efficiency_score`
+- `value_score`
+
+Recommended interpretation:
+
+- `quality_score`: how good the artifact is
+- `efficiency_score`: how fast and cheap the system is relative to peers
+- `value_score`: quality-adjusted efficiency
+
+These composite scores should sit on top of the raw metrics, not replace them.
+
+## Proposed Suite Architecture
+
+The suite should be built in six layers.
+
+## Layer 1: Benchmark Data
+
+Purpose:
+
+- define the cases once
+
+Contents:
+
+- case files
+- reusable initial fixtures
+- evaluation metadata
+
+## Layer 2: Benchmark CLI
+
+Purpose:
+
+- provide one shared entrypoint for the suite
+
+Responsibilities:
+
+- load cases and variants
+- select a surface adapter
+- run one case or a benchmark set
+- invoke shared scoring and history writing
+- expose comparison and history commands
+
+## Layer 3: Surface Adapters
+
+Purpose:
+
+- run a case against one surface
+
+Adapters:
+
+- frontend-script adapter
+- frontend-flow adapter
+- frontend-app adapter
+- CLI adapter
+
+Responsibilities:
+
+- prepare the correct prompt environment
+- prepare the initial artifact state
+- run the real model loop
+- return the final artifact plus diagnostics
+
+## Layer 4: Scoring And Reporting
+
+Purpose:
+
+- evaluate the final artifact
+- aggregate repeated runs
+- compare variants
+
+Responsibilities:
+
+- deterministic validation
+- LLM judging
+- pass/fail computation
+- result serialization
+- comparison reports
+
+## Layer 5: Benchmark History
+
+Purpose:
+
+- preserve official benchmark summaries over time
+- support trend analysis and regression review
+
+Responsibilities:
+
+- store official run snapshots
+- append benchmark summary entries
+- generate rollups for charts and dashboards
+- keep provenance metadata for every tracked run
+
+## Layer 6: UI Studio
+
+Purpose:
+
+- provide a user interface for the exact same benchmark CLI and runner stack
+
+Important rule:
+
+The UI must not define its own execution semantics.
+
+It must only be a frontend over the same suite used in CI and local benchmarking.
+
+## Proposed Development Order
+
+### Phase 1: Stabilize the benchmark model
+
+Deliverables:
+
+- shared case schema
+- shared result schema
+- initial core benchmark set
+
+### Phase 2: Build the benchmark CLI shell
+
+Deliverables:
+
+- repo-level benchmark CLI entrypoint
+- `run`, `compare`, and `history` command skeletons
+- adapter selection layer
+- temporary wiring to the first CLI adapter
+
+### Phase 3: Replace the CLI smoke suite with real artifact evaluation
+
+Deliverables:
+
+- temp-workspace runner
+- automatic skill-bundle materialization
+- artifact scoring
+- repeated-run support
+- baseline vs candidate skill-bundle comparison
+
+### Phase 4: Add shared reporting and benchmark history around the CLI path
+
+Deliverables:
+
+- baseline vs candidate reports
+- pass-rate summaries
+- worst-failure reports
+- official run schema
+- git-tracked benchmark summary file
+- history snapshot writer
+- rollup generation for trend charts
+
+### Phase 5: Finish the frontend black-box harness on top of the shared model
+
+Deliverables:
+
+- convert current flow and app evals into proper scored reliability tests
+- add script eval support
+- add repeated-run support
+- add prompt-variant loading from files
+- align frontend outputs with the shared result and history format
+- expose frontend runs through the same benchmark CLI
+
+### Phase 6: Add CI tiers
+
+Deliverables:
+
+- fast PR smoke benchmark
+- fuller nightly benchmark
+- official history updates on `main` and scheduled runs
+- manual benchmark mode for prompt authors
+
+### Phase 7: Build the UI studio
+
+Deliverables:
+
+- run selector
+- variant selector
+- per-case comparison view
+- artifact diff view
+- reliability dashboard
+- trend dashboard backed by git-tracked benchmark history
+
+This phase comes last because the UI is only valuable once the underlying suite is stable and trusted.
+
+## Proposed Prompt Variant Workflow
+
+The suite should make it cheap to test new prompt candidates.
+
+Recommended workflow:
+
+1. Edit or add a candidate prompt file.
+2. Run the benchmark against baseline and candidate.
+3. Compare pass rate and score.
+4. Inspect worst regressions first.
+5. Promote only if the candidate improves the benchmark materially.
+
+For CLI, the same workflow applies, but the tested unit is the generated skill bundle rather than a single chat system prompt.
+
+## Suggested Repository Direction
+
+This plan does not require the UI studio to exist first.
+
+A reasonable repo structure would be:
+
+```text
+ai_evals/
+  cli/
+  cases/
+  fixtures/
+  history/
+    runs/
+    rollups/
+  variants/
+    frontend/
+      script/
+      flow/
+      app/
+    cli/
+  results/        # gitignored
+  scripts/
+  adapters/
+  scoring/
+  reports/
+```
+
+The exact folder names can change, but the architectural split should remain.
+
+## What "Done" Looks Like
+
+This project is successful when all of the following are true:
+
+- one repo-level benchmark CLI is the primary way to run prompt evals
+- frontend prompt behavior is tested headlessly and independently from the UI
+- CLI local-dev behavior is tested by evaluating the final files it produces
+- benchmark cases are shared where possible between frontend and CLI
+- prompt and skill candidates can be tested without editing test code
+- reliability is reported as pass rate over repeated runs
+- baseline vs candidate comparisons are easy to run and inspect
+- the UI studio is only a thin interface over the same trusted runner
+
+## Final Recommendation
+
+The current frontend evals should be treated as a useful starting point, not the finished solution.
+
+They already prove that the repo can test AI behavior without coupling to the browser UI.
+
+The main work now is:
+
+- build the repo-level benchmark CLI as the durable entrypoint
+- replace CLI invocation checks with artifact evaluation
+- make the CLI path the reference benchmark implementation
+- unify frontend under that same benchmark model
+- make frontend evals complete and reliability-oriented only after the shared
+  scoring model is stable
+- build the UI only after the suite is strong enough to stand on its own
diff --git a/docs/system-prompt-testing-status.md b/docs/system-prompt-testing-status.md
new file mode 100644
index 0000000000..ef9be36161
--- /dev/null
+++ b/docs/system-prompt-testing-status.md
@@ -0,0 +1,129 @@
+# System Prompt Testing Status
+
+This document describes the benchmark tool that exists today. It is the current
+truth for `ai_evals/`.
+
+The longer planning document in
+[system-prompt-testing-plan.md](./system-prompt-testing-plan.md)
+still contains useful background, but parts of its workflow are now historical
+because the old variants/history system was removed.
+
+## Current Tool
+
+There is one repo-level benchmark CLI under `ai_evals/` with three commands:
+
+- `bun run cli -- models`
+- `bun run cli -- cases [mode]`
+- `bun run cli -- run <mode> [caseIds...]`
+
+Supported modes:
+
+- `cli`
+- `flow`
+- `script`
+- `app`
+
+Public `run` options:
+
+- `--runs <n>`
+- `--output <path>`
+- `--model <alias>`
+- `--verbose`
+- `--record`
+
+There is no variant workflow and no compare command in the current tool.
+Tracked history is intentionally minimal: `run --record` appends one compact
+summary line to `ai_evals/history/<mode>.jsonl`. Recording is only allowed for
+full-suite runs, not runs limited to selected case ids. History lines include average token
+usage when the benchmark mode reports it, plus average judge score and per-case
+duration/judge/token usage summaries.
+
+## How It Works
+
+Each attempt runs:
+
+1. the current production prompts, tools, and guidance from this checkout
+2. deterministic validation
+3. LLM judging
+
+Results are written locally under `ai_evals/results/` as:
+
+- a summary JSON file
+- a sibling artifacts directory containing the generated flow/script/app/workspace
+
+If `--record` is used, the CLI also appends a compact JSONL summary line to the
+tracked file for that mode under `ai_evals/history/`.
+
+## Current Architecture
+
+- `ai_evals/cases/`: one YAML manifest per mode
+- `ai_evals/fixtures/`: initial and expected fixtures
+- `ai_evals/core/`: shared case loading, model resolution, validation, judging, and result writing
+- `ai_evals/history/`: optional tracked pass-rate history written by `run --record`, one JSONL file per mode
+- `ai_evals/modes/`: one runner per mode
+
+Execution model:
+
+- `flow`, `script`, and `app` reuse the production frontend chat loop and production tool definitions through the frontend Vitest bridge
+- `cli` creates a temp workspace, writes the current checkout guidance into it, and runs the Anthropic agent SDK against that workspace
+
+## Case Model
+
+Each case is intentionally small:
+
+- `prompt`
+- optional `initial`
+- optional `expected`
+- optional `validate`
+
+`validate` is mainly used for stronger deterministic checks where exact fixture
+matching would be too strict, especially for `flow` creation cases.
+
+Examples of current deterministic checks:
+
+- schema contains one of several accepted input shapes
+- `results.*` references resolve
+- required code/input characteristics exist in some module
+- expected workspace files are created in `cli` mode
+
+## Model Selection
+
+Model aliases are resolved through a shared registry in `ai_evals/core/models.ts`.
+
+Current aliases:
+
+- `haiku`
+- `sonnet`
+- `opus`
+- `4o`
+
+Notes:
+
+- the `models` command also shows accepted alias spellings such as `gpt-4o` and `claude-opus-4.6`
+- frontend modes can use Anthropic and OpenAI-backed aliases
+- `cli` mode is Anthropic-only because it runs through the Anthropic agent SDK
+- the judge model is separate and currently defaults to `claude-sonnet-4-6`
+
+## What Is Working Well
+
+- one simple local benchmark CLI
+- real production execution paths instead of synthetic prompt variants
+- local result and artifact persistence by default
+- live frontend progress output
+- reusable flow/script/app/cli runners under one tool
+- deterministic validation can now catch real runtime-invalid flow wiring
+
+## What Still Needs Work
+
+- broader case coverage across all four modes
+- stronger deterministic validators for more cases, especially app/script semantics
+- clearer per-case validation metadata as the corpus grows
+- CI automation for smoke and nightly runs
+
+## Recommended Next Focus
+
+The next high-value work is:
+
+1. add more realistic benchmark cases
+2. keep simplifying deterministic validators so they check correctness, not one exact implementation
+3. add CI only after the local benchmark signal is trustworthy
diff --git a/frontend/src/lib/components/copilot/chat/AIChatManager.svelte.ts b/frontend/src/lib/components/copilot/chat/AIChatManager.svelte.ts
index bb2ad5686b..faf5da8c2b 100644
--- a/frontend/src/lib/components/copilot/chat/AIChatManager.svelte.ts
+++ b/frontend/src/lib/components/copilot/chat/AIChatManager.svelte.ts
@@ -450,7 +450,9 @@ class AIChatManager {
 					} else if (this.mode === AIMode.FLOW) {
 						return prepareFlowUserMessage(
 							pendingPrompt,
-							this.flowAiChatHelpers!.getFlowAndSelectedId()
+							this.flowAiChatHelpers!.getFlowAndSelectedId(),
+							[],
+							this.flowAiChatHelpers!.inlineScriptSession
 						)
 					} else if (this.mode === AIMode.NAVIGATOR) {
 						return prepareNavigatorUserMessage(pendingPrompt)
@@ -648,7 +650,8 @@ class AIChatManager {
 					userMessage = prepareFlowUserMessage(
 						oldInstructions,
 						this.flowAiChatHelpers!.getFlowAndSelectedId(),
-						oldSelectedContext
+						oldSelectedContext,
+						this.flowAiChatHelpers!.inlineScriptSession
 					)
 					break
 				case AIMode.NAVIGATOR:
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appChat.eval.test.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/appChat.eval.test.ts
deleted file mode 100644
index a42ee1f099..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/app/appChat.eval.test.ts
+++ /dev/null
@@ -1,303 +0,0 @@
-import { describe, expect, it } from 'vitest'
-import { runVariantComparison, writeAppComparisonResults } from './appEvalRunner'
-import { BASELINE_VARIANT, STREAMLINED_VARIANT } from './variants'
-import { loadAppFixtureForEval } from './appFixtureLoader'
-// @ts-ignore - Node.js path
-import { dirname, join } from 'path'
-// @ts-ignore - Node.js url
-import { fileURLToPath } from 'url'
-import type { AIProvider } from '$lib/gen/types.gen'
-
-// Get API keys from environment - tests will be skipped if none are set
-// @ts-ignore
-const OPENAI_API_KEY = process.env.OPENAI_API_KEY
-// @ts-ignore
-const ANTHROPIC_API_KEY = process.env.ANTHROPIC_API_KEY
-
-const hasAnyKey = OPENAI_API_KEY || ANTHROPIC_API_KEY
-const describeWithApiKey = hasAnyKey ? describe : describe.skip
-
-// Get __dirname equivalent for ES modules
-const __filename = fileURLToPath(import.meta.url)
-const __dirname = dirname(__filename)
-
-// Build model variants based on available keys
-interface ModelVariant {
-	model: string
-	provider: AIProvider
-	apiKey: string
-}
-
-const MODEL_VARIANTS: ModelVariant[] = [
-	...(OPENAI_API_KEY
-		? [{ model: 'gpt-4o', provider: 'openai' as AIProvider, apiKey: OPENAI_API_KEY }]
-		: []),
-	...(ANTHROPIC_API_KEY
-		? [
-				{
-					model: 'claude-haiku-4-5-20241022',
-					provider: 'anthropic' as AIProvider,
-					apiKey: ANTHROPIC_API_KEY
-				}
-			]
-		: [])
-]
-
-const VARIANTS = [
-	...MODEL_VARIANTS.map((mv) => ({
-		...BASELINE_VARIANT,
-		model: mv.model,
-		name: `baseline-${mv.provider}-${mv.model}`,
-		_provider: mv.provider,
-		_apiKey: mv.apiKey
-	})),
-	...MODEL_VARIANTS.map((mv) => ({
-		...STREAMLINED_VARIANT,
-		model: mv.model,
-		name: `streamlined-${mv.provider}-${mv.model}`,
-		_provider: mv.provider,
-		_apiKey: mv.apiKey
-	}))
-]
-
-describeWithApiKey('App Chat LLM Evaluation', () => {
-	const TEST_TIMEOUT = 120_000
-	if (!hasAnyKey) {
-		console.warn('No API keys set (OPENAI_API_KEY or ANTHROPIC_API_KEY), skipping tests')
-	}
-
-	it(
-		'test1: creates a simple counter app',
-		async () => {
-			const USER_PROMPT = `Create a counter app with increment/decrement buttons`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				undefined,
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-			const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`App files: ${appPaths.join(', ')}`)
-
-			expect(true).toBe(true)
-		},
-		TEST_TIMEOUT
-	)
-
-	it(
-		'test2: modifies existing counter app to add reset button',
-		async () => {
-			const { initialFrontend, initialBackend } = await loadAppFixtureForEval(
-				join(__dirname, 'initial', 'test1_counter_app')
-			)
-
-			const USER_PROMPT = `Add a reset button that sets the counter back to 0`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					initialFrontend,
-					initialBackend
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-			const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`App files: ${appPaths.join(', ')}`)
-
-			expect(true).toBe(true)
-		},
-		TEST_TIMEOUT
-	)
-
-	// ==================== Shopping Cart Tests ====================
-
-	it(
-		'test3: shopping cart - add quantity selector',
-		async () => {
-			const { initialFrontend, initialBackend } = await loadAppFixtureForEval(
-				join(__dirname, 'initial', 'shopping_cart')
-			)
-
-			const USER_PROMPT = `Add a quantity selector (+ and - buttons) to each cart item so users can adjust quantities without removing and re-adding items`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					initialFrontend,
-					initialBackend
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`App files: ${appPaths.join(', ')}`)
-
-			expect(true).toBe(true)
-		},
-		TEST_TIMEOUT
-	)
-
-	it(
-		'test4: shopping cart - add discount code',
-		async () => {
-			const { initialFrontend, initialBackend } = await loadAppFixtureForEval(
-				join(__dirname, 'initial', 'shopping_cart')
-			)
-
-			const USER_PROMPT = `Add a discount code input field in the cart. When the code "SAVE10" is entered, apply a 10% discount to the total`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					initialFrontend,
-					initialBackend
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`App files: ${appPaths.join(', ')}`)
-
-			expect(true).toBe(true)
-		},
-		TEST_TIMEOUT
-	)
-
-	// ==================== File Manager Tests ====================
-
-	it(
-		'test5: file manager - add search bar',
-		async () => {
-			const { initialFrontend, initialBackend } = await loadAppFixtureForEval(
-				join(__dirname, 'initial', 'file_manager')
-			)
-
-			const USER_PROMPT = `Add a search bar in the toolbar that filters files and folders by name as the user types`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					initialFrontend,
-					initialBackend
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`App files: ${appPaths.join(', ')}`)
-
-			expect(true).toBe(true)
-		},
-		TEST_TIMEOUT
-	)
-
-	it(
-		'test6: file manager - show file details',
-		async () => {
-			const { initialFrontend, initialBackend } = await loadAppFixtureForEval(
-				join(__dirname, 'initial', 'file_manager')
-			)
-
-			const USER_PROMPT = `Show file size (formatted as KB/MB) and modified date in the file list for each item`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					initialFrontend,
-					initialBackend
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`App files: ${appPaths.join(', ')}`)
-
-			expect(true).toBe(true)
-		},
-		TEST_TIMEOUT
-	)
-
-	it(
-		'test7: file manager - add select all checkbox',
-		async () => {
-			const { initialFrontend, initialBackend } = await loadAppFixtureForEval(
-				join(__dirname, 'initial', 'file_manager')
-			)
-
-			const USER_PROMPT = `Add a "Select All" checkbox in the file list header and individual checkboxes for each file. Add a "Delete Selected" button that appears when items are selected`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					initialFrontend,
-					initialBackend
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`App files: ${appPaths.join(', ')}`)
-
-			expect(true).toBe(true)
-		},
-		TEST_TIMEOUT
-	)
-
-	// ==================== From-Scratch Creation Tests ====================
-
-	it(
-		'test8: create quiz app from scratch',
-		async () => {
-			const USER_PROMPT = `Create a multiple choice quiz app with 5 questions about general knowledge. Show one question at a time with 4 answer options. Track the score and show results at the end with percentage correct.`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				undefined,
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`App files: ${appPaths.join(', ')}`)
-
-			expect(true).toBe(true)
-		},
-		TEST_TIMEOUT
-	)
-
-	it(
-		'test9: create recipe book from scratch',
-		async () => {
-			const USER_PROMPT = `Create a recipe book app where users can add recipes with a name, ingredients list, and instructions. Include a search bar to filter recipes by name and the ability to delete recipes.`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				undefined,
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, appPaths } = await writeAppComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`App files: ${appPaths.join(', ')}`)
-
-			expect(true).toBe(true)
-		},
-		TEST_TIMEOUT
-	)
-})
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalComparison.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalComparison.ts
deleted file mode 100644
index e6c795d445..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalComparison.ts
+++ /dev/null
@@ -1,171 +0,0 @@
-import Anthropic from '@anthropic-ai/sdk'
-import type { AppFiles, BackendRunnable } from '../../app/core'
-import { BASE_EVALUATOR_RESPONSE_FORMAT } from '../shared'
-import type { EvaluationResult } from '../shared'
-
-/**
- * Expected app structure for evaluation.
- */
-export interface ExpectedApp {
-	frontend: Record<string, string>
-	backend: Record<string, BackendRunnable>
-}
-
-/**
- * Initial app state for evaluation context.
- */
-export interface InitialApp {
-	frontend: Record<string, string>
-	backend: Record<string, BackendRunnable>
-}
-
-/**
- * System prompt for evaluating app generation without a reference expected app.
- * Evaluates based on user request fulfillment and appropriate modifications to initial state.
- */
-const APP_GENERATION_EVALUATOR_SYSTEM_PROMPT = `You are an expert evaluator for Windmill Raw App definitions. Your task is to evaluate a generated app based on:
-1. The original user request/prompt
-2. The initial app state (if any) - this is what the app looked like before the AI made changes
-
-## Windmill Raw App Context
-- Raw Apps consist of frontend files and backend runnables
-- Frontend files are TypeScript/JavaScript files bundled with esbuild (entrypoint: index.tsx)
-- Backend runnables can be: inline scripts (TypeScript/Python), workspace scripts, workspace flows, or hub scripts
-- Frontend calls backend using \`await backend.<runnable_key>(args...)\`
-- Each backend runnable has a key (identifier), name (description), type, and configuration
-
-## Backend Runnable Types
-- **inline**: Custom code with \`inlineScript.language\` and \`inlineScript.content\`
-- **script**: Workspace script reference with \`path\`
-- **flow**: Workspace flow reference with \`path\`
-- **hubscript**: Hub script reference with \`path\`
-
-## Evaluation Criteria
-1. **User Request Fulfillment**: Does the generated app address ALL requirements from the user's original prompt?
-   - Are all requested features implemented?
-   - Does the frontend UI match the requirements?
-   - Are the correct backend runnables created?
-2. **Appropriate Modifications** (if initial app was provided):
-   - Were the changes made relevant to the user's request?
-   - Was existing functionality preserved where appropriate?
-   - Were only necessary changes made (no unnecessary removals or additions)?
-3. **Frontend Structure**: Are the frontend files correctly organized and implemented?
-   - Is the code valid TypeScript/JavaScript?
-   - Are components properly structured?
-   - Are backend calls correctly made?
-4. **Backend Structure**: Are the backend runnables correctly configured?
-   - Do inline scripts have proper main functions?
-   - Are types and paths correct for non-inline runnables?
-5. **Integration**: Does the frontend correctly call the backend?
-   - Are the runnable keys correctly referenced?
-   - Are arguments passed correctly?
-6. **Code Quality**: Is the code functionally correct and well-structured?
-
-## Important Notes
-- Focus on whether the user's request was fulfilled, not on stylistic preferences
-- If an initial app was provided, evaluate the appropriateness of the changes made
-- For new apps (no initial state), evaluate completeness and correctness
-- Extra helper functions or slightly different approaches can still score high if they accomplish the goal
-
-${BASE_EVALUATOR_RESPONSE_FORMAT}`
-
-/**
- * Evaluates how well a generated app fulfills the user's request, considering any initial app state.
- * Uses Anthropic API directly.
- */
-export async function evaluateAppGeneration(
-	userPrompt: string,
-	generatedApp: AppFiles,
-	initialApp?: InitialApp
-): Promise<EvaluationResult> {
-	// @ts-ignore
-	const apiKey = process.env.ANTHROPIC_API_KEY
-	if (!apiKey) {
-		return {
-			success: false,
-			resemblanceScore: 0,
-			statement: 'No API key available for evaluation',
-			error: 'ANTHROPIC_API_KEY not set'
-		}
-	}
-
-	const client = new Anthropic({ apiKey })
-
-	let userMessage = `## User's Original Request
-${userPrompt}
-
-`
-
-	if (initialApp) {
-		userMessage += `## Initial App State (before AI modifications)
-\`\`\`json
-${JSON.stringify(initialApp, null, 2)}
-\`\`\`
-
-`
-	} else {
-		userMessage += `## Initial App State
-No initial app was provided - this is a new app created from scratch.
-
-`
-	}
-
-	userMessage += `## Generated App
-\`\`\`json
-${JSON.stringify(generatedApp, null, 2)}
-\`\`\`
-
-Please evaluate how well the generated app:
-1. Fulfills ALL requirements from the user's original request
-2. ${initialApp ? 'Makes appropriate modifications to the initial app state' : 'Implements a complete and correct new app'}`
-
-	try {
-		const response = await client.messages.create({
-			model: 'claude-sonnet-4-5-20250514',
-			max_tokens: 2048,
-			system: APP_GENERATION_EVALUATOR_SYSTEM_PROMPT,
-			messages: [
-				{ role: 'user', content: userMessage }
-			],
-			temperature: 0
-		})
-
-		const textBlock = response.content.find((block) => block.type === 'text')
-		const content = textBlock?.text
-		if (!content) {
-			return {
-				success: false,
-				resemblanceScore: 0,
-				statement: 'No response from evaluator',
-				error: 'Empty response from LLM'
-			}
-		}
-
-		// Parse JSON response - handle potential markdown code blocks
-		let jsonContent = content.trim()
-		if (jsonContent.startsWith('```')) {
-			jsonContent = jsonContent.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '')
-		}
-
-		const parsed = JSON.parse(jsonContent) as {
-			resemblanceScore: number
-			statement: string
-			missingRequirements?: string[]
-		}
-
-		return {
-			success: true,
-			resemblanceScore: Math.max(0, Math.min(100, Math.round(parsed.resemblanceScore))),
-			statement: parsed.statement,
-			missingRequirements: parsed.missingRequirements ?? []
-		}
-	} catch (err) {
-		const errorMessage = err instanceof Error ? err.message : String(err)
-		return {
-			success: false,
-			resemblanceScore: 0,
-			statement: 'Evaluation failed',
-			error: errorMessage
-		}
-	}
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalHelpers.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalHelpers.ts
deleted file mode 100644
index 4dbc8d58a0..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalHelpers.ts
+++ /dev/null
@@ -1,147 +0,0 @@
-import type {
-	AppAIChatHelpers,
-	AppFiles,
-	BackendRunnable,
-	LintResult,
-	SelectedContext
-} from '../../app/core'
-
-/**
- * Creates an empty lint result (no errors or warnings).
- */
-function createEmptyLintResult(): LintResult {
-	return {
-		errorCount: 0,
-		warningCount: 0,
-		errors: { frontend: {}, backend: {} },
-		warnings: { frontend: {}, backend: {} }
-	}
-}
-
-/**
- * Creates mock AppAIChatHelpers for eval testing.
- * Tracks app files state in memory and allows tool functions to modify it.
- */
-export function createAppEvalHelpers(
-	initialFrontend: Record<string, string> = {},
-	initialBackend: Record<string, BackendRunnable> = {}
-) {
-	// In-memory state
-	let frontend: Record<string, string> = { ...initialFrontend }
-	let backend: Record<string, BackendRunnable> = { ...initialBackend }
-	let snapshotId = 0
-	const snapshots: Map<number, { frontend: Record<string, string>; backend: Record<string, BackendRunnable> }> = new Map()
-
-	const helpers: AppAIChatHelpers = {
-		// Frontend file operations
-		listFrontendFiles: () => Object.keys(frontend),
-
-		getFrontendFile: (path: string) => frontend[path],
-
-		getFrontendFiles: () => ({ ...frontend }),
-
-		setFrontendFile: (path: string, content: string) => {
-			frontend[path] = content
-			// Return mock lint result - in real usage this would validate the file
-			return createEmptyLintResult()
-		},
-
-		deleteFrontendFile: (path: string) => {
-			delete frontend[path]
-		},
-
-		// Backend runnable operations
-		listBackendRunnables: () => {
-			return Object.entries(backend).map(([key, runnable]) => ({
-				key,
-				name: runnable.name
-			}))
-		},
-
-		getBackendRunnable: (key: string) => backend[key],
-
-		getBackendRunnables: () => ({ ...backend }),
-
-		setBackendRunnable: async (key: string, runnable: BackendRunnable) => {
-			backend[key] = runnable
-			// Return mock lint result - in real usage this would validate the runnable
-			return createEmptyLintResult()
-		},
-
-		deleteBackendRunnable: (key: string) => {
-			delete backend[key]
-		},
-
-		// Combined view
-		getFiles: (): AppFiles => ({
-			frontend: { ...frontend },
-			backend: { ...backend }
-		}),
-
-		getSelectedContext: (): SelectedContext => ({
-			type: 'none'
-		}),
-
-		// Snapshot management
-		snapshot: () => {
-			const id = ++snapshotId
-			snapshots.set(id, {
-				frontend: { ...frontend },
-				backend: { ...backend }
-			})
-			return id
-		},
-
-		revertToSnapshot: (id: number) => {
-			const snap = snapshots.get(id)
-			if (snap) {
-				frontend = { ...snap.frontend }
-				backend = { ...snap.backend }
-			}
-		},
-
-		// Linting
-		lint: () => {
-			// Return mock lint result - no actual linting in eval
-			return createEmptyLintResult()
-		},
-
-		// Data table operations (mock implementation for testing)
-		getDatatables: async () => {
-			// Return empty array for eval testing - no real datatables in test context
-			return []
-		},
-
-		getAvailableDatatableNames: () => {
-			// Return empty array for eval testing - no real datatables in test context
-			return []
-		},
-
-		execDatatableSql: async (
-			_datatableName: string,
-			_sql: string,
-			_newTable?: { schema: string; name: string }
-		) => {
-			// Return success with empty result for eval testing
-			return { success: true, result: [] }
-		},
-
-		addTableToWhitelist: (
-			_datatableName: string,
-			_schemaName: string,
-			_tableName: string
-		) => {
-			// No-op for eval testing - tables are not tracked in test context
-		}
-	}
-
-	return {
-		helpers,
-		getFiles: (): AppFiles => ({
-			frontend: { ...frontend },
-			backend: { ...backend }
-		}),
-		getFrontend: () => ({ ...frontend }),
-		getBackend: () => ({ ...backend })
-	}
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalRunner.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalRunner.ts
deleted file mode 100644
index 2e6a491bce..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/app/appEvalRunner.ts
+++ /dev/null
@@ -1,177 +0,0 @@
-import type { AppFiles, BackendRunnable, AppAIChatHelpers } from '../../app/core'
-import { getAppTools, prepareAppSystemMessage, prepareAppUserMessage } from '../../app/core'
-import { createAppEvalHelpers } from './appEvalHelpers'
-import { evaluateAppGeneration, type InitialApp } from './appEvalComparison'
-import {
-	runEval,
-	resolveSystemPrompt,
-	resolveTools,
-	resolveModel,
-	type VariantConfig,
-	type BaseEvalResult,
-	type EvaluationResult,
-	type Tool,
-	type VariantDefaults
-} from '../shared'
-import { writeAppComparisonResultsToFolders } from './appResultsWriter'
-import type { AIProvider } from '$lib/gen/types.gen'
-
-// Re-export for convenience
-export type { InitialApp } from './appEvalComparison'
-
-/**
- * App-specific evaluation result.
- */
-export interface AppEvalResult extends BaseEvalResult<AppFiles> {
-	/** Alias for output to maintain API compatibility */
-	files: AppFiles
-}
-
-/**
- * Options for running an app evaluation.
- */
-export interface AppEvalOptions {
-	initialFrontend?: Record<string, string>
-	initialBackend?: Record<string, BackendRunnable>
-	model?: string
-	customSystemPrompt?: string
-	maxIterations?: number
-	variant?: VariantConfig
-	/** Whether to evaluate the generated app with LLM. Default: true. Set to false to skip evaluation. */
-	evaluateWithLLM?: boolean
-	/** AI provider (inferred from model name if omitted) */
-	provider?: AIProvider
-}
-
-/**
- * App-specific variant defaults.
- */
-const appDefaults: VariantDefaults<AppAIChatHelpers> = {
-	prepareSystemMessage: prepareAppSystemMessage,
-	tools: getAppTools() as Tool<AppAIChatHelpers>[]
-}
-
-/**
- * Runs an app chat evaluation using the shared chat loop (same code path as production).
- */
-export async function runAppEval(
-	userPrompt: string,
-	apiKey: string,
-	options?: AppEvalOptions
-): Promise<AppEvalResult> {
-	const { helpers, getFiles } = createAppEvalHelpers(
-		options?.initialFrontend ?? {},
-		options?.initialBackend ?? {}
-	)
-
-	// Resolve variant configuration
-	const variantName = options?.variant?.name ?? 'baseline'
-	const systemMessage = resolveSystemPrompt(
-		options?.variant,
-		appDefaults,
-		options?.customSystemPrompt
-	)
-	const { tools } = resolveTools(options?.variant, appDefaults)
-	const model = resolveModel(options?.variant, options?.model)
-
-	// Build user message
-	const userMessage = prepareAppUserMessage(userPrompt, helpers.getSelectedContext())
-
-	// Run the base evaluation
-	const rawResult = await runEval({
-		userPrompt,
-		systemMessage,
-		userMessage,
-		tools,
-		helpers,
-		apiKey,
-		getOutput: getFiles,
-		options: {
-			maxIterations: options?.maxIterations,
-			model,
-			workspace: 'test-workspace',
-			provider: options?.provider
-		}
-	})
-
-	// Run LLM evaluation unless explicitly disabled
-	let evaluationResult: EvaluationResult | undefined
-	if (options?.evaluateWithLLM !== false) {
-		const generatedApp = getFiles()
-		const initialApp: InitialApp | undefined =
-			options?.initialFrontend || options?.initialBackend
-				? {
-						frontend: options.initialFrontend ?? {},
-						backend: options.initialBackend ?? {}
-					}
-				: undefined
-		evaluationResult = await evaluateAppGeneration(userPrompt, generatedApp, initialApp)
-	}
-
-	return {
-		...rawResult,
-		variantName,
-		files: rawResult.output,
-		evaluationResult
-	}
-}
-
-/**
- * Per-variant provider override.
- */
-export interface VariantProviderOverride {
-	provider: AIProvider
-	apiKey: string
-}
-
-/**
- * Runs the same prompt against multiple variants sequentially for comparison.
- * Accepts optional per-variant provider/apiKey overrides.
- */
-export async function runVariantComparison(
-	userPrompt: string,
-	variants: VariantConfig[],
-	defaultApiKey: string,
-	baseOptions?: Omit<AppEvalOptions, 'variant'>,
-	providerOverrides?: VariantProviderOverride[]
-): Promise<AppEvalResult[]> {
-	const results: AppEvalResult[] = await Promise.all(
-		variants.map(async (variant, i) => {
-			const override = providerOverrides?.[i]
-			return await runAppEval(userPrompt, override?.apiKey ?? defaultApiKey, {
-				...baseOptions,
-				variant,
-				provider: override?.provider ?? baseOptions?.provider
-			})
-		})
-	)
-	return results
-}
-
-/**
- * Writes app comparison results to a folder-based structure.
- * Each variant gets its own folder with frontend/, backend/, and details.json.
- */
-export async function writeAppComparisonResults(
-	userPrompt: string,
-	results: AppEvalResult[],
-	outputDir?: string
-): Promise<{ summaryPath: string; appPaths: string[] }> {
-	// @ts-ignore
-	const { dirname, join } = await import('path')
-	// @ts-ignore
-	const { fileURLToPath } = await import('url')
-
-	const __filename = fileURLToPath(import.meta.url)
-	const __dirname = dirname(__filename)
-
-	const resultsDir = outputDir ?? join(__dirname, 'results')
-
-	const result = await writeAppComparisonResultsToFolders({
-		userPrompt,
-		results,
-		outputDir: resultsDir
-	})
-
-	return { summaryPath: result.summaryPath, appPaths: result.variantPaths }
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/appResultsWriter.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/appResultsWriter.ts
deleted file mode 100644
index a9b3475e96..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/app/appResultsWriter.ts
+++ /dev/null
@@ -1,247 +0,0 @@
-import type { AppFiles, BackendRunnable } from '../../app/core'
-import type { AppEvalResult } from './appEvalRunner'
-import { generateTimestamp } from '../shared'
-
-/**
- * Writes frontend files to a folder, preserving directory structure.
- * File paths like "/components/Button.tsx" become "frontend/components/Button.tsx"
- */
-async function writeFrontendFiles(
-	frontend: Record<string, string>,
-	frontendPath: string
-): Promise<void> {
-	// @ts-ignore - Node.js fs/promises
-	const { writeFile, mkdir } = await import('fs/promises')
-	// @ts-ignore - Node.js path
-	const { join, dirname } = await import('path')
-
-	for (const [filePath, content] of Object.entries(frontend)) {
-		// Remove leading slash and join with frontend path
-		const relativePath = filePath.startsWith('/') ? filePath.slice(1) : filePath
-		const fullPath = join(frontendPath, relativePath)
-
-		// Ensure parent directory exists
-		await mkdir(dirname(fullPath), { recursive: true })
-
-		await writeFile(fullPath, content)
-	}
-}
-
-/**
- * Writes backend runnables to a folder structure.
- * Each runnable becomes a folder with main.ts/main.py and meta.json
- */
-async function writeBackendRunnables(
-	backend: Record<string, BackendRunnable>,
-	backendPath: string
-): Promise<void> {
-	// @ts-ignore - Node.js fs/promises
-	const { writeFile, mkdir } = await import('fs/promises')
-	// @ts-ignore - Node.js path
-	const { join } = await import('path')
-
-	for (const [key, runnable] of Object.entries(backend)) {
-		const runnablePath = join(backendPath, key)
-		await mkdir(runnablePath, { recursive: true })
-
-		// Write meta.json
-		const meta: { name: string; language?: string; type?: string; path?: string } = {
-			name: runnable.name
-		}
-
-		if (runnable.type === 'inline' && runnable.inlineScript) {
-			meta.language = runnable.inlineScript.language
-
-			// Write main file
-			const extension = runnable.inlineScript.language === 'python3' ? 'py' : 'ts'
-			const mainPath = join(runnablePath, `main.${extension}`)
-			await writeFile(mainPath, runnable.inlineScript.content)
-		} else {
-			// For non-inline runnables, store type and path in meta
-			meta.type = runnable.type
-			if (runnable.path) {
-				meta.path = runnable.path
-			}
-		}
-
-		const metaPath = join(runnablePath, 'meta.json')
-		await writeFile(metaPath, JSON.stringify(meta, null, '\t'))
-	}
-}
-
-/**
- * Writes app files (frontend + backend) to a folder structure.
- */
-async function writeAppToFolder(appFiles: AppFiles, folderPath: string): Promise<void> {
-	// @ts-ignore - Node.js path
-	const { join } = await import('path')
-
-	if (Object.keys(appFiles.frontend).length > 0) {
-		await writeFrontendFiles(appFiles.frontend, join(folderPath, 'frontend'))
-	}
-
-	if (Object.keys(appFiles.backend).length > 0) {
-		await writeBackendRunnables(appFiles.backend, join(folderPath, 'backend'))
-	}
-}
-
-/**
- * Parameters for writing app comparison results.
- */
-export interface WriteAppResultsParams {
-	userPrompt: string
-	results: AppEvalResult[]
-	outputDir: string
-}
-
-/**
- * Writes app comparison results to a folder-based structure.
- *
- * Creates:
- * ```
- * results/{timestamp}/
- * ├── summary.md
- * └── {variant_name}/
- *     ├── details.json    # Metadata (toolsCalled, evaluationResult, etc.)
- *     ├── frontend/       # Frontend files
- *     │   └── index.tsx
- *     └── backend/        # Backend runnables
- *         └── myFunction/
- *             ├── main.ts
- *             └── meta.json
- * ```
- */
-export async function writeAppComparisonResultsToFolders(
-	params: WriteAppResultsParams
-): Promise<{ summaryPath: string; variantPaths: string[] }> {
-	// @ts-ignore - Node.js fs/promises
-	const { writeFile, mkdir } = await import('fs/promises')
-	// @ts-ignore - Node.js path
-	const { join } = await import('path')
-
-	const { userPrompt, results, outputDir } = params
-	const timestamp = generateTimestamp()
-
-	// Ensure results directory exists
-	await mkdir(outputDir, { recursive: true })
-	const resultFolder = join(outputDir, timestamp)
-	await mkdir(resultFolder, { recursive: true })
-
-	// Check if any results have evaluation data
-	const hasEvaluation = results.some((r) => r.evaluationResult)
-
-	// Build summary markdown
-	const summaryLines: string[] = [
-		`# App Eval Results - ${timestamp}`,
-		'',
-		'## User Prompt',
-		'```',
-		userPrompt.trim(),
-		'```',
-		'',
-		'## Results',
-		''
-	]
-
-	// Add results table header based on whether evaluation data exists
-	if (hasEvaluation) {
-		summaryLines.push(
-			'| Variant | Success | Total Tokens | Tool Calls | Iterations | Resemblance Score |'
-		)
-		summaryLines.push(
-			'|---------|---------|--------------|------------|------------|-------------------|'
-		)
-	} else {
-		summaryLines.push('| Variant | Success | Total Tokens | Tool Calls | Iterations |')
-		summaryLines.push('|---------|---------|--------------|------------|------------|')
-	}
-
-	for (const result of results) {
-		const baseRow = `| ${result.variantName} | ${result.success} | ${result.tokenUsage.total} | ${result.toolsCalled.length} | ${result.iterations}`
-		if (hasEvaluation) {
-			const score = result.evaluationResult?.resemblanceScore ?? 'N/A'
-			summaryLines.push(`${baseRow} | ${score} |`)
-		} else {
-			summaryLines.push(`${baseRow} |`)
-		}
-	}
-
-	// Add evaluation details section if available
-	if (hasEvaluation) {
-		summaryLines.push('')
-		summaryLines.push('## Evaluation Details')
-		summaryLines.push('')
-		for (const result of results) {
-			if (result.evaluationResult) {
-				summaryLines.push(`### ${result.variantName}`)
-				summaryLines.push('')
-				summaryLines.push(`**Score:** ${result.evaluationResult.resemblanceScore}/100`)
-				summaryLines.push('')
-				summaryLines.push(`**Statement:** ${result.evaluationResult.statement}`)
-				summaryLines.push('')
-				if (
-					result.evaluationResult.missingRequirements &&
-					result.evaluationResult.missingRequirements.length > 0
-				) {
-					summaryLines.push('**Missing Requirements:**')
-					for (const req of result.evaluationResult.missingRequirements) {
-						summaryLines.push(`- ${req}`)
-					}
-					summaryLines.push('')
-				}
-				if (result.evaluationResult.error) {
-					summaryLines.push(`**Error:** ${result.evaluationResult.error}`)
-					summaryLines.push('')
-				}
-			}
-		}
-	}
-
-	// Add errors section for failed variants
-	const failedResults = results.filter((r) => !r.success && r.error)
-	if (failedResults.length > 0) {
-		summaryLines.push('')
-		summaryLines.push('## Errors')
-		summaryLines.push('')
-		for (const result of failedResults) {
-			summaryLines.push(`### ${result.variantName}`)
-			summaryLines.push('')
-			summaryLines.push('```')
-			summaryLines.push(result.error!)
-			summaryLines.push('```')
-			summaryLines.push('')
-		}
-	}
-
-	const variantPaths: string[] = []
-
-	// Write each variant to its own folder
-	for (const result of results) {
-		const variantFolder = join(resultFolder, result.variantName)
-		await mkdir(variantFolder, { recursive: true })
-		variantPaths.push(variantFolder)
-
-		// Write details.json (metadata without app files)
-		const details = {
-			variantName: result.variantName,
-			success: result.success,
-			error: result.error ?? null,
-			evaluationResult: result.evaluationResult ?? null,
-			toolsCalled: result.toolsCalled,
-			toolCallDetails: result.toolCallDetails,
-			tokenUsage: result.tokenUsage,
-			iterations: result.iterations,
-			messages: result.messages
-		}
-		await writeFile(join(variantFolder, 'details.json'), JSON.stringify(details, null, '\t'))
-
-		// Write app files to frontend/ and backend/ folders
-		await writeAppToFolder(result.files, variantFolder)
-	}
-
-	// Write summary markdown file
-	const summaryPath = join(resultFolder, 'summary.md')
-	await writeFile(summaryPath, summaryLines.join('\n'))
-
-	return { summaryPath, variantPaths }
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/baseline.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/variants/baseline.ts
deleted file mode 100644
index 558424e972..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/baseline.ts
+++ /dev/null
@@ -1,12 +0,0 @@
-import type { VariantConfig } from '../../shared'
-
-/**
- * Baseline variant - uses the production system prompt and all tools.
- * This is the default configuration that matches the actual app chat implementation.
- */
-export const BASELINE_VARIANT: VariantConfig = {
-	name: 'baseline',
-	description: 'Production configuration with default system prompt and all tools',
-	systemPrompt: { type: 'default' },
-	tools: { type: 'default' }
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/index.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/variants/index.ts
deleted file mode 100644
index b49c56123d..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/index.ts
+++ /dev/null
@@ -1,6 +0,0 @@
-// Re-export all variant configurations
-export { BASELINE_VARIANT } from './baseline'
-export { STREAMLINED_VARIANT } from './streamlined'
-
-// Re-export types for convenience
-export type { VariantConfig } from '../../shared'
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/streamlined.ts b/frontend/src/lib/components/copilot/chat/__tests__/app/variants/streamlined.ts
deleted file mode 100644
index 515db0a756..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/app/variants/streamlined.ts
+++ /dev/null
@@ -1,144 +0,0 @@
-import type { VariantConfig } from '../../shared'
-import type { Tool } from '../../shared'
-import type { AppAIChatHelpers } from '../../../app/core'
-import { getAppTools } from '../../../app/core'
-
-// Tool names to remove (batch-fetch tools)
-const TOOLS_TO_REMOVE = ['get_files', 'get_frontend_files', 'get_backend_runnables']
-
-/**
- * Build the streamlined tools by filtering out batch-fetch tools.
- */
-function buildStreamlinedTools(): Tool<AppAIChatHelpers>[] {
-	const defaultTools = getAppTools()
-	return defaultTools.filter((t) => !TOOLS_TO_REMOVE.includes(t.def.function.name))
-}
-
-/**
- * Streamlined system prompt - simplified instructions focused on:
- * 1. Reading relevant files first
- * 2. Making changes with appropriate tools
- * 3. Using lint at the end to fix errors
- */
-const STREAMLINED_SYSTEM_PROMPT = `You are a helpful assistant that creates and edits apps on the Windmill platform. Apps are defined as a collection of files that contains both the frontend and the backend.
-
-## App Structure
-
-### Frontend
-- The frontend is bundled using esbuild with entrypoint \`index.tsx\`
-- Frontend files are managed separately from backend runnables
-- The \`wmill.d.ts\` file is generated automatically from the backend runnables shape
-
-### Backend
-Backend runnables can be of different types:
-- **inline**: Custom code written directly in the app (TypeScript/Bun or Python)
-- **script**: Reference to a workspace script by path
-- **flow**: Reference to a workspace flow by path
-- **hubscript**: Reference to a hub script by path
-
-Frontend calls backend using \`await backend.<runnable_key>(args...)\`.
-
-For inline scripts, the code must have a \`main\` function as its entrypoint.
-
-## Available Tools
-
-### File Management
-- \`list_frontend_files()\`: List all frontend file paths (use this first to see what exists)
-- \`get_frontend_file(path)\`: Get content of a specific frontend file
-- \`set_frontend_file(path, content)\`: Create or update a frontend file. Returns lint diagnostics.
-- \`delete_frontend_file(path)\`: Delete a frontend file
-- \`list_backend_runnables()\`: List all backend runnable keys and names (use this first to see what exists)
-- \`get_backend_runnable(key)\`: Get full configuration of a specific backend runnable
-- \`set_backend_runnable(key, name, type, ...)\`: Create or update a backend runnable. Returns lint diagnostics.
-- \`delete_backend_runnable(key)\`: Delete a backend runnable
-
-### Linting
-- \`lint()\`: Lint all files. Returns errors/warnings grouped by frontend/backend.
-
-### Discovery
-- \`search_workspace(query, type)\`: Search workspace scripts and flows
-- \`search_hub_scripts(query)\`: Search hub scripts
-
-## Backend Runnable Configuration
-
-When creating a backend runnable with \`set_backend_runnable\`:
-
-1. **For inline scripts** (type: "inline"):
-   \`\`\`
-   {
-     key: "myFunction",
-     name: "Does something useful",
-     type: "inline",
-     inlineScript: {
-       language: "bun",  // or "python3"
-       content: "export async function main(arg1: string) { return result; }"
-     }
-   }
-   \`\`\`
-
-2. **For workspace scripts** (type: "script"):
-   \`\`\`
-   {
-     key: "sendEmail",
-     name: "Send email via SMTP",
-     type: "script",
-     path: "f/folder/send_email",
-     staticInputs: { smtp_server: "mail.example.com" }  // optional pre-filled inputs
-   }
-   \`\`\`
-
-3. **For workspace flows** (type: "flow"):
-   \`\`\`
-   {
-     key: "processOrder",
-     name: "Process customer order",
-     type: "flow",
-     path: "f/folder/process_order_flow"
-   }
-   \`\`\`
-
-4. **For hub scripts** (type: "hubscript"):
-   \`\`\`
-   {
-     key: "slackMessage",
-     name: "Send Slack message",
-     type: "hubscript",
-     path: "hub/123/slack/send_message"
-   }
-   \`\`\`
-
-## Instructions
-
-1. Start by reading relevant files to understand the current state
-2. Make changes using the appropriate tools
-3. Use \`lint()\` at the end to check for and fix any errors
-
-Windmill expects all backend runnable calls to use an object parameter structure. For example for:
-\`\`\`typescript
-export async function main(arg1: string, arg2: string, arg3: number, arg4: { field1: string, field2: number }) {
-  ...
-}
-\`\`\`
-
-You would call it like this:
-\`\`\`typescript
-await backend.myFunction({ arg1: 'value1', arg2: 'value2', arg3: 3, arg4: { field1: 'value1', field2: 2 } })
-\`\`\`
-If the runnable has no parameters, you can call it without an object:
-\`\`\`typescript
-await backend.myFunction()
-\`\`\`
-
-When you are using the windmill-client, do not forget that as id for variables or resources, those are path that are of the form 'u/<user>/<name>' or 'f/<folder>/<name>'.
-`
-
-/**
- * Streamlined variant - removes batch-fetch tools and uses simplified instructions.
- * Forces the model to read individual files before making changes.
- */
-export const STREAMLINED_VARIANT: VariantConfig = {
-	name: 'streamlined',
-	description: 'No batch tools - forces reading individual files before making changes',
-	systemPrompt: { type: 'custom', content: STREAMLINED_SYSTEM_PROMPT },
-	tools: { type: 'custom', tools: buildStreamlinedTools() }
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowChat.eval.test.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/flowChat.eval.test.ts
deleted file mode 100644
index de9b8e5f43..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowChat.eval.test.ts
+++ /dev/null
@@ -1,449 +0,0 @@
-import { describe, it, expect } from 'vitest'
-import { runVariantComparison, writeFlowComparisonResults, type ExpectedFlow } from './flowEvalRunner'
-import { BASELINE_VARIANT, MINIMAL_SINGLE_TOOL_VARIANT } from './variants'
-// @ts-ignore - JSON import
-import expectedTest1 from './expected/test1.json'
-// @ts-ignore - JSON import
-import expectedTest2 from './expected/test2.json'
-// @ts-ignore - JSON import
-import expectedTest3 from './expected/test3.json'
-// @ts-ignore - JSON import
-import expectedTest4 from './expected/test4.json'
-// @ts-ignore - JSON import
-import expectedTest5 from './expected/test5_modify_simple.json'
-// @ts-ignore - JSON import
-import expectedTest6 from './expected/test6_modify_medium.json'
-// @ts-ignore - JSON import
-import expectedTest7 from './expected/test7_modify_complex.json'
-// @ts-ignore - JSON import
-import initialTest5 from './initial/test5_initial.json'
-// @ts-ignore - JSON import
-import initialTest6 from './initial/test6_initial.json'
-// @ts-ignore - JSON import
-import initialTest7 from './initial/test7_initial.json'
-import type { FlowModule } from '$lib/gen'
-import type { AIProvider } from '$lib/gen/types.gen'
-
-// Get API keys from environment - tests will be skipped if none are set
-// @ts-ignore
-const OPENAI_API_KEY = process.env.OPENAI_API_KEY
-// @ts-ignore
-const ANTHROPIC_API_KEY = process.env.ANTHROPIC_API_KEY
-
-const hasAnyKey = OPENAI_API_KEY || ANTHROPIC_API_KEY
-const describeWithApiKey = hasAnyKey ? describe : describe.skip
-
-// Build model variants based on available keys
-interface ModelVariant {
-	model: string
-	provider: AIProvider
-	apiKey: string
-}
-
-const MODEL_VARIANTS: ModelVariant[] = [
-	...(OPENAI_API_KEY
-		? [{ model: 'gpt-4o', provider: 'openai' as AIProvider, apiKey: OPENAI_API_KEY }]
-		: []),
-	...(ANTHROPIC_API_KEY
-		? [
-				{
-					model: 'claude-haiku-4-5-20241022',
-					provider: 'anthropic' as AIProvider,
-					apiKey: ANTHROPIC_API_KEY
-				}
-			]
-		: [])
-]
-
-const VARIANTS = [
-	...MODEL_VARIANTS.map((mv) => ({
-		...BASELINE_VARIANT,
-		model: mv.model,
-		name: `baseline-${mv.provider}-${mv.model}`,
-		_provider: mv.provider,
-		_apiKey: mv.apiKey
-	})),
-	...MODEL_VARIANTS.map((mv) => ({
-		...MINIMAL_SINGLE_TOOL_VARIANT,
-		model: mv.model,
-		name: `minimal-single-tool-${mv.provider}-${mv.model}`,
-		_provider: mv.provider,
-		_apiKey: mv.apiKey
-	}))
-]
-
-describeWithApiKey('Flow Chat LLM Evaluation', () => {
-	const TEST_TIMEOUT = 120_000
-	if (!hasAnyKey) {
-		console.warn('No API keys set (OPENAI_API_KEY or ANTHROPIC_API_KEY), skipping tests')
-	}
-
-	it(
-		'test1: user role-based actions with loop and branches',
-		async () => {
-			const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-
-STEP 1: Fetch mock users from api
-STEP 2: Filter only active users:
-STEP 3: Loop on all users
-STEP 4: Do branches based on user's role, do different action based on that. Roles are admin, user, moderator
-STEP 5: Return action taken for each user
-`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					expectedFlow: expectedTest1 as ExpectedFlow
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			// Write results to files
-			const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`Flow files: ${flowPaths.join(', ')}`)
-
-			// Assert all variants succeeded
-			for (const result of results) {
-				expect(true).toBe(true)
-
-				// Log evaluation results
-				if (result.evaluationResult) {
-					console.log(
-						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
-					)
-					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
-					if (
-						result.evaluationResult.missingRequirements &&
-						result.evaluationResult.missingRequirements.length > 0
-					) {
-						console.log(
-							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
-						)
-					}
-				}
-			}
-		},
-		TEST_TIMEOUT
-	)
-
-	it(
-		'test2: e-commerce order processing with inventory check and branching',
-		async () => {
-			const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-
-STEP 1: Receive order data from input (order has items array with name/price/quantity, customer_email, shipping_address)
-STEP 2: Validate order - check all items have valid price > 0 and quantity > 0, return validation result
-STEP 3: Calculate order total with 8% tax rate
-STEP 4: Check inventory for each item (loop through items, return mock availability)
-STEP 5: Branch based on inventory - if all items available, create shipment record; otherwise create backorder record
-STEP 6: Send confirmation (mock email to customer_email)
-STEP 7: Return final order summary with status
-`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					expectedFlow: expectedTest2 as ExpectedFlow
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`Flow files: ${flowPaths.join(', ')}`)
-
-			for (const result of results) {
-				expect(true).toBe(true)
-
-				if (result.evaluationResult) {
-					console.log(
-						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
-					)
-					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
-					if (
-						result.evaluationResult.missingRequirements &&
-						result.evaluationResult.missingRequirements.length > 0
-					) {
-						console.log(
-							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
-						)
-					}
-				}
-			}
-		},
-		TEST_TIMEOUT
-	)
-
-	it(
-		'test3: data pipeline with parallel processing and quality-based routing',
-		async () => {
-			const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-
-STEP 1: Fetch list of data sources from configuration (return mock array of 3 source objects with id and url)
-STEP 2: For each data source in parallel:
-  - Fetch raw data from the source (mock fetch returning sample records)
-  - Transform/clean the data (filter out invalid entries)
-  - Validate the transformed data (return validation score 0-100)
-STEP 3: Aggregate all validated data into single dataset with combined records
-STEP 4: Calculate overall data quality score (average of all validation scores)
-STEP 5: Branch based on quality score:
-  - If score >= 90: Store in primary database and return success
-  - If score >= 70 and < 90: Store in secondary database with warning flag
-  - If score < 70: Store in quarantine and send alert
-STEP 6: Return processing report with statistics (total records, quality score, destination)
-`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					expectedFlow: expectedTest3 as ExpectedFlow
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`Flow files: ${flowPaths.join(', ')}`)
-
-			for (const result of results) {
-				expect(true).toBe(true)
-
-				if (result.evaluationResult) {
-					console.log(
-						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
-					)
-					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
-					if (
-						result.evaluationResult.missingRequirements &&
-						result.evaluationResult.missingRequirements.length > 0
-					) {
-						console.log(
-							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
-						)
-					}
-				}
-			}
-		},
-		TEST_TIMEOUT
-	)
-
-	it(
-		'test4: AI agent with tools for customer support',
-		async () => {
-			const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-
-Create a customer support flow with an AI agent:
-
-STEP 1: Receive customer query from input (customer_id string, query_text string)
-STEP 2: Fetch customer profile and order history (mock data based on customer_id)
-STEP 3: Use an AI agent to handle the customer query. The agent should have access to these tools:
-  - lookup_order: Takes order_id, returns order details (mock data)
-  - check_refund_eligibility: Takes order_id, returns eligibility status and reason
-  - create_support_ticket: Takes description and priority (low/medium/high), returns ticket_id
-  - search_faq: Takes search_query, returns relevant FAQ answers
-  The agent should use the customer profile context and respond helpfully.
-STEP 4: Log the interaction to audit trail (customer_id, query, response summary)
-STEP 5: Return the agent's response and any actions taken
-`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					expectedFlow: expectedTest4 as ExpectedFlow
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`Flow files: ${flowPaths.join(', ')}`)
-
-			for (const result of results) {
-				expect(true).toBe(true)
-
-				if (result.evaluationResult) {
-					console.log(
-						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
-					)
-					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
-					if (
-						result.evaluationResult.missingRequirements &&
-						result.evaluationResult.missingRequirements.length > 0
-					) {
-						console.log(
-							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
-						)
-					}
-				}
-			}
-		},
-		TEST_TIMEOUT
-	)
-
-	// ==================== MODIFICATION TESTS ====================
-	// These tests evaluate the LLM's ability to modify existing flows
-
-	it(
-		'test5: simple modification - add validation step to existing flow',
-		async () => {
-			const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-
-Modify this existing flow to add error handling:
-- Add a new step after process_data called "validate_data" to validate the processed data
-- The validation step should check if the data array is not empty
-- If validation fails (empty array), it should return an error object with message "No data to save"
-- If validation passes, return the data for the next step
-- Update save_results to handle the validation result appropriately
-`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					initialModules: initialTest5.value.modules as FlowModule[],
-					initialSchema: initialTest5.schema,
-					expectedFlow: expectedTest5 as ExpectedFlow
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`Flow files: ${flowPaths.join(', ')}`)
-
-			for (const result of results) {
-				expect(true).toBe(true)
-
-				if (result.evaluationResult) {
-					console.log(
-						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
-					)
-					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
-					if (
-						result.evaluationResult.missingRequirements &&
-						result.evaluationResult.missingRequirements.length > 0
-					) {
-						console.log(
-							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
-						)
-					}
-				}
-			}
-		},
-		TEST_TIMEOUT
-	)
-
-	it(
-		'test6: medium modification - add branching inside existing loop',
-		async () => {
-			const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-
-Modify the order processing loop to handle different order types:
-- Inside the loop_orders, replace the simple process_order step with branching based on order.type
-- For type "express": add a step called handle_express that marks as priority and calculates express shipping cost ($15.99)
-- For type "standard": add a step called handle_standard that calculates standard shipping cost ($5.99)
-- For type "pickup": add a step called handle_pickup that marks as no shipping required (cost $0)
-- Move the original process_order step to the default branch for unknown order types
-- Each branch step should return the orderId, shipping cost, and shipping type
-`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					initialModules: initialTest6.value.modules as FlowModule[],
-					initialSchema: initialTest6.schema,
-					expectedFlow: expectedTest6 as ExpectedFlow
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`Flow files: ${flowPaths.join(', ')}`)
-
-			for (const result of results) {
-				expect(true).toBe(true)
-
-				if (result.evaluationResult) {
-					console.log(
-						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
-					)
-					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
-					if (
-						result.evaluationResult.missingRequirements &&
-						result.evaluationResult.missingRequirements.length > 0
-					) {
-						console.log(
-							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
-						)
-					}
-				}
-			}
-		},
-		TEST_TIMEOUT
-	)
-
-	it(
-		'test7: complex modification - refactor sequential to parallel execution',
-		async () => {
-			const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-
-Refactor this flow for better performance by parallelizing the enrichment steps:
-- The three enrichment steps (enrich_price, enrich_inventory, enrich_reviews) currently run sequentially
-- Wrap them in a parallel branch (branchall) called "parallel_enrichment" so they run concurrently
-- Each enrichment step should include basic error handling with try/catch that returns a fallback value if it fails
-- Update the combine_data step to receive results from the parallel branch (results.parallel_enrichment returns an array of branch results)
-- The combine_data step should check if any enrichment used a fallback value and set a hasFallbacks flag
-- Keep get_item as the first step and return_result as the last step unchanged
-`
-			const results = await runVariantComparison(
-				USER_PROMPT,
-				VARIANTS,
-				VARIANTS[0]._apiKey,
-				{
-					initialModules: initialTest7.value.modules as FlowModule[],
-					initialSchema: initialTest7.schema,
-					expectedFlow: expectedTest7 as ExpectedFlow
-				},
-				VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey }))
-			)
-
-			const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results)
-			console.log(`\nResults written to: ${summaryPath}`)
-			console.log(`Flow files: ${flowPaths.join(', ')}`)
-
-			for (const result of results) {
-				expect(true).toBe(true)
-
-				if (result.evaluationResult) {
-					console.log(
-						`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`
-					)
-					console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`)
-					if (
-						result.evaluationResult.missingRequirements &&
-						result.evaluationResult.missingRequirements.length > 0
-					) {
-						console.log(
-							`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`
-						)
-					}
-				}
-			}
-		},
-		TEST_TIMEOUT
-	)
-})
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalComparison.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalComparison.ts
deleted file mode 100644
index 4c2b41d577..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalComparison.ts
+++ /dev/null
@@ -1,68 +0,0 @@
-import type { FlowModule } from '$lib/gen'
-import { evaluateWithLLM, BASE_EVALUATOR_RESPONSE_FORMAT } from '../shared'
-import type { EvaluationResult } from '../shared'
-
-/**
- * Expected flow structure for evaluation.
- */
-export interface ExpectedFlow {
-	summary?: string
-	value: {
-		modules: FlowModule[]
-	}
-	schema?: Record<string, any>
-}
-
-/**
- * Flow-specific evaluator system prompt.
- */
-const FLOW_EVALUATOR_SYSTEM_PROMPT = `You are an expert evaluator for Windmill flow definitions. Your task is to evaluate a generated flow against:
-1. The original user request/prompt
-2. An expected reference flow
-
-## Windmill Flow Context
-- Flows consist of modules (steps) that execute sequentially
-- Module types include: rawscript, forloopflow, branchone, branchall, script, flow, aiagent
-- Each module has an id, value (containing type and config), and may have input_transforms
-- input_transforms connect modules using expressions like "results.previous_step". Valid input_transforms are: static, javascript. Valid variables in javascript expressions are: results, flow_input, flow_input.iter.value (for forloopflow), flow_input.iter.index (for forloopflow).
-- forloopflow contains nested modules that execute per iteration with access to flow_input.iter.value
-- branchone executes first matching branch, branchall executes all matching branches
-- Branches have conditional expressions (expr) that determine execution
-- aiagent modules contain tools array with tool definitions
-
-## Evaluation Criteria
-1. **User Request Fulfillment**: Does the generated flow address ALL requirements from the user's original prompt?
-   - Are all requested steps present?
-   - Are the requested features implemented (loops, branches, specific logic)?
-   - Does the schema match what the user requested for inputs?
-2. **Structure**: Are the module types and nesting structure appropriate for the task?
-3. **Logic**: Does the flow accomplish the intended logical task?
-4. **Connections**: Are input_transforms connecting data correctly between steps?
-5. **Completeness**: Are all required steps present with no major omissions?
-6. **Code Quality**: Is the code functionally correct (exact syntax doesn't need to match)?
-
-## Important Notes
-- Minor differences in variable names, code formatting, or exact wording are acceptable
-- Focus on functional equivalence, not character-by-character matching
-- The generated flow should achieve the same outcome as described in the user request
-- Extra helper steps or slightly different approaches can still score high if they accomplish the goal
-- If the user requested specific module types (like aiagent), verify they are used correctly
-
-${BASE_EVALUATOR_RESPONSE_FORMAT}`
-
-/**
- * Evaluates how well a generated flow matches an expected flow and user request using an LLM.
- * Returns a resemblance score (0-100), a qualitative statement, and any missing requirements.
- */
-export async function evaluateFlowComparison(
-	generatedFlow: ExpectedFlow,
-	expectedFlow: ExpectedFlow,
-	userPrompt: string
-): Promise<EvaluationResult> {
-	return evaluateWithLLM({
-		userPrompt,
-		generatedOutput: generatedFlow,
-		expectedOutput: expectedFlow,
-		evaluatorSystemPrompt: FLOW_EVALUATOR_SYSTEM_PROMPT
-	})
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalHelpers.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalHelpers.ts
deleted file mode 100644
index 39d6ebd5e1..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalHelpers.ts
+++ /dev/null
@@ -1,104 +0,0 @@
-import type { FlowAIChatHelpers } from '../../flow/core'
-import type { FlowModule, InputTransform } from '$lib/gen'
-import type { ExtendedOpenFlow } from '$lib/components/flows/types'
-import { findModuleById } from '../../shared'
-import { inlineScriptStore, restoreInlineScriptReferences } from '../../flow/inlineScriptsUtils'
-
-/**
- * Creates mock FlowAIChatHelpers for eval testing.
- * Tracks flow state in memory and allows tool functions to modify it.
- */
-export function createFlowEvalHelpers(
-	initialModules: FlowModule[] = [],
-	initialSchema?: Record<string, any>
-) {
-	let flow: ExtendedOpenFlow = {
-		value: { modules: structuredClone(initialModules) },
-		summary: '',
-		schema: initialSchema ?? {
-			$schema: 'https://json-schema.org/draft/2020-12/schema',
-			properties: {},
-			required: [],
-			type: 'object'
-		}
-	}
-
-	const helpers: FlowAIChatHelpers = {
-		getFlowAndSelectedId: () => ({ flow, selectedId: '' }),
-
-		getModules: (id?: string) => {
-			if (!id) return flow.value.modules
-			const module = findModuleById(flow.value.modules, id)
-			return module ? [module] : []
-		},
-
-		setSnapshot: () => {
-			// No-op for eval - we don't need snapshot tracking
-		},
-
-		revertToSnapshot: () => {
-			// No-op for eval
-		},
-
-		setCode: async (id: string, code: string) => {
-			const module = findModuleById(flow.value.modules, id)
-			if (module && module.value.type === 'rawscript') {
-				module.value.content = code
-			}
-			// Keep store coherent for subsequent set_flow_json calls with references
-			inlineScriptStore.set(id, code)
-		},
-
-		setFlowJson: async (
-			modules: FlowModule[] | undefined,
-			schema: Record<string, any> | undefined
-		) => {
-			if (modules) {
-				// Restore inline script references back to full content
-				const restoredModules = restoreInlineScriptReferences(modules)
-				flow.value.modules = restoredModules
-			}
-
-			// Update schema if provided
-			if (schema !== undefined) {
-				flow.schema = schema
-			}
-		},
-
-		getFlowInputsSchema: async () => flow.schema ?? {},
-
-		updateExprsToSet: (_id: string, _inputTransforms: Record<string, InputTransform>) => {
-			// No-op for eval - UI-only functionality
-		},
-
-		acceptAllModuleActions: () => {
-			// No-op for eval
-		},
-
-		rejectAllModuleActions: () => {
-			// No-op for eval
-		},
-
-		hasPendingChanges: () => false,
-
-		selectStep: (_id: string) => {
-			// No-op for eval
-		},
-
-		testFlow: async () => {
-			// Return mock job ID - we don't actually run flows in eval
-			return 'mock-job-id-' + Date.now()
-		},
-
-		getLintErrors: async () => {
-			// Return empty lint result for eval
-			return { errorCount: 0, warningCount: 0, errors: [], warnings: [] }
-		}
-	}
-
-	return {
-		helpers,
-		getFlow: () => flow,
-		getModules: () => flow.value.modules
-	}
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalRunner.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalRunner.ts
deleted file mode 100644
index f3c976950d..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/flow/flowEvalRunner.ts
+++ /dev/null
@@ -1,186 +0,0 @@
-import type { FlowModule } from '$lib/gen'
-import type { AIProvider } from '$lib/gen/types.gen'
-import type { ExtendedOpenFlow } from '$lib/components/flows/types'
-import { flowTools, prepareFlowSystemMessage, prepareFlowUserMessage, type FlowAIChatHelpers } from '../../flow/core'
-import { createFlowEvalHelpers } from './flowEvalHelpers'
-import { evaluateFlowComparison, type ExpectedFlow } from './flowEvalComparison'
-import {
-	runEval,
-	resolveSystemPrompt,
-	resolveTools,
-	resolveModel,
-	writeComparisonResults,
-	type VariantConfig,
-	type BaseEvalResult,
-	type EvaluationResult,
-	type Tool,
-	type VariantDefaults
-} from '../shared'
-
-// Re-export for convenience
-export type { ExpectedFlow } from './flowEvalComparison'
-
-/**
- * Flow-specific evaluation result.
- */
-export interface FlowEvalResult extends BaseEvalResult<ExtendedOpenFlow> {
-	/** Alias for output to maintain API compatibility */
-	flow: ExtendedOpenFlow
-}
-
-/**
- * Options for running a flow evaluation.
- */
-export interface FlowEvalOptions {
-	initialModules?: FlowModule[]
-	initialSchema?: Record<string, any>
-	model?: string
-	customSystemPrompt?: string
-	maxIterations?: number
-	variant?: VariantConfig
-	expectedFlow?: ExpectedFlow
-	/** AI provider (inferred from model name if omitted) */
-	provider?: AIProvider
-}
-
-/**
- * Flow-specific variant defaults.
- */
-const flowDefaults: VariantDefaults<FlowAIChatHelpers> = {
-	prepareSystemMessage: prepareFlowSystemMessage,
-	tools: flowTools as Tool<FlowAIChatHelpers>[]
-}
-
-/**
- * Runs a flow chat evaluation using the shared chat loop (same code path as production).
- */
-export async function runFlowEval(
-	userPrompt: string,
-	apiKey: string,
-	options?: FlowEvalOptions
-): Promise<FlowEvalResult> {
-	const { helpers, getFlow } = createFlowEvalHelpers(
-		options?.initialModules ?? [],
-		options?.initialSchema
-	)
-
-	// Resolve variant configuration
-	const variantName = options?.variant?.name ?? 'baseline'
-	const systemMessage = resolveSystemPrompt(options?.variant, flowDefaults, options?.customSystemPrompt)
-	const { tools } = resolveTools(options?.variant, flowDefaults)
-	const model = resolveModel(options?.variant, options?.model)
-
-	// Build user message
-	const userMessage = prepareFlowUserMessage(userPrompt, helpers.getFlowAndSelectedId(), [])
-
-	// Run the base evaluation
-	const rawResult = await runEval({
-		userPrompt,
-		systemMessage,
-		userMessage,
-		tools,
-		helpers,
-		apiKey,
-		getOutput: getFlow,
-		options: {
-			maxIterations: options?.maxIterations,
-			model,
-			workspace: 'test-workspace',
-			provider: options?.provider
-		}
-	})
-
-	// Run evaluation if expected flow is provided
-	let evaluationResult: EvaluationResult | undefined
-	if (options?.expectedFlow) {
-		const generatedFlow = getFlow()
-		evaluationResult = await evaluateFlowComparison(
-			{
-				summary: generatedFlow.summary,
-				value: { modules: generatedFlow.value.modules },
-				schema: generatedFlow.schema
-			},
-			options.expectedFlow,
-			userPrompt
-		)
-	}
-
-	return {
-		...rawResult,
-		variantName,
-		flow: rawResult.output,
-		evaluationResult
-	}
-}
-
-/**
- * Per-variant provider override.
- */
-export interface VariantProviderOverride {
-	provider: AIProvider
-	apiKey: string
-}
-
-/**
- * Runs the same prompt against multiple variants sequentially for comparison.
- * Accepts optional per-variant provider/apiKey overrides.
- */
-export async function runVariantComparison(
-	userPrompt: string,
-	variants: VariantConfig[],
-	defaultApiKey: string,
-	baseOptions?: Omit<FlowEvalOptions, 'variant'>,
-	providerOverrides?: VariantProviderOverride[]
-): Promise<FlowEvalResult[]> {
-	const results: FlowEvalResult[] = await Promise.all(
-		variants.map(async (variant, i) => {
-			const override = providerOverrides?.[i]
-			return await runFlowEval(userPrompt, override?.apiKey ?? defaultApiKey, {
-				...baseOptions,
-				variant,
-				provider: override?.provider ?? baseOptions?.provider
-			})
-		})
-	)
-	return results
-}
-
-/**
- * Writes flow comparison results to files.
- */
-export async function writeFlowComparisonResults(
-	userPrompt: string,
-	results: FlowEvalResult[],
-	outputDir?: string
-): Promise<{ summaryPath: string; flowPaths: string[] }> {
-	// @ts-ignore
-	const { dirname, join } = await import('path')
-	// @ts-ignore
-	const { fileURLToPath } = await import('url')
-
-	const __filename = fileURLToPath(import.meta.url)
-	const __dirname = dirname(__filename)
-
-	const resultsDir = outputDir ?? join(__dirname, 'results')
-
-	const result = await writeComparisonResults({
-		userPrompt,
-		results,
-		outputDir: resultsDir,
-		formatOutput: (flow: ExtendedOpenFlow) => ({
-			summary: flow.summary ?? '',
-			value: {
-				modules: flow.value.modules
-			},
-			schema: flow.schema ?? {
-				$schema: 'https://json-schema.org/draft/2020-12/schema',
-				properties: {},
-				required: [],
-				type: 'object'
-			}
-		}),
-		outputLabel: 'flow'
-	})
-
-	return { summaryPath: result.summaryPath, flowPaths: result.outputPaths }
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/baseline.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/baseline.ts
deleted file mode 100644
index bd20f4f8c2..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/baseline.ts
+++ /dev/null
@@ -1,12 +0,0 @@
-import type { VariantConfig } from '../../shared'
-
-/**
- * Baseline variant - uses the production system prompt and all tools.
- * This is the default configuration that matches the actual flow chat implementation.
- */
-export const BASELINE_VARIANT: VariantConfig = {
-	name: 'baseline',
-	description: 'Production configuration with default system prompt and all tools',
-	systemPrompt: { type: 'default' },
-	tools: { type: 'default' }
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/index.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/index.ts
deleted file mode 100644
index 914db4a398..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/index.ts
+++ /dev/null
@@ -1,6 +0,0 @@
-// Re-export all variant configurations
-export { BASELINE_VARIANT } from './baseline'
-export { MINIMAL_SINGLE_TOOL_VARIANT, setFlowJsonTool } from './minimal-single-tool'
-
-// Re-export types for convenience
-export type { VariantConfig } from '../../shared'
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/minimal-single-tool.ts b/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/minimal-single-tool.ts
deleted file mode 100644
index a07b24b691..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/flow/variants/minimal-single-tool.ts
+++ /dev/null
@@ -1,402 +0,0 @@
-import type { VariantConfig } from '../../shared'
-import type { Tool } from '../../shared'
-import type { FlowAIChatHelpers } from '../../../flow/core'
-import { flowTools } from '../../../flow/core'
-import openFlowSchema from '../../../flow/openFlow.json'
-
-/**
- * IDs of the granular flow editing tools that should be replaced by set_flow_json.
- */
-const FLOW_EDITING_TOOL_NAMES = [
-	'add_module',
-	'remove_module',
-	'remove_branch',
-	'modify_module',
-	'set_flow_schema'
-]
-
-/**
- * A single tool that sets the entire flow JSON at once.
- * This replaces the granular flow editing tools (add_module, remove_module, modify_module, etc.)
- */
-export const setFlowJsonTool: Tool<FlowAIChatHelpers> = {
-	def: {
-		type: 'function',
-		function: {
-			name: 'set_flow_json',
-			description:
-				'Set the entire flow by providing the complete flow object. This replaces all existing modules and schema.',
-			strict: false,
-			parameters: {
-				type: 'object',
-				properties: {
-					modules: {
-						type: 'array',
-						description: 'Array of flow modules',
-						items: {
-							type: 'object'
-						}
-					},
-					schema: {
-						type: 'object',
-						description:
-							'Flow input schema (JSON Schema format) defining parameters the flow accepts'
-					}
-				},
-				required: ['modules']
-			}
-		}
-	},
-	fn: async ({ args, helpers }) => {
-		const { modules, schema } = args as { modules: any[]; schema?: Record<string, any> }
-		await helpers.setFlowJson(modules, schema)
-		return `Flow updated with ${modules.length} module(s): [${modules.map((m: any) => m.id).join(', ')}]`
-	}
-}
-
-/**
- * Build the tools array for the minimal-single-tool variant.
- * Keeps all utility tools (search, resource type, test run, db schema, code generation instructions)
- * but replaces all flow editing tools with a single set_flow_json tool.
- */
-function buildMinimalSingleToolTools(): Tool<FlowAIChatHelpers>[] {
-	// Get all production tools except flow editing tools
-	const utilityTools = (flowTools as Tool<FlowAIChatHelpers>[]).filter(
-		(t) => !FLOW_EDITING_TOOL_NAMES.includes(t.def.function.name)
-	)
-
-	return [...utilityTools, setFlowJsonTool]
-}
-
-const MINIMAL_SINGLE_TOOL_SYSTEM_PROMPT = `You are a helpful assistant that creates and edits workflows on the Windmill platform.
-
-## IMPORTANT RULES
-
-**Reserved IDs - Do NOT use these module IDs:**
-- \`failure\` - Reserved for failure handler module
-- \`preprocessor\` - Reserved for preprocessor module
-- \`Input\` - Reserved for flow input reference
-
-## Tool Selection Guide
-
-**Flow Modification:**
-- **Create or modify the entire flow** → \`set_flow_json\` (provide complete modules array and optional schema)
-
-**Code & Scripts:**
-- **Get language-specific coding instructions** → \`get_instructions_for_code_generation\` (call BEFORE writing code)
-- **Find workspace scripts and flows** → \`search_workspace\`
-- **Get details of a specific script or flow** → \`get_runnable_details\`
-- **Find Windmill Hub scripts** → \`search_hub_scripts\`
-
-**Testing:**
-- **Test entire flow** → \`test_run_flow\`
-- **Test single step** → \`test_run_step\`
-
-**Resources & Schema:**
-- **Search resource types** → \`resource_type\`
-- **Get database schema** → \`get_db_schema\`
-
-## Common Mistakes to Avoid
-
-- **Don't forget \`input_transforms\`** - Rawscript parameters won't receive values without them
-- **Don't use spaces in module IDs** - Use underscores (e.g., \`fetch_data\` not \`fetch data\`)
-- **Don't reference future steps** - \`results.step_id\` only works for steps that execute before the current one
-- **Don't create duplicate IDs** - Each module ID must be unique in the flow
-
-## Flow Modification with set_flow_json
-
-Use the \`set_flow_json\` tool to set the entire flow structure at once. Provide the complete modules array and optionally the flow input schema.
-
-**Parameters:**
-- \`modules\`: Array of flow modules (required)
-- \`schema\`: Flow input schema in JSON Schema format (optional)
-
-**Example - Simple flow:**
-\`\`\`javascript
-set_flow_json({
-  modules: [
-    {
-      id: "fetch_data",
-      summary: "Fetch user data from API",
-      value: {
-        type: "rawscript",
-        language: "bun",
-        content: "export async function main(userId: string) { return { id: userId, name: 'John' }; }",
-        input_transforms: {
-          userId: { type: "javascript", expr: "flow_input.user_id" }
-        }
-      }
-    },
-    {
-      id: "process_data",
-      summary: "Process the fetched data",
-      value: {
-        type: "rawscript",
-        language: "bun",
-        content: "export async function main(data: any) { return { processed: true, ...data }; }",
-        input_transforms: {
-          data: { type: "javascript", expr: "results.fetch_data" }
-        }
-      }
-    }
-  ],
-  schema: {
-    type: "object",
-    properties: {
-      user_id: { type: "string", description: "User ID to fetch" }
-    },
-    required: ["user_id"]
-  }
-})
-\`\`\`
-
-**Example - Flow with for loop:**
-\`\`\`javascript
-set_flow_json({
-  modules: [
-    {
-      id: "get_items",
-      summary: "Get list of items",
-      value: {
-        type: "rawscript",
-        language: "bun",
-        content: "export async function main() { return [1, 2, 3]; }",
-        input_transforms: {}
-      }
-    },
-    {
-      id: "loop_items",
-      summary: "Process each item",
-      value: {
-        type: "forloopflow",
-        iterator: { type: "javascript", expr: "results.get_items" },
-        skip_failures: false,
-        parallel: true,
-        modules: [
-          {
-            id: "process_item",
-            summary: "Process single item",
-            value: {
-              type: "rawscript",
-              language: "bun",
-              content: "export async function main(item: number) { return item * 2; }",
-              input_transforms: {
-                item: { type: "javascript", expr: "flow_input.iter.value" }
-              }
-            }
-          }
-        ]
-      }
-    }
-  ]
-})
-\`\`\`
-
-**Example - Flow with branches (branchone):**
-\`\`\`javascript
-set_flow_json({
-  modules: [
-    {
-      id: "get_value",
-      summary: "Get a value to branch on",
-      value: {
-        type: "rawscript",
-        language: "bun",
-        content: "export async function main() { return 50; }",
-        input_transforms: {}
-      }
-    },
-    {
-      id: "branch_on_value",
-      summary: "Branch based on value",
-      value: {
-        type: "branchone",
-        branches: [
-          {
-            summary: "High value",
-            expr: "results.get_value > 75",
-            modules: [
-              {
-                id: "high_handler",
-                value: {
-                  type: "rawscript",
-                  language: "bun",
-                  content: "export async function main() { return 'high'; }",
-                  input_transforms: {}
-                }
-              }
-            ]
-          },
-          {
-            summary: "Medium value",
-            expr: "results.get_value > 25",
-            modules: [
-              {
-                id: "medium_handler",
-                value: {
-                  type: "rawscript",
-                  language: "bun",
-                  content: "export async function main() { return 'medium'; }",
-                  input_transforms: {}
-                }
-              }
-            ]
-          }
-        ],
-        default: [
-          {
-            id: "low_handler",
-            value: {
-              type: "rawscript",
-              language: "bun",
-              content: "export async function main() { return 'low'; }",
-              input_transforms: {}
-            }
-          }
-        ]
-      }
-    }
-  ]
-})
-\`\`\`
-
-Follow the user instructions carefully.
-At the end of your changes, explain precisely what you did and what the flow does now.
-ALWAYS test your modifications using the \`test_run_flow\` tool. If the user cancels the test run, do not try again and wait for the next user instruction.
-When testing steps that are sql scripts, the arguments to be passed are { database: $res:<db_resource> }.
-
-### Input Transforms for Rawscripts
-
-Rawscript modules use \`input_transforms\` to map function parameters to values. Each key in \`input_transforms\` corresponds to a parameter name in your script's \`main\` function.
-
-**Transform Types:**
-- \`static\`: Fixed value passed directly
-- \`javascript\`: Dynamic expression evaluated at runtime
-
-**Available Variables in JavaScript Expressions:**
-- \`flow_input.{property}\` - Access flow input parameters
-- \`results.{step_id}\` - Access output from a previous step
-- \`flow_input.iter.value\` - Current item when inside a for-loop
-- \`flow_input.iter.index\` - Current index when inside a for-loop
-
-**Example - Rawscript using flow input and previous step result:**
-\`\`\`json
-{
-  "id": "step_b",
-  "value": {
-    "type": "rawscript",
-    "language": "bun",
-    "content": "export async function main(userId: string, data: any[]) { return 'Hello, world!'; }",
-    "input_transforms": {
-      "userId": { "type": "javascript", "expr": "flow_input.user_id" },
-      "data": { "type": "javascript", "expr": "results.step_a" }
-    }
-  }
-}
-\`\`\`
-
-**Important:** The parameter names in \`input_transforms\` must match the function parameter names in your script.
-
-### Other Key Concepts
-- **Resources**: For flow inputs, use type "object" with format "resource-<type>". For step inputs, use "$res:path/to/resource"
-- **Module IDs**: Must be unique and valid identifiers. Used to reference results via \`results.step_id\`
-- **Module types**: Use 'bun' as default language for rawscript if unspecified
-
-### Writing Code for Modules
-
-**IMPORTANT: Before writing any code for a rawscript module, you MUST call the \`get_instructions_for_code_generation\` tool with the target language.** This tool provides essential language-specific instructions.
-
-Example: Before writing TypeScript/Bun code, call \`get_instructions_for_code_generation({ language: "bun" })\`
-
-### Creating Flows
-
-1. **Search for existing scripts first** (unless user explicitly asks to write from scratch):
-   - First: \`search_workspace\` to find workspace scripts and flows
-   - Use \`get_runnable_details\` to inspect a specific script or flow (inputs, description, code)
-   - Then: \`search_hub_scripts\` (only consider highly relevant results)
-   - Only create raw scripts if no suitable script is found
-
-2. **Build the complete flow using \`set_flow_json\`:**
-   - If using existing script: use \`type: "script"\` with \`path\`
-   - If creating rawscript: use \`type: "rawscript"\` with \`language\` and \`content\`
-   - **First call \`get_instructions_for_code_generation\` to get the correct code format**
-   - Always define \`input_transforms\` to connect parameters to flow inputs or previous step results
-
-### AI Agent Modules
-
-AI agents can use tools to accomplish tasks. When creating an AI agent module:
-
-\`\`\`javascript
-{
-  id: "support_agent",
-  summary: "AI agent for customer support",
-  value: {
-    type: "aiagent",
-    input_transforms: {
-      provider: { type: "static", value: "$res:f/ai_providers/openai" },
-      output_type: { type: "static", value: "text" },
-      user_message: { type: "javascript", expr: "flow_input.query" },
-      system_prompt: { type: "static", value: "You are a helpful assistant." }
-    },
-    tools: [
-      {
-        id: "search_docs",
-        summary: "Search_documentation",
-        value: {
-          tool_type: "flowmodule",
-          type: "rawscript",
-          language: "bun",
-          content: "export async function main(query: string) { return ['doc1', 'doc2']; }",
-          input_transforms: { query: { type: "static", value: "" } }
-        }
-      }
-    ]
-  }
-}
-\`\`\`
-
-- **Tool IDs**: Cannot contain spaces - use underscores
-- **Tool summaries**: Cannot contain spaces - use underscores
-- **Tool types**: \`flowmodule\` for scripts/flows, \`mcp\` for MCP server tools
-
-## Resource Types
-On Windmill, credentials and configuration are stored in resources. Resource types define the format of the resource.
-- Use the \`resource_type\` tool to search for available resource types (e.g. stripe, google, postgresql, etc.)
-- If the user needs a resource as flow input, set the property type in the schema to "object" and add a key called "format" set to "resource-nameofresourcetype" (e.g. "resource-stripe")
-- If the user wants a specific resource as step input, set the step value to a static string in the format: "$res:path/to/resource"
-
-### OpenFlow Schema Reference
-Below is the complete OpenAPI schema for OpenFlow. All field descriptions and behaviors are defined here. Refer to this as the authoritative reference when generating flow JSON:
-
-\`\`\`json
-${JSON.stringify(openFlowSchema, null, 2)}
-\`\`\`
-
-The schema includes detailed descriptions for:
-- **FlowModuleValue types**: rawscript, script, flow, forloopflow, whileloopflow, branchone, branchall, identity, aiagent
-- **Module configuration**: stop_after_if, skip_if, suspend, sleep, cache_ttl, retry, mock, timeout
-- **InputTransform**: static vs javascript, available variables (results, flow_input, flow_input.iter)
-- **Special modules**: preprocessor_module, failure_module
-- **Loop options**: iterator, parallel, parallelism, skip_failures
-- **Branch types**: BranchOne (first match), BranchAll (all execute)
-`
-
-/**
- * Minimal single-tool variant.
- * Replaces granular flow editing tools (add_module, remove_module, modify_module, etc.)
- * with a single set_flow_json tool, while keeping all other utility tools.
- * Uses the default system prompt.
- */
-export const MINIMAL_SINGLE_TOOL_VARIANT: VariantConfig = {
-	name: 'minimal-single-tool',
-	description:
-		'Default prompt with set_flow_json instead of granular flow editing tools, keeps all utility tools',
-	systemPrompt: {
-		type: 'custom',
-		content: MINIMAL_SINGLE_TOOL_SYSTEM_PROMPT
-	},
-	tools: {
-		type: 'custom',
-		tools: buildMinimalSingleToolTools()
-	}
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseLLMEvaluator.ts b/frontend/src/lib/components/copilot/chat/__tests__/shared/baseLLMEvaluator.ts
deleted file mode 100644
index bd7bd06d44..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseLLMEvaluator.ts
+++ /dev/null
@@ -1,135 +0,0 @@
-import Anthropic from '@anthropic-ai/sdk'
-import type { EvaluationResult } from './types'
-
-/**
- * Parameters for LLM-based evaluation.
- */
-export interface EvaluateParams {
-	/** The user's original request/prompt */
-	userPrompt: string
-	/** The generated output to evaluate */
-	generatedOutput: unknown
-	/** The expected/reference output */
-	expectedOutput: unknown
-	/** Domain-specific system prompt for the evaluator */
-	evaluatorSystemPrompt: string
-	/** Anthropic API key for evaluation */
-	apiKey?: string
-	/** Model to use for evaluation (default: 'claude-sonnet-4-5-20250514') */
-	model?: string
-}
-
-/**
- * Base evaluator system prompt template.
- * Domain-specific evaluators should build on this structure.
- */
-export const BASE_EVALUATOR_RESPONSE_FORMAT = `
-## Response Format
-You MUST respond with valid JSON only, no additional text:
-{
-  "resemblanceScore": <0-100 integer>,
-  "statement": "<brief 1-2 sentence summary of how well the output matches the user request and expected output>",
-  "missingRequirements": ["<list any requirements from user prompt that are missing or incorrectly implemented>"]
-}
-
-Score guidelines:
-- 90-100: Fully addresses user request, functionally equivalent to expected output
-- 70-89: Addresses most user requirements, same overall structure with minor differences
-- 50-69: Partially addresses user request, achieves similar goal but different approach
-- 30-49: Missing significant requirements from user request
-- 0-29: Does not address user request or significantly incorrect`
-
-/**
- * Evaluates how well a generated output matches an expected output using an LLM.
- * Uses Anthropic API directly instead of OpenRouter.
- */
-export async function evaluateWithLLM(params: EvaluateParams): Promise<EvaluationResult> {
-	const {
-		userPrompt,
-		generatedOutput,
-		expectedOutput,
-		evaluatorSystemPrompt,
-		apiKey,
-		model = 'claude-sonnet-4-5-20250514'
-	} = params
-
-	// @ts-ignore - process.env
-	const anthropicKey = apiKey ?? process.env.ANTHROPIC_API_KEY
-	if (!anthropicKey) {
-		return {
-			success: false,
-			resemblanceScore: 0,
-			statement: 'No API key available for evaluation',
-			error: 'ANTHROPIC_API_KEY not set and no apiKey provided'
-		}
-	}
-
-	const client = new Anthropic({ apiKey: anthropicKey })
-
-	const userMessage = `## User's Original Request
-${userPrompt}
-
-## Expected Reference Output
-\`\`\`json
-${JSON.stringify(expectedOutput, null, 2)}
-\`\`\`
-
-## Generated Output
-\`\`\`json
-${JSON.stringify(generatedOutput, null, 2)}
-\`\`\`
-
-Please evaluate how well the generated output:
-1. Fulfills ALL requirements from the user's original request
-2. Matches the structure and logic of the expected reference output`
-
-	try {
-		const response = await client.messages.create({
-			model,
-			max_tokens: 2048,
-			system: evaluatorSystemPrompt,
-			messages: [
-				{ role: 'user', content: userMessage }
-			],
-			temperature: 0
-		})
-
-		const textBlock = response.content.find((block) => block.type === 'text')
-		const content = textBlock?.text
-		if (!content) {
-			return {
-				success: false,
-				resemblanceScore: 0,
-				statement: 'No response from evaluator',
-				error: 'Empty response from LLM'
-			}
-		}
-
-		// Parse JSON response - handle potential markdown code blocks
-		let jsonContent = content.trim()
-		if (jsonContent.startsWith('```')) {
-			jsonContent = jsonContent.replace(/^```(?:json)?\n?/, '').replace(/\n?```$/, '')
-		}
-
-		const parsed = JSON.parse(jsonContent) as {
-			resemblanceScore: number
-			statement: string
-			missingRequirements?: string[]
-		}
-
-		return {
-			success: true,
-			resemblanceScore: Math.max(0, Math.min(100, Math.round(parsed.resemblanceScore))),
-			statement: parsed.statement,
-			missingRequirements: parsed.missingRequirements ?? []
-		}
-	} catch (err) {
-		const errorMessage = err instanceof Error ? err.message : String(err)
-		return {
-			success: false,
-			resemblanceScore: 0,
-			statement: 'Evaluation failed',
-			error: errorMessage
-		}
-	}
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseResultsWriter.ts b/frontend/src/lib/components/copilot/chat/__tests__/shared/baseResultsWriter.ts
deleted file mode 100644
index 0ecce615ba..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseResultsWriter.ts
+++ /dev/null
@@ -1,169 +0,0 @@
-// @ts-ignore
-import { writeFile, mkdir } from 'fs/promises'
-// @ts-ignore
-import { join, dirname } from 'path'
-// @ts-ignore
-import { fileURLToPath } from 'url'
-import type { BaseEvalResult } from './types'
-
-/**
- * Generates a timestamp string suitable for filenames.
- * Format: 2024-01-15T10-30-45-123Z (ISO but with dashes instead of colons)
- */
-export function generateTimestamp(): string {
-	return new Date().toISOString().replace(/:/g, '-')
-}
-
-/**
- * Parameters for writing comparison results.
- */
-export interface WriteResultsParams<TOutput> {
-	/** User prompt that was tested */
-	userPrompt: string
-	/** Results from all variants */
-	results: BaseEvalResult<TOutput>[]
-	/** Directory to write results to */
-	outputDir: string
-	/** Function to format domain-specific output for JSON files */
-	formatOutput: (output: TOutput) => unknown
-	/** Label for the output type (e.g., 'flow', 'app') */
-	outputLabel?: string
-}
-
-/**
- * Writes comparison results to files in the results folder.
- * Creates:
- * - summary.md - Summary with prompt and results table
- * - {variant_name}.json - Full result with metadata for each variant
- * - {variant_name}_{outputLabel}.json - Clean output for each variant
- */
-export async function writeComparisonResults<TOutput>(
-	params: WriteResultsParams<TOutput>
-): Promise<{ summaryPath: string; outputPaths: string[] }> {
-	const { userPrompt, results, outputDir, formatOutput, outputLabel = 'output' } = params
-	const timestamp = generateTimestamp()
-
-	// Ensure results directory exists
-	await mkdir(outputDir, { recursive: true })
-	const resultFolder = join(outputDir, timestamp)
-	await mkdir(resultFolder, { recursive: true })
-
-	// Check if any results have evaluation data
-	const hasEvaluation = results.some((r) => r.evaluationResult)
-
-	// Build summary markdown
-	const summaryLines: string[] = [
-		`# Eval Results - ${timestamp}`,
-		'',
-		'## User Prompt',
-		'```',
-		userPrompt.trim(),
-		'```',
-		'',
-		'## Results',
-		''
-	]
-
-	// Add results table header based on whether evaluation data exists
-	if (hasEvaluation) {
-		summaryLines.push(
-			'| Variant | Success | Total Tokens | Tool Calls | Iterations | Resemblance Score |'
-		)
-		summaryLines.push(
-			'|---------|---------|--------------|------------|------------|-------------------|'
-		)
-	} else {
-		summaryLines.push('| Variant | Success | Total Tokens | Tool Calls | Iterations |')
-		summaryLines.push('|---------|---------|--------------|------------|------------|')
-	}
-
-	for (const result of results) {
-		const baseRow = `| ${result.variantName} | ${result.success} | ${result.tokenUsage.total} | ${result.toolsCalled.length} | ${result.iterations}`
-		if (hasEvaluation) {
-			const score = result.evaluationResult?.resemblanceScore ?? 'N/A'
-			summaryLines.push(`${baseRow} | ${score} |`)
-		} else {
-			summaryLines.push(`${baseRow} |`)
-		}
-	}
-
-	// Add evaluation details section if available
-	if (hasEvaluation) {
-		summaryLines.push('')
-		summaryLines.push('## Evaluation Details')
-		summaryLines.push('')
-		for (const result of results) {
-			if (result.evaluationResult) {
-				summaryLines.push(`### ${result.variantName}`)
-				summaryLines.push('')
-				summaryLines.push(`**Score:** ${result.evaluationResult.resemblanceScore}/100`)
-				summaryLines.push('')
-				summaryLines.push(`**Statement:** ${result.evaluationResult.statement}`)
-				summaryLines.push('')
-				if (
-					result.evaluationResult.missingRequirements &&
-					result.evaluationResult.missingRequirements.length > 0
-				) {
-					summaryLines.push('**Missing Requirements:**')
-					for (const req of result.evaluationResult.missingRequirements) {
-						summaryLines.push(`- ${req}`)
-					}
-					summaryLines.push('')
-				}
-				if (result.evaluationResult.error) {
-					summaryLines.push(`**Error:** ${result.evaluationResult.error}`)
-					summaryLines.push('')
-				}
-			}
-		}
-	}
-
-	// Add errors section for failed variants
-	const failedResults = results.filter((r) => !r.success && r.error)
-	if (failedResults.length > 0) {
-		summaryLines.push('')
-		summaryLines.push('## Errors')
-		summaryLines.push('')
-		for (const result of failedResults) {
-			summaryLines.push(`### ${result.variantName}`)
-			summaryLines.push('')
-			summaryLines.push('```')
-			summaryLines.push(result.error!)
-			summaryLines.push('```')
-			summaryLines.push('')
-		}
-	}
-
-	const outputPaths: string[] = []
-
-	for (const result of results) {
-		const resultFilename = `${result.variantName}.json`
-		const resultPath = join(resultFolder, resultFilename)
-		outputPaths.push(resultPath)
-
-		const outputFilename = `${result.variantName}_${outputLabel}.json`
-		const outputPath = join(resultFolder, outputFilename)
-
-		// Write result JSON file (with metadata)
-		const resultData = {
-			variantName: result.variantName,
-			success: result.success,
-			error: result.error,
-			evaluationResult: result.evaluationResult,
-			toolsCalled: result.toolsCalled,
-			toolCallDetails: result.toolCallDetails,
-			messages: result.messages
-		}
-		await writeFile(resultPath, JSON.stringify(resultData, null, 2))
-
-		// Write clean output JSON file (domain-specific format)
-		const outputData = formatOutput(result.output)
-		await writeFile(outputPath, JSON.stringify(outputData, null, 2))
-	}
-
-	// Write summary markdown file
-	const summaryPath = join(resultFolder, `summary.md`)
-	await writeFile(summaryPath, summaryLines.join('\n'))
-
-	return { summaryPath, outputPaths }
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseVariants.ts b/frontend/src/lib/components/copilot/chat/__tests__/shared/baseVariants.ts
deleted file mode 100644
index 26d9bf57cc..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/shared/baseVariants.ts
+++ /dev/null
@@ -1,108 +0,0 @@
-import type {
-	ChatCompletionFunctionTool,
-	ChatCompletionSystemMessageParam
-} from 'openai/resources/chat/completions.mjs'
-import type { ChatCompletionTool } from 'openai/resources/chat/completions.mjs'
-import type { VariantConfig } from './types'
-
-/**
- * Generic tool interface that matches the structure used across chat modules.
- */
-export interface Tool<THelpers> {
-	def: ChatCompletionFunctionTool
-	fn: (params: {
-		args: Record<string, unknown>
-		workspace: string
-		helpers: THelpers
-		toolCallbacks: {
-			setToolStatus: (...args: unknown[]) => void
-			removeToolStatus: (...args: unknown[]) => void
-		}
-		toolId: string
-	}) => Promise<string>
-}
-
-/**
- * Domain-specific defaults for variant resolution.
- */
-export interface VariantDefaults<THelpers> {
-	/** Function to prepare system message, optionally with custom prompt */
-	prepareSystemMessage: (customPrompt?: string) => ChatCompletionSystemMessageParam
-	/** Available tools for the domain */
-	tools: Tool<THelpers>[]
-}
-
-/**
- * Resolves system prompt from variant config.
- * Returns the appropriate ChatCompletionSystemMessageParam based on config.
- */
-export function resolveSystemPrompt<THelpers>(
-	variant: VariantConfig | undefined,
-	defaults: VariantDefaults<THelpers>,
-	fallbackCustomPrompt?: string
-): ChatCompletionSystemMessageParam {
-	if (!variant?.systemPrompt || variant.systemPrompt.type === 'default') {
-		return defaults.prepareSystemMessage(fallbackCustomPrompt)
-	}
-
-	if (variant.systemPrompt.type === 'default-with-custom') {
-		return defaults.prepareSystemMessage(variant.systemPrompt.custom)
-	}
-
-	// type === 'custom'
-	return {
-		role: 'system',
-		content: variant.systemPrompt.content
-	}
-}
-
-/**
- * Resolves tools from variant config.
- * Returns both the tool definitions (for API) and full tools (for execution).
- */
-export function resolveTools<THelpers>(
-	variant: VariantConfig | undefined,
-	defaults: VariantDefaults<THelpers>
-): {
-	toolDefs: ChatCompletionTool[]
-	tools: Tool<THelpers>[]
-} {
-	if (!variant?.tools || variant.tools.type === 'default') {
-		return {
-			toolDefs: defaults.tools.map((t) => t.def),
-			tools: defaults.tools
-		}
-	}
-
-	if (variant.tools.type === 'subset') {
-		const includeList = (variant.tools as { type: 'subset'; include: string[] }).include
-		const subset = defaults.tools.filter((t) => includeList.includes(t.def.function.name))
-		return {
-			toolDefs: subset.map((t) => t.def),
-			tools: subset
-		}
-	}
-
-	if (variant.tools.type === 'custom') {
-		// Custom tools are typed as unknown[] in base VariantConfig but domain-specific
-		// code should ensure they are the correct Tool<THelpers> type
-		const customTools = variant.tools.tools as Tool<THelpers>[]
-		return {
-			toolDefs: customTools.map((t) => t.def),
-			tools: customTools
-		}
-	}
-
-	// Default fallback
-	return {
-		toolDefs: defaults.tools.map((t) => t.def),
-		tools: defaults.tools
-	}
-}
-
-/**
- * Resolves model from variant config with fallback.
- */
-export function resolveModel(variant?: VariantConfig, fallback?: string): string {
-	return variant?.model ?? fallback ?? 'gpt-4o'
-}
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/index.ts b/frontend/src/lib/components/copilot/chat/__tests__/shared/index.ts
deleted file mode 100644
index 0c1b3bc8cb..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/shared/index.ts
+++ /dev/null
@@ -1,28 +0,0 @@
-// Types
-export type {
-	TokenUsage,
-	ToolCallDetail,
-	EvaluationResult,
-	BaseEvalResult,
-	VariantConfig,
-	EvalRunnerOptions,
-	ToolCallbacks
-} from './types'
-
-export { createNoOpToolCallbacks } from './types'
-
-// Variant resolution
-export type { Tool, VariantDefaults } from './baseVariants'
-export { resolveSystemPrompt, resolveTools, resolveModel } from './baseVariants'
-
-// Eval runner
-export type { RawEvalResult, RunEvalParams } from './baseEvalRunner'
-export { runEval } from './baseEvalRunner'
-
-// LLM evaluator
-export type { EvaluateParams } from './baseLLMEvaluator'
-export { evaluateWithLLM, BASE_EVALUATOR_RESPONSE_FORMAT } from './baseLLMEvaluator'
-
-// Results writer
-export type { WriteResultsParams } from './baseResultsWriter'
-export { writeComparisonResults, generateTimestamp } from './baseResultsWriter'
diff --git a/frontend/src/lib/components/copilot/chat/__tests__/shared/types.ts b/frontend/src/lib/components/copilot/chat/__tests__/shared/types.ts
deleted file mode 100644
index 61f7f1fd1f..0000000000
--- a/frontend/src/lib/components/copilot/chat/__tests__/shared/types.ts
+++ /dev/null
@@ -1,107 +0,0 @@
-import type { ChatCompletionMessageParam } from 'openai/resources/chat/completions.mjs'
-import type { AIProvider } from '$lib/gen/types.gen'
-
-/**
- * Token usage tracking for LLM calls.
- */
-export interface TokenUsage {
-	prompt: number
-	completion: number
-	total: number
-}
-
-/**
- * Details of a single tool call made during evaluation.
- */
-export interface ToolCallDetail {
-	name: string
-	arguments: Record<string, unknown>
-}
-
-/**
- * Result of LLM-based comparison/evaluation.
- */
-export interface EvaluationResult {
-	success: boolean
-	resemblanceScore: number
-	statement: string
-	missingRequirements?: string[]
-	error?: string
-}
-
-/**
- * Base evaluation result that can be extended for domain-specific outputs.
- * @template TOutput The domain-specific output type (e.g., flow definition, app files)
- */
-export interface BaseEvalResult<TOutput> {
-	success: boolean
-	output: TOutput
-	error?: string
-	tokenUsage: TokenUsage
-	toolCallsCount: number
-	toolsCalled: string[]
-	toolCallDetails: ToolCallDetail[]
-	iterations: number
-	variantName: string
-	evaluationResult?: EvaluationResult
-	messages: ChatCompletionMessageParam[]
-}
-
-/**
- * Base configuration for a variant in eval testing.
- * Allows customizing system prompt, tools, and model for comparison.
- *
- * Note: Domain-specific variants may extend this with custom tool configurations.
- * See flow/flowEvalVariants.ts for an example with custom tools.
- */
-export interface VariantConfig {
-	name: string
-	description?: string
-
-	/** System prompt configuration */
-	systemPrompt?:
-		| { type: 'default' }
-		| { type: 'default-with-custom'; custom: string }
-		| { type: 'custom'; content: string }
-
-	/** Tools configuration - basic types supported by shared code */
-	tools?:
-		| { type: 'default' }
-		| { type: 'subset'; include: string[] }
-		| { type: 'custom'; tools: unknown[] }
-
-	/** Model to use (default: 'gpt-4o') */
-	model?: string
-}
-
-/**
- * Options for running an evaluation.
- */
-export interface EvalRunnerOptions {
-	/** Maximum iterations for tool call loop (default: 20) */
-	maxIterations?: number
-	/** Model to use for LLM calls */
-	model?: string
-	/** Workspace ID for tool calls */
-	workspace?: string
-	/** AI provider (inferred from model name if omitted) */
-	provider?: AIProvider
-}
-
-/**
- * No-op tool callbacks for eval testing.
- */
-export interface ToolCallbacks {
-	setToolStatus: (id: string, status: { content?: string; result?: string; error?: string }) => void
-	removeToolStatus: (id: string) => void
-}
-
-/**
- * Creates no-op tool callbacks for eval testing.
- */
-export function createNoOpToolCallbacks(): ToolCallbacks {
-	return {
-		setToolStatus: () => {},
-		removeToolStatus: () => {}
-	}
-}
diff --git a/frontend/src/lib/components/copilot/chat/anthropic.ts b/frontend/src/lib/components/copilot/chat/anthropic.ts
index ac45c175a6..4bc581db88 100644
--- a/frontend/src/lib/components/copilot/chat/anthropic.ts
+++ b/frontend/src/lib/components/copilot/chat/anthropic.ts
@@ -17,6 +17,12 @@ import type { MessageStream } from '@anthropic-ai/sdk/lib/MessageStream'
 import type { AIProviderModel } from '$lib/gen'
 import { getProviderAndCompletionConfig, workspaceAIClients } from '../lib'
 import { processToolCall, type Tool, type ToolCallbacks } from './shared'
+import { anthropicUsageToChatTokenUsage, type ChatTokenUsage } from './tokenUsage'
+
+interface ParsedCompletionResult {
+	shouldContinue: boolean
+	tokenUsage: ChatTokenUsage
+}
 
 export async function getAnthropicCompletion(
 	messages: ChatCompletionMessageParam[],
@@ -70,7 +76,7 @@ export async function parseAnthropicCompletion(
 	helpers: any,
 	abortController?: AbortController,
 	options?: { workspace?: string }
-): Promise<boolean> {
+): Promise<ParsedCompletionResult> {
 	let toolCallsToProcess: ChatCompletionMessageFunctionToolCall[] = []
 	let error = null
 
@@ -205,6 +211,9 @@ export async function parseAnthropicCompletion(
 		throw error
 	}
 
+	const finalMessage = await completion.finalMessage()
+	const tokenUsage = anthropicUsageToChatTokenUsage(finalMessage.usage)
+
 	// Process tool calls if any
 	if (toolCallsToProcess.length > 0) {
 		const assistantWithTools = {
@@ -226,10 +235,10 @@ export async function parseAnthropicCompletion(
 			messages.push(messageToAdd)
 			addedMessages.push(messageToAdd)
 		}
-		return true // Continue the conversation loop
+		return { shouldContinue: true, tokenUsage }
 	}
 
-	return false // End the conversation
+	return { shouldContinue: false, tokenUsage }
 }
 
 export function convertOpenAIToAnthropicMessages(messages: ChatCompletionMessageParam[]): {
diff --git a/frontend/src/lib/components/copilot/chat/chatLoop.ts b/frontend/src/lib/components/copilot/chat/chatLoop.ts
index 4b239e4a05..3c27317be3 100644
--- a/frontend/src/lib/components/copilot/chat/chatLoop.ts
+++ b/frontend/src/lib/components/copilot/chat/chatLoop.ts
@@ -13,6 +13,11 @@ import {
 	parseOpenAIResponsesCompletion
 } from './openai-responses'
 import type { Tool, ToolCallbacks } from './shared'
+import {
+	addChatTokenUsage,
+	emptyChatTokenUsage,
+	type ChatTokenUsage
+} from './tokenUsage'
 
 export interface ChatClients {
 	openai: OpenAI
@@ -49,6 +54,7 @@ export interface ChatLoopConfig {
 
 export interface ChatLoopResult {
 	addedMessages: ChatCompletionMessageParam[]
+	tokenUsage: ChatTokenUsage
 }
 
 export async function runChatLoop(config: ChatLoopConfig): Promise<ChatLoopResult> {
@@ -66,6 +72,7 @@ export async function runChatLoop(config: ChatLoopConfig): Promise<ChatLoopResul
 	let skipResponsesApi = config.skipResponsesApi ?? false
 
 	const addedMessages: ChatCompletionMessageParam[] = []
+	let tokenUsage = emptyChatTokenUsage()
 	let iterations = 0
 
 	while (true) {
@@ -122,7 +129,8 @@ export async function runChatLoop(config: ChatLoopConfig): Promise<ChatLoopResul
 						helpers,
 						parseOptions
 					)
-					if (!continueCompletion) {
+					tokenUsage = addChatTokenUsage(tokenUsage, continueCompletion.tokenUsage)
+					if (!continueCompletion.shouldContinue) {
 						break
 					}
 				} catch (err) {
@@ -155,7 +163,8 @@ export async function runChatLoop(config: ChatLoopConfig): Promise<ChatLoopResul
 					undefined,
 					parseOptions
 				)
-				if (!continueCompletion) {
+				tokenUsage = addChatTokenUsage(tokenUsage, continueCompletion.tokenUsage)
+				if (!continueCompletion.shouldContinue) {
 					break
 				}
 			}
@@ -180,7 +189,8 @@ export async function runChatLoop(config: ChatLoopConfig): Promise<ChatLoopResul
 					abortController,
 					parseOptions
 				)
-				if (!continueCompletion) {
+				tokenUsage = addChatTokenUsage(tokenUsage, continueCompletion.tokenUsage)
+				if (!continueCompletion.shouldContinue) {
 					break
 				}
 			}
@@ -200,12 +210,13 @@ export async function runChatLoop(config: ChatLoopConfig): Promise<ChatLoopResul
 					undefined,
 					parseOptions
 				)
-				if (!continueCompletion) {
+				tokenUsage = addChatTokenUsage(tokenUsage, continueCompletion.tokenUsage)
+				if (!continueCompletion.shouldContinue) {
 					break
 				}
 			}
 		}
 	}
 
-	return { addedMessages }
+	return { addedMessages, tokenUsage }
 }
diff --git a/frontend/src/lib/components/copilot/chat/flow/FlowAIChat.svelte b/frontend/src/lib/components/copilot/chat/flow/FlowAIChat.svelte
index 1a567db370..8fd4b35003 100644
--- a/frontend/src/lib/components/copilot/chat/flow/FlowAIChat.svelte
+++ b/frontend/src/lib/components/copilot/chat/flow/FlowAIChat.svelte
@@ -5,7 +5,7 @@
 	import { dfs } from '$lib/components/flows/previousResults'
 	import type { FlowModule, InputTransform, OpenFlow } from '$lib/gen'
 	import type { FlowAIChatHelpers } from './core'
-	import { restoreInlineScriptReferences } from './inlineScriptsUtils'
+	import { createInlineScriptSession } from './inlineScriptsUtils'
 	import { loadSchemaFromModule } from '$lib/components/flows/flowInfers'
 	import { aiChatManager } from '../AIChatManager.svelte'
 	import { refreshStateStore } from '$lib/svelte5Utils.svelte'
@@ -27,6 +27,7 @@
 	const selectedId = $derived(selectionManager.getSelectedId())
 
 	const { exprsToSet } = getContext<FlowCopilotContext | undefined>('FlowCopilotContext') ?? {}
+	const inlineScriptSession = createInlineScriptSession()
 
 	// Get diffManager from the graph
 	const diffManager = $derived(flowModuleSchemaMap?.getDiffManager())
@@ -62,6 +63,7 @@
 			}
 			return flowStore.val.value.modules
 		},
+		inlineScriptSession,
 		setSnapshot: (snapshot: ExtendedOpenFlow) => {
 			diffManager?.setBeforeFlow(snapshot)
 		},
@@ -103,6 +105,7 @@
 
 				// 2. Apply the code change
 				module.value.content = code
+				inlineScriptSession.set(id, code)
 				const { input_transforms, schema } = await loadSchemaFromModule(module)
 				module.value.input_transforms = input_transforms
 				refreshStateStore(flowStore)
@@ -216,7 +219,13 @@
 
 				if (modules) {
 					// Restore inline script references back to full content
-					const restoredModules = restoreInlineScriptReferences(modules)
+					const restoredModules = inlineScriptSession.restoreInlineScriptReferences(modules)
+					const unresolvedRefs = inlineScriptSession.findUnresolvedInlineScriptRefs(restoredModules)
+					if (unresolvedRefs.length > 0) {
+						throw new Error(
+							`Unresolved inline script references: ${unresolvedRefs.join(', ')}`
+						)
+					}
 					// Directly modify flowStore (immediate effect)
 					flowStore.val.value.modules = restoredModules
 				}
diff --git a/frontend/src/lib/components/copilot/chat/flow/core.ts b/frontend/src/lib/components/copilot/chat/flow/core.ts
index f695da0aa1..e8fd54020a 100644
--- a/frontend/src/lib/components/copilot/chat/flow/core.ts
+++ b/frontend/src/lib/components/copilot/chat/flow/core.ts
@@ -34,7 +34,7 @@ import {
 } from '../shared'
 import type { ContextElement } from '../context'
 import type { ExtendedOpenFlow } from '$lib/components/flows/types'
-import { inlineScriptStore, extractAndReplaceInlineScripts } from './inlineScriptsUtils'
+import type { InlineScriptSession } from './inlineScriptsUtils'
 import { flowModulesSchema } from './openFlowZod'
 import { collectAllModuleIdsFromArray } from './utils'
 import { getFlowPrompt } from '$system_prompts'
@@ -247,6 +247,7 @@ export interface FlowAIChatHelpers {
 	// flow context
 	getFlowAndSelectedId: () => { flow: ExtendedOpenFlow; selectedId: string }
 	getModules: (id?: string) => FlowModule[]
+	inlineScriptSession: InlineScriptSession
 
 	// snapshot management (AI sets this when making changes)
 	/** Set the before flow snapshot */
@@ -581,7 +582,7 @@ export const flowTools: Tool<FlowAIChatHelpers>[] = [
 	},
 	{
 		def: inspectInlineScriptToolDef,
-		fn: async ({ args, toolCallbacks, toolId }) => {
+		fn: async ({ args, helpers, toolCallbacks, toolId }) => {
 			const parsedArgs = inspectInlineScriptSchema.parse(args)
 			const moduleId = parsedArgs.moduleId
 
@@ -589,7 +590,7 @@ export const flowTools: Tool<FlowAIChatHelpers>[] = [
 				content: `Retrieving inline script content for module '${moduleId}'...`
 			})
 
-			const content = inlineScriptStore.get(moduleId)
+			const content = helpers.inlineScriptSession.get(moduleId)
 
 			if (content === undefined) {
 				toolCallbacks.setToolStatus(toolId, {
@@ -623,9 +624,6 @@ export const flowTools: Tool<FlowAIChatHelpers>[] = [
 
 			toolCallbacks.setToolStatus(toolId, { content: `Setting code for module '${moduleId}'...` })
 
-			// Update store to keep it coherent (for subsequent set_flow_json calls with references)
-			inlineScriptStore.set(moduleId, code)
-
 			// Update the flow directly via helper
 			await helpers.setCode(moduleId, code)
 
@@ -1057,7 +1055,8 @@ You have access to the following contexts:
 export function prepareFlowUserMessage(
 	instructions: string,
 	flowAndSelectedId?: { flow: ExtendedOpenFlow; selectedId: string },
-	selectedContext: ContextElement[] = []
+	selectedContext: ContextElement[] = [],
+	inlineScriptSession?: InlineScriptSession
 ): ChatCompletionUserMessageParam {
 	const flow = flowAndSelectedId?.flow
 	const selectedId = flowAndSelectedId?.selectedId
@@ -1075,10 +1074,13 @@ ${instructions}`
 	}
 
 	const codePieces = selectedContext.filter((c) => c.type === 'flow_module_code_piece')
+	const scriptSession = inlineScriptSession
 
 	// Clear the inline script store and extract inline scripts for token optimization
-	inlineScriptStore.clear()
-	const optimizedModules = extractAndReplaceInlineScripts(flow.value.modules)
+	scriptSession?.clear()
+	const optimizedModules = scriptSession
+		? scriptSession.extractAndReplaceInlineScripts(flow.value.modules)
+		: flow.value.modules
 
 	// Apply code pieces to the optimized modules (returns YAML string)
 	const flowModulesYaml = applyCodePiecesToFlowModules(codePieces, optimizedModules)
@@ -1086,7 +1088,7 @@ ${instructions}`
 	// Handle preprocessor and failure modules
 	let optimizedPreprocessor = flow.value.preprocessor_module
 	if (optimizedPreprocessor?.value?.type === 'rawscript' && optimizedPreprocessor.value.content) {
-		inlineScriptStore.set(optimizedPreprocessor.id, optimizedPreprocessor.value.content)
+		scriptSession?.set(optimizedPreprocessor.id, optimizedPreprocessor.value.content)
 		optimizedPreprocessor = {
 			...optimizedPreprocessor,
 			value: {
@@ -1098,7 +1100,7 @@ ${instructions}`
 
 	let optimizedFailure = flow.value.failure_module
 	if (optimizedFailure?.value?.type === 'rawscript' && optimizedFailure.value.content) {
-		inlineScriptStore.set(optimizedFailure.id, optimizedFailure.value.content)
+		scriptSession?.set(optimizedFailure.id, optimizedFailure.value.content)
 		optimizedFailure = {
 			...optimizedFailure,
 			value: {
diff --git a/frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils.ts b/frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils.ts
index 9a124c44d7..4180b7e36b 100644
--- a/frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils.ts
+++ b/frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils.ts
@@ -1,10 +1,17 @@
 import type { FlowModule } from '$lib/gen'
 
-/**
- * Storage for inline scripts extracted from flow modules.
- * Maps module IDs to their rawscript content for token-efficient transmission to AI.
- */
-class InlineScriptStore {
+export interface InlineScriptSession {
+	clear(): void
+	set(moduleId: string, content: string): void
+	get(moduleId: string): string | undefined
+	has(moduleId: string): boolean
+	getAll(): Record<string, string>
+	extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModule[]
+	restoreInlineScriptReferences(modules: FlowModule[]): FlowModule[]
+	findUnresolvedInlineScriptRefs(modules: FlowModule[]): string[]
+}
+
+class DefaultInlineScriptSession implements InlineScriptSession {
 	private scripts: Map<string, string> = new Map()
 
 	clear() {
@@ -26,15 +33,28 @@ class InlineScriptStore {
 	getAll(): Record<string, string> {
 		return Object.fromEntries(this.scripts.entries())
 	}
+
+	extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModule[] {
+		return extractAndReplaceInlineScripts(modules, this)
+	}
+
+	restoreInlineScriptReferences(modules: FlowModule[]): FlowModule[] {
+		return restoreInlineScriptReferences(modules, this)
+	}
+
+	findUnresolvedInlineScriptRefs(modules: FlowModule[]): string[] {
+		return findUnresolvedInlineScriptRefs(modules)
+	}
 }
 
-export const inlineScriptStore = new InlineScriptStore()
+export function createInlineScriptSession(): InlineScriptSession {
+	return new DefaultInlineScriptSession()
+}
 
-/**
- * Recursively extracts all rawscript content from flow modules and stores them.
- * Replaces the content with references like "inline_script.{module_id}".
- */
-export function extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModule[] {
+function extractAndReplaceInlineScripts(
+	modules: FlowModule[],
+	session: Pick<InlineScriptSession, 'set'>
+): FlowModule[] {
 	if (!modules || !Array.isArray(modules)) {
 		return []
 	}
@@ -43,52 +63,45 @@ export function extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModul
 		const newModule = { ...module }
 
 		if (newModule.value.type === 'rawscript' && newModule.value.content) {
-			// Store the original content
-			inlineScriptStore.set(module.id, newModule.value.content)
-
-			// Replace with reference
+			session.set(module.id, newModule.value.content)
 			newModule.value = {
 				...newModule.value,
 				content: `inline_script.${module.id}`
 			}
 		} else if (newModule.value.type === 'forloopflow' || newModule.value.type === 'whileloopflow') {
-			// Recursively process nested modules in loops
 			if (newModule.value.modules) {
 				newModule.value = {
 					...newModule.value,
-					modules: extractAndReplaceInlineScripts(newModule.value.modules)
+					modules: extractAndReplaceInlineScripts(newModule.value.modules, session)
 				}
 			}
 		} else if (newModule.value.type === 'branchone') {
-			// Process branches and default modules
 			if (newModule.value.branches) {
 				newModule.value = {
 					...newModule.value,
 					branches: newModule.value.branches.map((branch) => ({
 						...branch,
-						modules: branch.modules ? extractAndReplaceInlineScripts(branch.modules) : []
+						modules: branch.modules ? extractAndReplaceInlineScripts(branch.modules, session) : []
 					}))
 				}
 			}
 			if (newModule.value.default) {
 				newModule.value = {
 					...newModule.value,
-					default: extractAndReplaceInlineScripts(newModule.value.default)
+					default: extractAndReplaceInlineScripts(newModule.value.default, session)
 				}
 			}
 		} else if (newModule.value.type === 'branchall') {
-			// Process all branches
 			if (newModule.value.branches) {
 				newModule.value = {
 					...newModule.value,
 					branches: newModule.value.branches.map((branch) => ({
 						...branch,
-						modules: branch.modules ? extractAndReplaceInlineScripts(branch.modules) : []
+						modules: branch.modules ? extractAndReplaceInlineScripts(branch.modules, session) : []
 					}))
 				}
 			}
 		} else if (newModule.value.type === 'aiagent') {
-			// Process AI agent tools
 			if (newModule.value.tools) {
 				newModule.value = {
 					...newModule.value,
@@ -102,7 +115,7 @@ export function extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModul
 							'content' in tool.value &&
 							tool.value.content
 						) {
-							inlineScriptStore.set(tool.id, tool.value.content as string)
+							session.set(tool.id, tool.value.content as string)
 							return {
 								...tool,
 								value: {
@@ -121,70 +134,58 @@ export function extractAndReplaceInlineScripts(modules: FlowModule[]): FlowModul
 	})
 }
 
-/**
- * Recursively restores inline script references back to their full content.
- * If content matches pattern "inline_script.{id}", looks up and restores the original.
- * If content doesn't match (new/modified script), keeps it as-is.
- */
-export function restoreInlineScriptReferences(modules: FlowModule[]): FlowModule[] {
+function restoreInlineScriptReferences(
+	modules: FlowModule[],
+	session: Pick<InlineScriptSession, 'get'>
+): FlowModule[] {
 	return modules.map((module) => {
 		const newModule = { ...module }
 
 		if (newModule.value.type === 'rawscript' && newModule.value.content) {
-			const content = newModule.value.content
-			// Check if it's a reference
-			const match = content.match(/^inline_script\.(.+)$/)
+			const match = newModule.value.content.match(/^inline_script\.(.+)$/)
 			if (match) {
-				const moduleId = match[1]
-				const storedContent = inlineScriptStore.get(moduleId)
+				const storedContent = session.get(match[1])
 				if (storedContent !== undefined) {
-					// Restore original content
 					newModule.value = {
 						...newModule.value,
 						content: storedContent
 					}
 				}
-				// If not found in store, keep the reference as-is (shouldn't happen normally)
 			}
-			// If not a reference, it's new/modified content - keep as-is
 		} else if (newModule.value.type === 'forloopflow' || newModule.value.type === 'whileloopflow') {
-			// Recursively process nested modules in loops
 			if (newModule.value.modules) {
 				newModule.value = {
 					...newModule.value,
-					modules: restoreInlineScriptReferences(newModule.value.modules)
+					modules: restoreInlineScriptReferences(newModule.value.modules, session)
 				}
 			}
 		} else if (newModule.value.type === 'branchone') {
-			// Process branches and default modules
 			if (newModule.value.branches) {
 				newModule.value = {
 					...newModule.value,
 					branches: newModule.value.branches.map((branch) => ({
 						...branch,
-						modules: branch.modules ? restoreInlineScriptReferences(branch.modules) : []
+						modules: branch.modules ? restoreInlineScriptReferences(branch.modules, session) : []
 					}))
 				}
 			}
 			if (newModule.value.default) {
 				newModule.value = {
 					...newModule.value,
-					default: restoreInlineScriptReferences(newModule.value.default)
+					default: restoreInlineScriptReferences(newModule.value.default, session)
 				}
 			}
 		} else if (newModule.value.type === 'branchall') {
-			// Process all branches
 			if (newModule.value.branches) {
 				newModule.value = {
 					...newModule.value,
 					branches: newModule.value.branches.map((branch) => ({
 						...branch,
-						modules: branch.modules ? restoreInlineScriptReferences(branch.modules) : []
+						modules: branch.modules ? restoreInlineScriptReferences(branch.modules, session) : []
 					}))
 				}
 			}
 		} else if (newModule.value.type === 'aiagent') {
-			// Process AI agent tools
 			if (newModule.value.tools) {
 				newModule.value = {
 					...newModule.value,
@@ -198,11 +199,9 @@ export function restoreInlineScriptReferences(modules: FlowModule[]): FlowModule
 							'content' in tool.value &&
 							tool.value.content
 						) {
-							const content = tool.value.content as string
-							const match = content.match(/^inline_script\.(.+)$/)
+							const match = (tool.value.content as string).match(/^inline_script\.(.+)$/)
 							if (match) {
-								const toolId = match[1]
-								const storedContent = inlineScriptStore.get(toolId)
+								const storedContent = session.get(match[1])
 								if (storedContent !== undefined) {
 									return {
 										...tool,
@@ -224,11 +223,7 @@ export function restoreInlineScriptReferences(modules: FlowModule[]): FlowModule
 	})
 }
 
-/**
- * Recursively finds any unresolved inline script references in flow modules.
- * Returns array of module IDs that still have `inline_script.{id}` patterns.
- */
-export function findUnresolvedInlineScriptRefs(modules: FlowModule[]): string[] {
+function findUnresolvedInlineScriptRefs(modules: FlowModule[]): string[] {
 	const unresolvedRefs: string[] = []
 
 	function checkModule(module: FlowModule) {
@@ -257,7 +252,6 @@ export function findUnresolvedInlineScriptRefs(modules: FlowModule[]): string[]
 				})
 			}
 		} else if (module.value.type === 'aiagent') {
-			// Check AI agent tools
 			if (module.value.tools) {
 				for (const tool of module.value.tools) {
 					if (
diff --git a/frontend/src/lib/components/copilot/chat/openai-responses.ts b/frontend/src/lib/components/copilot/chat/openai-responses.ts
index 56364e1401..c557132417 100644
--- a/frontend/src/lib/components/copilot/chat/openai-responses.ts
+++ b/frontend/src/lib/components/copilot/chat/openai-responses.ts
@@ -14,6 +14,15 @@ import {
 import { processToolCall, type Tool, type ToolCallbacks } from './shared'
 import type { ResponseStream } from 'openai/lib/responses/ResponseStream.mjs'
 import type { AIProviderModel } from '$lib/gen'
+import {
+	openAIResponsesUsageToChatTokenUsage,
+	type ChatTokenUsage
+} from './tokenUsage'
+
+interface ParsedCompletionResult {
+	shouldContinue: boolean
+	tokenUsage: ChatTokenUsage
+}
 
 // Conversion utilities for Responses API
 function convertMessagesToResponsesInput(messages: ChatCompletionMessageParam[]): {
@@ -219,7 +228,7 @@ export async function parseOpenAIResponsesCompletion(
 	tools: Tool<any>[],
 	helpers: any,
 	options?: { workspace?: string }
-): Promise<boolean> {
+): Promise<ParsedCompletionResult> {
 	let toolCallsToProcess: ChatCompletionMessageFunctionToolCall[] = []
 	let error: OpenAIError | ResponseErrorEvent | null = null
 	let textContent = ''
@@ -337,6 +346,9 @@ export async function parseOpenAIResponsesCompletion(
 		throw error
 	}
 
+	const finalResponse = await runner.finalResponse()
+	const tokenUsage = openAIResponsesUsageToChatTokenUsage(finalResponse.usage)
+
 	// Process tool calls if any
 	if (toolCallsToProcess.length > 0) {
 		const assistantWithTools = {
@@ -358,10 +370,10 @@ export async function parseOpenAIResponsesCompletion(
 			messages.push(messageToAdd)
 			addedMessages.push(messageToAdd)
 		}
-		return true // Continue the conversation loop
+		return { shouldContinue: true, tokenUsage }
 	}
 
-	return false // End the conversation
+	return { shouldContinue: false, tokenUsage }
 }
 
 export async function getNonStreamingOpenAIResponsesCompletion(
diff --git a/frontend/src/lib/components/copilot/chat/shared.ts b/frontend/src/lib/components/copilot/chat/shared.ts
index 20e488d923..a1a8e3497f 100644
--- a/frontend/src/lib/components/copilot/chat/shared.ts
+++ b/frontend/src/lib/components/copilot/chat/shared.ts
@@ -920,8 +920,6 @@ export function formatScriptLintResult(lintResult: ScriptLintResult): string {
 	return response
 }
 
-// ============= Workspace Runnables Search =============
-
 export class WorkspaceRunnablesSearch {
 	private uf: uFuzzy
 	private scriptsWorkspace: string | undefined = undefined
diff --git a/frontend/src/lib/components/copilot/chat/tokenUsage.ts b/frontend/src/lib/components/copilot/chat/tokenUsage.ts
new file mode 100644
index 0000000000..d1b7a2e56b
--- /dev/null
+++ b/frontend/src/lib/components/copilot/chat/tokenUsage.ts
@@ -0,0 +1,73 @@
+/** Provider-agnostic token counts for one or more chat completion calls. */
+export interface ChatTokenUsage {
+	prompt: number
+	completion: number
+	total: number
+}
+
+/** Builds a usage record; `total` defaults to `prompt + completion` when not reported. */
+function toChatTokenUsage(
+	prompt: number,
+	completion: number,
+	reported?: number | null
+): ChatTokenUsage {
+	return { prompt, completion, total: reported ?? prompt + completion }
+}
+
+/** Returns a new usage record with every counter set to zero. */
+export function emptyChatTokenUsage(): ChatTokenUsage {
+	return { prompt: 0, completion: 0, total: 0 }
+}
+
+/**
+ * Returns `total` plus `usage` as a new object; a null/undefined `usage` returns `total` as-is.
+ */
+export function addChatTokenUsage(
+	total: ChatTokenUsage,
+	usage: ChatTokenUsage | null | undefined
+): ChatTokenUsage {
+	if (!usage) {
+		return total
+	}
+	const { prompt, completion, total: usageTotal } = usage
+	return {
+		prompt: total.prompt + prompt,
+		completion: total.completion + completion,
+		total: total.total + usageTotal
+	}
+}
+
+/**
+ * Converts an Anthropic usage object. Cache-creation and cache-read input
+ * tokens are added to `input_tokens` to form the prompt count.
+ */
+export function anthropicUsageToChatTokenUsage(usage: {
+	input_tokens?: number | null
+	output_tokens?: number | null
+	cache_creation_input_tokens?: number | null
+	cache_read_input_tokens?: number | null
+} | null | undefined): ChatTokenUsage {
+	const inputTokens = usage?.input_tokens ?? 0
+	const cacheCreation = usage?.cache_creation_input_tokens ?? 0
+	const cacheRead = usage?.cache_read_input_tokens ?? 0
+	return toChatTokenUsage(inputTokens + cacheCreation + cacheRead, usage?.output_tokens ?? 0)
+}
+
+/** Converts OpenAI Responses API usage; a reported `total_tokens` is used as-is. */
+export function openAIResponsesUsageToChatTokenUsage(usage: {
+	input_tokens?: number | null
+	output_tokens?: number | null
+	total_tokens?: number | null
+} | null | undefined): ChatTokenUsage {
+	return toChatTokenUsage(usage?.input_tokens ?? 0, usage?.output_tokens ?? 0, usage?.total_tokens)
+}
+
+/** Converts OpenAI Chat Completions usage; a reported `total_tokens` is used as-is. */
+export function openAICompletionsUsageToChatTokenUsage(usage: {
+	prompt_tokens?: number | null
+	completion_tokens?: number | null
+	total_tokens?: number | null
+} | null | undefined): ChatTokenUsage {
+	const prompt = usage?.prompt_tokens ?? 0
+	return toChatTokenUsage(prompt, usage?.completion_tokens ?? 0, usage?.total_tokens)
+}
diff --git a/frontend/src/lib/components/copilot/lib.ts b/frontend/src/lib/components/copilot/lib.ts
index 4b0c8c0e9e..2650f4d3de 100644
--- a/frontend/src/lib/components/copilot/lib.ts
+++ b/frontend/src/lib/components/copilot/lib.ts
@@ -25,6 +25,11 @@ import { convertOpenAIToAnthropicMessages } from './chat/anthropic'
 import type { Stream } from 'openai/core/streaming.mjs'
 import { generateRandomString } from '$lib/utils'
 import { copilotInfo, getCurrentModel } from '$lib/aiStore'
+import {
+	emptyChatTokenUsage,
+	openAICompletionsUsageToChatTokenUsage,
+	type ChatTokenUsage
+} from './chat/tokenUsage'
 
 export const SUPPORTED_LANGUAGES = new Set(Object.keys(GEN_CONFIG.prompts))
 
@@ -905,7 +910,18 @@ export async function getCompletion(
 
 	// Use Completions API for other providers
 	const client = options?.openaiClient ?? workspaceAIClients.getOpenaiClient()
-	const completion = client.chat.completions.create(config, {
+	const completionConfig =
+		(provider === 'openai' || provider === 'azure_openai' || provider === 'googleai') &&
+		config.stream
+			? {
+					...config,
+					stream_options: {
+						...(config.stream_options ?? {}),
+						include_usage: true
+					}
+				}
+			: config
+	const completion = client.chat.completions.create(completionConfig, {
 		signal: abortController.signal,
 		headers: {
 			'X-Provider': provider
@@ -936,12 +952,16 @@ export async function parseOpenAICompletion(
 	helpers: any,
 	_abortController?: AbortController, // unused, for signature compatibility with parseAnthropicCompletion
 	options?: { workspace?: string }
-): Promise<boolean> {
+): Promise<{ shouldContinue: boolean; tokenUsage: ChatTokenUsage }> {
 	const finalToolCalls: Record<number, ChatCompletionChunk.Choice.Delta.ToolCall> = {}
 	let malformedFunctionCallError = false
+	let tokenUsage = emptyChatTokenUsage()
 
 	let answer = ''
 	for await (const chunk of completion) {
+		if ('usage' in chunk && chunk.usage) {
+			tokenUsage = openAICompletionsUsageToChatTokenUsage(chunk.usage)
+		}
 		if (!('choices' in chunk && chunk.choices.length > 0 && 'delta' in chunk.choices[0])) {
 			continue
 		}
@@ -1118,9 +1138,9 @@ export async function parseOpenAICompletion(
 		messages.push(toolResponse)
 		addedMessages.push(toolResponse)
 	} else {
-		return false
+		return { shouldContinue: false, tokenUsage }
 	}
-	return true
+	return { shouldContinue: true, tokenUsage }
 }
 
 export function getResponseFromEvent(part: OpenAI.Chat.Completions.ChatCompletionChunk): string {