Compare commits
186 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d3cb0c6220 | ||
|
|
a3f24aeff8 | ||
|
|
f1e84cb088 | ||
|
|
3aa279cfd7 | ||
|
|
5c179e5448 | ||
|
|
12d0a3de08 | ||
|
|
a98f5b9dfd | ||
|
|
75e204dad1 | ||
|
|
6158ff2ebe | ||
|
|
8ee14644f4 | ||
|
|
f273341759 | ||
|
|
64ba3a632e | ||
|
|
aebf758412 | ||
|
|
91064ce857 | ||
|
|
2c1fe88fed | ||
|
|
7fe639d91e | ||
|
|
06fe809ecc | ||
|
|
5069a3b2e3 | ||
|
|
e1dbce02c2 | ||
|
|
6bb80ff28b | ||
|
|
5b3913052e | ||
|
|
4dc54ca3aa | ||
|
|
89c8e4bb96 | ||
|
|
eb85da932a | ||
|
|
f7f26b3224 | ||
|
|
e0066b266f | ||
|
|
42d3e8c789 | ||
|
|
c889a185d5 | ||
|
|
baeb202037 | ||
|
|
9fb78164b4 | ||
|
|
64c58c824f | ||
|
|
b3ef4bc26c | ||
|
|
3f5841f84d | ||
|
|
78a877eb96 | ||
|
|
378ba78284 | ||
|
|
95411b2563 | ||
|
|
b6f1cc70cd | ||
|
|
cdcc56461b | ||
|
|
60211c1d19 | ||
|
|
6cebc6f61b | ||
|
|
59c457a138 | ||
|
|
b783bf2d83 | ||
|
|
9c85565221 | ||
|
|
e48c7cf448 | ||
|
|
8b2a8882bc | ||
|
|
5eb9a2e965 | ||
|
|
946848feef | ||
|
|
3d43d31aba | ||
|
|
8957d8f19b | ||
|
|
3c64a4282d | ||
|
|
ec9cec1d02 | ||
|
|
09666af157 | ||
|
|
6cf7ffc26b | ||
|
|
ce3e676f4a | ||
|
|
4fff89f98c | ||
|
|
d243eb31b0 | ||
|
|
a7512f9034 | ||
|
|
5b97092997 | ||
|
|
29e7701972 | ||
|
|
435b25e6a4 | ||
|
|
1deb31f1e0 | ||
|
|
c57c769dea | ||
|
|
b73be37916 | ||
|
|
4b876392a0 | ||
|
|
5f57727a4d | ||
|
|
6d36eca216 | ||
|
|
3fb557a7f5 | ||
|
|
e63924e377 | ||
|
|
3d02be98f7 | ||
|
|
89920e77f3 | ||
|
|
11ecb5a774 | ||
|
|
506b7f55e1 | ||
|
|
25f4242a87 | ||
|
|
609d94aa31 | ||
|
|
80c8e076fc | ||
|
|
d2992af8be | ||
|
|
e36d440a25 | ||
|
|
fa668707c0 | ||
|
|
c69f10d20d | ||
|
|
84778ca3e9 | ||
|
|
c4c003dab8 | ||
|
|
470b8aa5f1 | ||
|
|
5713760b7a | ||
|
|
f5c9ff709b | ||
|
|
f0bb270723 | ||
|
|
4c16877366 | ||
|
|
4342c18541 | ||
|
|
01e6414ddb | ||
|
|
2d18a68099 | ||
|
|
9e6427d150 | ||
|
|
3d4f4c6c38 | ||
|
|
0bcbc8bd3c | ||
|
|
2413dbefe3 | ||
|
|
db55e8efb0 | ||
|
|
5125263859 | ||
|
|
d938625785 | ||
|
|
eb32206940 | ||
|
|
8b9523e03c | ||
|
|
2f7ba9edac | ||
|
|
208a597d59 | ||
|
|
c4be833bc0 | ||
|
|
b5c1eb3137 | ||
|
|
edfe074e98 | ||
|
|
c09a4311fd | ||
|
|
09bbc18bb7 | ||
|
|
a78eb6e93d | ||
|
|
af1f6506d2 | ||
|
|
d2abc0d430 | ||
|
|
e32662169a | ||
|
|
c721fac466 | ||
|
|
7eaaf3021a | ||
|
|
f703fba1ef | ||
|
|
eae46a21a9 | ||
|
|
31df861ee2 | ||
|
|
e605bc4e07 | ||
|
|
7bf6ac2b69 | ||
|
|
01e39d9cd1 | ||
|
|
02d0ee9198 | ||
|
|
dd39c110a8 | ||
|
|
342defecd2 | ||
|
|
2b865c0694 | ||
|
|
7c3bd67639 | ||
|
|
ff8e39c69b | ||
|
|
f394e674f2 | ||
|
|
6bb5cac0c4 | ||
|
|
3e9a9f44bb | ||
|
|
aff95c33b2 | ||
|
|
653356011c | ||
|
|
fda68a72e5 | ||
|
|
ef74a1bb4d | ||
|
|
1a0f580f3d | ||
|
|
6d58d1a74d | ||
|
|
ce290f68db | ||
|
|
dcd615fdc3 | ||
|
|
18eb6e0df7 | ||
|
|
adc9fe722d | ||
|
|
0aea49f960 | ||
|
|
ba214709b9 | ||
|
|
bffa61e33f | ||
|
|
27ca417201 | ||
|
|
c4c9ef5fd7 | ||
|
|
b960598431 | ||
|
|
f234df97ec | ||
|
|
a3073ad824 | ||
|
|
ceaa613522 | ||
|
|
0317d5891c | ||
|
|
7485a5db04 | ||
|
|
0cfa462c37 | ||
|
|
6656b46f10 | ||
|
|
5b7fa63bf1 | ||
|
|
cdf3c29664 | ||
|
|
81ab777cbb | ||
|
|
61a867f086 | ||
|
|
8581a3300d | ||
|
|
ff5fa9f64f | ||
|
|
55e8a5cff1 | ||
|
|
1a39bd538d | ||
|
|
1049c1026b | ||
|
|
619ebb65ce | ||
|
|
d2d6810db9 | ||
|
|
39af1b75af | ||
|
|
d569e9e29c | ||
|
|
381011a4a8 | ||
|
|
f0437eba19 | ||
|
|
d8edaad99c | ||
|
|
7fd0bf974d | ||
|
|
6a5cfbc159 | ||
|
|
8c3c97f7a6 | ||
|
|
c3a1c26be1 | ||
|
|
c87a6a0f2c | ||
|
|
350ffdce29 | ||
|
|
28c073056c | ||
|
|
c86846ac19 | ||
|
|
7ab0ea581d | ||
|
|
bcce627387 | ||
|
|
175af8032f | ||
|
|
1784bed4ac | ||
|
|
a46aa641f9 | ||
|
|
7069202190 | ||
|
|
df7a8eebcf | ||
|
|
2862c1cf56 | ||
|
|
d67223de9b | ||
|
|
da8886be85 | ||
|
|
040a199685 | ||
|
|
6c3c971af5 | ||
|
|
852c59efbb |
23
.github/codex/pr-review.prompt.md
vendored
Normal file
23
.github/codex/pr-review.prompt.md
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
You are reviewing a GitHub pull request for this repository.
|
||||
|
||||
Review policy:
|
||||
- Read `CLAUDE.md` before reviewing code.
|
||||
- Only report issues you are confident are real and introduced by this pull request.
|
||||
- Focus on bugs, security problems, and clear `CLAUDE.md` violations.
|
||||
- Do not report style nits, speculative concerns, pre-existing issues, or problems that a normal linter/typechecker would obviously catch.
|
||||
- Keep the review high signal. If there is no clear issue, return no findings.
|
||||
|
||||
Repository context:
|
||||
- Read `./.github/codex/pr-review-context.md` for the PR metadata and the exact diff commands to use.
|
||||
- Review only the changes introduced by this PR.
|
||||
- Read additional files only when the diff is not enough to validate a finding.
|
||||
- Do not modify any files.
|
||||
|
||||
Output requirements:
|
||||
- Return a GitHub PR comment in markdown, not JSON.
|
||||
- Start with `## Codex Review`.
|
||||
- Give a short overall summary first.
|
||||
- If you found high-signal issues, list them in a short numbered list with file paths and line numbers when you know them confidently.
|
||||
- If you found no high-signal issues, say that explicitly.
|
||||
- End with a `### Reproduction instructions` section containing a short descriptive paragraph for a tester explaining how to navigate the app to observe the change. Do not make it a numbered list. If the diff is not enough to infer this safely, say that plainly.
|
||||
- Prefer at most 10 findings.
|
||||
145
.github/workflows/codex-pr-review.yml
vendored
Normal file
145
.github/workflows/codex-pr-review.yml
vendored
Normal file
@@ -0,0 +1,145 @@
|
||||
name: Codex Auto Review
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [ready_for_review, opened]
|
||||
|
||||
concurrency:
|
||||
group: codex-review-${{ github.event.pull_request.number }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
codex-review:
|
||||
runs-on: ubicloud-standard-2
|
||||
timeout-minutes: 30
|
||||
if: github.event.pull_request.draft == false && github.event.pull_request.head.repo.fork == false
|
||||
permissions:
|
||||
contents: read
|
||||
issues: write
|
||||
steps:
|
||||
- name: Check Codex configuration
|
||||
id: codex_config
|
||||
env:
|
||||
CODEX_AUTH_JSON: ${{ secrets.CODEX_AUTH_JSON }}
|
||||
run: |
|
||||
if [ -n "$CODEX_AUTH_JSON" ]; then
|
||||
echo "enabled=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "enabled=false" >> "$GITHUB_OUTPUT"
|
||||
echo "CODEX_AUTH_JSON is not configured; skipping Codex review."
|
||||
fi
|
||||
|
||||
- name: Checkout repository
|
||||
if: steps.codex_config.outputs.enabled == 'true'
|
||||
uses: actions/checkout@v5
|
||||
with:
|
||||
ref: refs/pull/${{ github.event.pull_request.number }}/merge
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Set up Node.js
|
||||
if: steps.codex_config.outputs.enabled == 'true'
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 22
|
||||
|
||||
- name: Install Codex CLI
|
||||
if: steps.codex_config.outputs.enabled == 'true'
|
||||
run: npm install --global @openai/codex@0.117.0
|
||||
|
||||
- name: Configure file-backed Codex auth
|
||||
if: steps.codex_config.outputs.enabled == 'true'
|
||||
env:
|
||||
CODEX_AUTH_JSON: ${{ secrets.CODEX_AUTH_JSON }}
|
||||
run: |
|
||||
CODEX_HOME="$HOME/.codex"
|
||||
echo "CODEX_HOME=$CODEX_HOME" >> "$GITHUB_ENV"
|
||||
mkdir -p "$CODEX_HOME"
|
||||
chmod 700 "$CODEX_HOME"
|
||||
cat > "$CODEX_HOME/config.toml" <<'EOF'
|
||||
cli_auth_credentials_store = "file"
|
||||
EOF
|
||||
printf '%s' "$CODEX_AUTH_JSON" > "$CODEX_HOME/auth.json"
|
||||
chmod 600 "$CODEX_HOME/auth.json"
|
||||
node -e 'JSON.parse(require("fs").readFileSync(process.argv[1], "utf8"))' "$CODEX_HOME/auth.json"
|
||||
|
||||
- name: Pre-fetch base and head refs for the PR
|
||||
if: steps.codex_config.outputs.enabled == 'true'
|
||||
env:
|
||||
PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
run: |
|
||||
git fetch --no-tags origin \
|
||||
"$PR_BASE_REF" \
|
||||
"+refs/pull/$PR_NUMBER/head"
|
||||
|
||||
- name: Write Codex review context
|
||||
if: steps.codex_config.outputs.enabled == 'true'
|
||||
env:
|
||||
PR_REPOSITORY: ${{ github.repository }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
||||
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
|
||||
PR_TITLE: ${{ github.event.pull_request.title }}
|
||||
PR_BODY: ${{ github.event.pull_request.body || '' }}
|
||||
run: |
|
||||
mkdir -p .github/codex
|
||||
node <<'NODE'
|
||||
const fs = require('fs');
|
||||
const lines = [
|
||||
`Repository: ${process.env.PR_REPOSITORY}`,
|
||||
`PR number: ${process.env.PR_NUMBER}`,
|
||||
`Base SHA: ${process.env.PR_BASE_SHA}`,
|
||||
`Head SHA: ${process.env.PR_HEAD_SHA}`,
|
||||
'',
|
||||
'PR title:',
|
||||
process.env.PR_TITLE || '(empty)',
|
||||
'',
|
||||
'PR body:',
|
||||
process.env.PR_BODY || '(empty)',
|
||||
'',
|
||||
'Changed commits command:',
|
||||
`git log --oneline ${process.env.PR_BASE_SHA}...${process.env.PR_HEAD_SHA}`,
|
||||
'',
|
||||
'Changed files command:',
|
||||
`git diff --stat ${process.env.PR_BASE_SHA}...${process.env.PR_HEAD_SHA}`,
|
||||
'',
|
||||
'Full review diff command:',
|
||||
`git diff --unified=0 ${process.env.PR_BASE_SHA}...${process.env.PR_HEAD_SHA}`
|
||||
];
|
||||
fs.writeFileSync('.github/codex/pr-review-context.md', `${lines.join('\n')}\n`);
|
||||
NODE
|
||||
|
||||
- name: Run Codex review
|
||||
if: steps.codex_config.outputs.enabled == 'true'
|
||||
run: |
|
||||
codex exec \
|
||||
-C "$GITHUB_WORKSPACE" \
|
||||
-m gpt-5.4 \
|
||||
-c 'model_reasoning_effort="xhigh"' \
|
||||
-s read-only \
|
||||
-o codex-final-message.md \
|
||||
- < .github/codex/pr-review.prompt.md
|
||||
|
||||
- name: Post Codex review comment
|
||||
if: steps.codex_config.outputs.enabled == 'true'
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
github-token: ${{ github.token }}
|
||||
script: |
|
||||
const fs = require('fs');
|
||||
const path = `${process.env.GITHUB_WORKSPACE}/codex-final-message.md`;
|
||||
if (!fs.existsSync(path)) {
|
||||
core.info('Codex did not produce a final message; skipping PR comment.');
|
||||
return;
|
||||
}
|
||||
const body = fs.readFileSync(path, 'utf8').trim();
|
||||
if (!body) {
|
||||
core.info('Codex final message was empty; skipping PR comment.');
|
||||
return;
|
||||
}
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.payload.pull_request.number,
|
||||
body,
|
||||
});
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -25,7 +25,10 @@ rust-client/Cargo.toml
|
||||
backend/target
|
||||
frontend/node_modules
|
||||
typescript-client/node_modules
|
||||
ai_evals/node_modules
|
||||
ai_evals/results/
|
||||
frontend/.svelte-kit
|
||||
backend/chrome_profiler.json
|
||||
.fast-check/
|
||||
__pycache__/
|
||||
.playwright-mcp/
|
||||
|
||||
@@ -43,7 +43,7 @@ profiles:
|
||||
- Pane 0: this pane (claude agent)
|
||||
- Pane 1: backend (cargo watch -x run)
|
||||
- Pane 2: frontend (npm run dev)
|
||||
To check logs, use: \`tmux capture-pane -t .1 -p -S -50\` (backend) or \`tmux capture-pane -t .2 -p -S -50\` (frontend).
|
||||
To check logs, use: \`tmux capture-pane -t $(tmux display-message -t "$TMUX_PANE" -p '#{session_name}:#{window_name}').1 -p -S -50\` (backend) or \`tmux capture-pane -t $(tmux display-message -t "$TMUX_PANE" -p '#{session_name}:#{window_name}').2 -p -S -50\` (frontend).
|
||||
For this window specifically, backend is running on: ${BACKEND_PORT} and frontend is running on: ${FRONTEND_PORT}.
|
||||
To connect to the database, use this connection string: ${DATABASE_URL}
|
||||
Because we are running backend with cargo watch, to verify your changes, just check the logs in the backend pane. No need for cargo check.
|
||||
@@ -72,7 +72,7 @@ profiles:
|
||||
Pane layout (current window):
|
||||
- Pane 0: this pane (claude agent)
|
||||
- Pane 1: frontend (npm run dev)
|
||||
To check logs, use: \`tmux capture-pane -t .1 -p -S -50\` (frontend).
|
||||
To check logs, use: \`tmux capture-pane -t $(tmux display-message -t "$TMUX_PANE" -p '#{session_name}:#{window_name}').1 -p -S -50\` (frontend).
|
||||
On this window specifically, frontend is running on: ${FRONTEND_PORT}.
|
||||
To connect to the database, use this connection string: ${DATABASE_URL}
|
||||
Because we are running frontend with npm run dev, to verify your changes, just check the logs in the frontend pane. No need for npm run build.
|
||||
|
||||
298
CHANGELOG.md
298
CHANGELOG.md
@@ -1,5 +1,303 @@
|
||||
# Changelog
|
||||
|
||||
## [1.684.1](https://github.com/windmill-labs/windmill/compare/v1.684.0...v1.684.1) (2026-04-14)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* stop escalating missing email recipients to critical alert ([#8833](https://github.com/windmill-labs/windmill/issues/8833)) ([6158ff2](https://github.com/windmill-labs/windmill/commit/6158ff2ebe29d6a9a7ff4d524e152bb2f7c24dfc))
|
||||
|
||||
## [1.684.0](https://github.com/windmill-labs/windmill/compare/v1.683.2...v1.684.0) (2026-04-14)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* cascade trigger script_path on runnable rename + fix trigger permissioned_as ([#8823](https://github.com/windmill-labs/windmill/issues/8823)) ([64ba3a6](https://github.com/windmill-labs/windmill/commit/64ba3a632eee041d09093e89961f63f2a090fcad))
|
||||
* **frontend:** improve permissions drawer UX and auto-share resource variables ([#8824](https://github.com/windmill-labs/windmill/issues/8824)) ([91064ce](https://github.com/windmill-labs/windmill/commit/91064ce85712b85e32e3f8cff2a0794cd5597ed6))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* allow dedicated flow substeps to inherit parent tag ([#8832](https://github.com/windmill-labs/windmill/issues/8832)) ([aebf758](https://github.com/windmill-labs/windmill/commit/aebf758412383dd65e0bf6c72de8f2668561cd88))
|
||||
* compute wall-clock duration for flow job groups in CLI ([#8826](https://github.com/windmill-labs/windmill/issues/8826)) ([e1dbce0](https://github.com/windmill-labs/windmill/commit/e1dbce02c22bcaa3d7d447ee54db69373bc1cf7b))
|
||||
* DB Manager delete/update for timestamp and serial types ([#8830](https://github.com/windmill-labs/windmill/issues/8830)) ([06fe809](https://github.com/windmill-labs/windmill/commit/06fe809ecc3c6b37af7582175f9dd90c2c2a8f98))
|
||||
* hide serial types in column type dropdown for existing columns ([#8828](https://github.com/windmill-labs/windmill/issues/8828)) ([7fe639d](https://github.com/windmill-labs/windmill/commit/7fe639d91e93a6b3069e0d87b57c232d67c8ad65))
|
||||
|
||||
## [1.683.2](https://github.com/windmill-labs/windmill/compare/v1.683.1...v1.683.2) (2026-04-14)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* detect WAC v2 Python workflows that only use step() (no `@task`) ([#8819](https://github.com/windmill-labs/windmill/issues/8819)) ([89c8e4b](https://github.com/windmill-labs/windmill/commit/89c8e4bb9680c179bf44a66a22dcf047334944ae))
|
||||
* persist indexer max_index_time_window_secs setting ([#8821](https://github.com/windmill-labs/windmill/issues/8821)) ([4dc54ca](https://github.com/windmill-labs/windmill/commit/4dc54ca3aa14beab175da59eb8b9072918301b43))
|
||||
|
||||
## [1.683.1](https://github.com/windmill-labs/windmill/compare/v1.683.0...v1.683.1) (2026-04-13)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* use OpenAPI 3.0 nullable pattern for getOpenDeploymentRequest ([#8816](https://github.com/windmill-labs/windmill/issues/8816)) ([f7f26b3](https://github.com/windmill-labs/windmill/commit/f7f26b32244536b6efb7c1b5aafd4a7644dcb42f))
|
||||
|
||||
## [1.683.0](https://github.com/windmill-labs/windmill/compare/v1.682.0...v1.683.0) (2026-04-13)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add black-box ai eval benchmarks ([#8618](https://github.com/windmill-labs/windmill/issues/8618)) ([cdcc564](https://github.com/windmill-labs/windmill/commit/cdcc56461b77554964622f490ae901f170886595))
|
||||
* add deploy restriction rule and fork review requests ([#8804](https://github.com/windmill-labs/windmill/issues/8804)) ([64c58c8](https://github.com/windmill-labs/windmill/commit/64c58c824fcefe00f15405b7e3877eb566a3ffa2))
|
||||
* allow non-admins to create and edit HTTP triggers ([#8810](https://github.com/windmill-labs/windmill/issues/8810)) ([9fb7816](https://github.com/windmill-labs/windmill/commit/9fb78164b4baa14c10d10f91ae969d48590c29f3))
|
||||
* display agent message in flow graph ([#8806](https://github.com/windmill-labs/windmill/issues/8806)) ([95411b2](https://github.com/windmill-labs/windmill/commit/95411b256332fa41816a93b19906f1534da9b300))
|
||||
* folder default_permissioned_as rules for ownership defaults on deploy ([#8801](https://github.com/windmill-labs/windmill/issues/8801)) ([60211c1](https://github.com/windmill-labs/windmill/commit/60211c1d1910b5f7ac6fed112f790201d2047a4c))
|
||||
* instance-level ruff config auto-pulled by LSP container ([#8803](https://github.com/windmill-labs/windmill/issues/8803)) ([3f5841f](https://github.com/windmill-labs/windmill/commit/3f5841f84d878cd3f43c435fa237d3f0c2265fb9))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* **cli:** make cli help resilient to npm registry fetch failures ([#8809](https://github.com/windmill-labs/windmill/issues/8809)) ([b6f1cc7](https://github.com/windmill-labs/windmill/commit/b6f1cc70cd87c61df7112d3838fbb5fe9bcdc145))
|
||||
* enrich OTEL log records with per-request LogContext ([#8812](https://github.com/windmill-labs/windmill/issues/8812)) ([42d3e8c](https://github.com/windmill-labs/windmill/commit/42d3e8c7893cd959c7faffd19cd210c869c604f8))
|
||||
* silence user-facing toast for non-critical hub script tracking error ([#8808](https://github.com/windmill-labs/windmill/issues/8808)) ([378ba78](https://github.com/windmill-labs/windmill/commit/378ba7828456c871b5778f1144c4bb559bd5a733))
|
||||
|
||||
|
||||
### Performance Improvements
|
||||
|
||||
* add inline-persist fast path for WAC v2 step() ([#8807](https://github.com/windmill-labs/windmill/issues/8807)) ([b3ef4bc](https://github.com/windmill-labs/windmill/commit/b3ef4bc26c5696624efee89b5e4e33e77e10cf15))
|
||||
|
||||
## [1.682.0](https://github.com/windmill-labs/windmill/compare/v1.681.0...v1.682.0) (2026-04-10)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* enrich hanging flow error with worker and service log info ([#8800](https://github.com/windmill-labs/windmill/issues/8800)) ([59c457a](https://github.com/windmill-labs/windmill/commit/59c457a13881e35c229baed3edd87e618f89b9a0))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* bypass OTEL MITM tracing proxy for git sync jobs ([#8796](https://github.com/windmill-labs/windmill/issues/8796)) ([9c85565](https://github.com/windmill-labs/windmill/commit/9c855652212dbac0e49f87dedd447d3d7d7b500a))
|
||||
* show full path on hover in deploy drawer and widen drawer ([#8799](https://github.com/windmill-labs/windmill/issues/8799)) ([b783bf2](https://github.com/windmill-labs/windmill/commit/b783bf2d835cde0843739f7d1099193bb0af042e))
|
||||
|
||||
## [1.681.0](https://github.com/windmill-labs/windmill/compare/v1.680.0...v1.681.0) (2026-04-10)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add CI test scripts with auto-trigger on deploy ([#8736](https://github.com/windmill-labs/windmill/issues/8736)) ([c57c769](https://github.com/windmill-labs/windmill/commit/c57c769deaa207e7ba7995f75649d3630774e898))
|
||||
* add edit yaml button to raw app settings ([#8771](https://github.com/windmill-labs/windmill/issues/8771)) ([b73be37](https://github.com/windmill-labs/windmill/commit/b73be37916de808dc64bec1337edf6e7d3993c5e))
|
||||
* add user offboarding flow with object reassignment ([#8647](https://github.com/windmill-labs/windmill/issues/8647)) ([435b25e](https://github.com/windmill-labs/windmill/commit/435b25e6a4c7272c0189cbcfb83526379f41ebf0))
|
||||
* allow selecting hub flows as raw app backend runnables ([#8772](https://github.com/windmill-labs/windmill/issues/8772)) ([5f57727](https://github.com/windmill-labs/windmill/commit/5f57727a4d956a9066b005b3c55f08dd6780475a))
|
||||
* list external JWT tokens in instance settings ([#8783](https://github.com/windmill-labs/windmill/issues/8783)) ([ce3e676](https://github.com/windmill-labs/windmill/commit/ce3e676f4ab0c442058c64db4ebf35545a805ef5))
|
||||
* oauth manual connect option ([#8770](https://github.com/windmill-labs/windmill/issues/8770)) ([4b87639](https://github.com/windmill-labs/windmill/commit/4b876392a0ce41ae42bd882ced10fe0187e532bc))
|
||||
* unify CLI config to workspaces, deprecate gitBranches/environments ([#8767](https://github.com/windmill-labs/windmill/issues/8767)) ([5b97092](https://github.com/windmill-labs/windmill/commit/5b9709299761b83a88df17a4259c431dfcd244f9))
|
||||
* **vault:** add skip_ssl_verify option for HashiCorp Vault ([#8791](https://github.com/windmill-labs/windmill/issues/8791)) ([6cf7ffc](https://github.com/windmill-labs/windmill/commit/6cf7ffc26bcbc8f4ef0e4ad2879fcd114332c4e2))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* bypass sql type injection during formatting to prevent offset corruption ([#8786](https://github.com/windmill-labs/windmill/issues/8786)) ([8957d8f](https://github.com/windmill-labs/windmill/commit/8957d8f19bce3430871c2858b3accd53e0be178f))
|
||||
* CLI falls back to workspace whoami for workspace-scoped tokens ([#8789](https://github.com/windmill-labs/windmill/issues/8789)) ([d243eb3](https://github.com/windmill-labs/windmill/commit/d243eb31b014781a249f903b2a467aa58909ddd6))
|
||||
* disable scroll-to-change-number on number inputs ([#8777](https://github.com/windmill-labs/windmill/issues/8777)) ([e63924e](https://github.com/windmill-labs/windmill/commit/e63924e3778b40486813192dc2913e565e0a765e))
|
||||
* error on flow/app folder suffix format mismatch during sync push/pull ([#8775](https://github.com/windmill-labs/windmill/issues/8775)) ([1deb31f](https://github.com/windmill-labs/windmill/commit/1deb31f1e01d6168eee3c2cc242cb483272d1965))
|
||||
* flow dev page layout and compact toolbar improvements ([#8776](https://github.com/windmill-labs/windmill/issues/8776)) ([89920e7](https://github.com/windmill-labs/windmill/commit/89920e77f3f5dc45db939ec938d92c881dccc8a0))
|
||||
* Flow status viewer layout nits (avoid excess y space and scroll) ([#8780](https://github.com/windmill-labs/windmill/issues/8780)) ([6d36eca](https://github.com/windmill-labs/windmill/commit/6d36eca21684f9d3ab36658c2b66f85b9be8d331))
|
||||
* flow step testing UX improvements ([#8781](https://github.com/windmill-labs/windmill/issues/8781)) ([3fb557a](https://github.com/windmill-labs/windmill/commit/3fb557a7f51dbbd3fac445734196f1b9a1d2e287))
|
||||
* hide legacy global_settings.worker_configs ghost row ([#8790](https://github.com/windmill-labs/windmill/issues/8790)) ([4fff89f](https://github.com/windmill-labs/windmill/commit/4fff89f98ce72997a055cc313c8fe217d2f1fe78))
|
||||
* limit multi-runnable dedicated workers to one job at a time ([#8782](https://github.com/windmill-labs/windmill/issues/8782)) ([946848f](https://github.com/windmill-labs/windmill/commit/946848feef60aba2a54bc2f5b686b33cc96ec9ef))
|
||||
* normalize multi-word pg types in build_parameters to fix float8 serialization ([#8778](https://github.com/windmill-labs/windmill/issues/8778)) ([3d02be9](https://github.com/windmill-labs/windmill/commit/3d02be98f748d985f688243f3215d15ca4227f8f))
|
||||
* refresh custom instance user password if auth failed ([#8787](https://github.com/windmill-labs/windmill/issues/8787)) ([3d43d31](https://github.com/windmill-labs/windmill/commit/3d43d31aba276f23903f16f06035a4c4955b52e2))
|
||||
* treat empty global setting strings as unset ([#8793](https://github.com/windmill-labs/windmill/issues/8793)) ([ec9cec1](https://github.com/windmill-labs/windmill/commit/ec9cec1d02d87328db92a71a1b3a945e9e0c6bd2))
|
||||
* zero-downtime coordinated restarts for OTEL and other setting changes ([#8768](https://github.com/windmill-labs/windmill/issues/8768)) ([506b7f5](https://github.com/windmill-labs/windmill/commit/506b7f55e17472d1384e9676c1b6df7a9d7a118b))
|
||||
|
||||
## [1.680.0](https://github.com/windmill-labs/windmill/compare/v1.679.0...v1.680.0) (2026-04-08)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add CLI workspace merge command and enhance fork with datatable/color support ([#8756](https://github.com/windmill-labs/windmill/issues/8756)) ([4342c18](https://github.com/windmill-labs/windmill/commit/4342c1854134500d3b2bc46280f9885ee84e2c9e))
|
||||
* add scheduled job deletion with configurable retention period ([#8753](https://github.com/windmill-labs/windmill/issues/8753)) ([2d18a68](https://github.com/windmill-labs/windmill/commit/2d18a680991babe317ca315bbce40e6ce733afda))
|
||||
* add status indicator dots to parallel loop iteration picker ([#8761](https://github.com/windmill-labs/windmill/issues/8761)) ([470b8aa](https://github.com/windmill-labs/windmill/commit/470b8aa5f1870e26fea022c1e2a9f48471d8a205))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* move alert config from config table to global_settings ([#8762](https://github.com/windmill-labs/windmill/issues/8762)) ([fa66870](https://github.com/windmill-labs/windmill/commit/fa668707c0ee7f261d78e145666b1073471259fd))
|
||||
* resolve esbuild host/binary version mismatch in app sync push ([#8765](https://github.com/windmill-labs/windmill/issues/8765)) ([e36d440](https://github.com/windmill-labs/windmill/commit/e36d440a251a43ea888e3ce378d0bb8ed8f42e11))
|
||||
* skip serializing ws_specific on resources when false ([#8764](https://github.com/windmill-labs/windmill/issues/8764)) ([c69f10d](https://github.com/windmill-labs/windmill/commit/c69f10d20dd064f0c329934096c2945424ff81f2))
|
||||
|
||||
## [1.679.0](https://github.com/windmill-labs/windmill/compare/v1.678.0...v1.679.0) (2026-04-07)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* Fork datatables ([#8339](https://github.com/windmill-labs/windmill/issues/8339)) ([3d4f4c6](https://github.com/windmill-labs/windmill/commit/3d4f4c6c38155396e9b2236a6a7a7ad4e02da877))
|
||||
|
||||
## [1.678.0](https://github.com/windmill-labs/windmill/compare/v1.677.0...v1.678.0) (2026-04-07)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* accept any content type on webhooks/http triggers with fallback ([#8743](https://github.com/windmill-labs/windmill/issues/8743)) ([208a597](https://github.com/windmill-labs/windmill/commit/208a597d599b4d203f7ab817a5d8ce2c06f79d0a))
|
||||
* add download all logs button for flow jobs ([#8748](https://github.com/windmill-labs/windmill/issues/8748)) ([d938625](https://github.com/windmill-labs/windmill/commit/d938625785ba301fbd2c5f3d001c320eab1c504c))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* delete raw_script_temp rows before workspace deletion to avoid FK violation ([#8752](https://github.com/windmill-labs/windmill/issues/8752)) ([8b9523e](https://github.com/windmill-labs/windmill/commit/8b9523e03c82c5a095b7cb2d5f70a87b7bbc8608))
|
||||
* Fix FlowTimeline duplicate key ([#8754](https://github.com/windmill-labs/windmill/issues/8754)) ([2413dbe](https://github.com/windmill-labs/windmill/commit/2413dbefe3cc3b65c28bea437cd4471cf7e9ecba))
|
||||
* remove span.enter() in dedicated worker to prevent tracing panic ([#8749](https://github.com/windmill-labs/windmill/issues/8749)) ([db55e8e](https://github.com/windmill-labs/windmill/commit/db55e8efb0c9ae198ca5ac7013439a94dfe9f550))
|
||||
* restore ai agent tool deletion ([#8744](https://github.com/windmill-labs/windmill/issues/8744)) ([2f7ba9e](https://github.com/windmill-labs/windmill/commit/2f7ba9edac1a57dfc0eb3417574c72292855fc56))
|
||||
|
||||
## [1.677.0](https://github.com/windmill-labs/windmill/compare/v1.676.0...v1.677.0) (2026-04-06)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add AWS Secrets Manager as secret storage backend (Beta) ([#8734](https://github.com/windmill-labs/windmill/issues/8734)) ([09bbc18](https://github.com/windmill-labs/windmill/commit/09bbc18bb773d9ffaa5aaa4bd9d7ce296f3ac468))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* remove stale KMS openapi/description, restore stripped doc comments ([c09a431](https://github.com/windmill-labs/windmill/commit/c09a4311fd73c58acc8f3997428f002598dacce6))
|
||||
* use runnable key for file naming in generate-metadata to prevent duplicate scripts in raw apps ([#8740](https://github.com/windmill-labs/windmill/issues/8740)) ([edfe074](https://github.com/windmill-labs/windmill/commit/edfe074e98cb3955be0768de7ed19e6ed8525916))
|
||||
|
||||
## [1.676.0](https://github.com/windmill-labs/windmill/compare/v1.675.1...v1.676.0) (2026-04-06)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add path name autocomplete with ghost text and folder cycling ([#8731](https://github.com/windmill-labs/windmill/issues/8731)) ([e326621](https://github.com/windmill-labs/windmill/commit/e32662169a9762605de2dbe058514ddefbe07982))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* fix custom urls not found ([d2abc0d](https://github.com/windmill-labs/windmill/commit/d2abc0d4300bb53f4035102f214d3c05bf0976a1))
|
||||
|
||||
|
||||
### Performance Improvements
|
||||
|
||||
* add partial index for expired cache resource cleanup ([#8728](https://github.com/windmill-labs/windmill/issues/8728)) ([c721fac](https://github.com/windmill-labs/windmill/commit/c721fac466524747de04e3623c8cd62de8bd4dae))
|
||||
|
||||
## [1.675.1](https://github.com/windmill-labs/windmill/compare/v1.675.0...v1.675.1) (2026-04-05)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* log cleanup scans S3 orphans and works cross-server ([#8729](https://github.com/windmill-labs/windmill/issues/8729)) ([f703fba](https://github.com/windmill-labs/windmill/commit/f703fba1ef56c89a97b2b4da7b4c188158f4c982))
|
||||
|
||||
|
||||
### Performance Improvements
|
||||
|
||||
* add indexes for cleanup deletes on concurrency_key and autoscaling_event ([#8726](https://github.com/windmill-labs/windmill/issues/8726)) ([eae46a2](https://github.com/windmill-labs/windmill/commit/eae46a21a93fe7ab191228658dd5825f472bd851))
|
||||
|
||||
## [1.675.0](https://github.com/windmill-labs/windmill/compare/v1.674.2...v1.675.0) (2026-04-05)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add object storage usage view and manual log cleanup ([#8724](https://github.com/windmill-labs/windmill/issues/8724)) ([02d0ee9](https://github.com/windmill-labs/windmill/commit/02d0ee919880823a33b112bcaf626a8933e1f715))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* add admin check to count_completed_jobs_detail and document query builder SQL safety ([#8722](https://github.com/windmill-labs/windmill/issues/8722)) ([dd39c11](https://github.com/windmill-labs/windmill/commit/dd39c110a8468bf31d42428fc978cd302426fa86))
|
||||
* allow private AI base URLs in ai_proxy integration test ([#8715](https://github.com/windmill-labs/windmill/issues/8715)) ([2b865c0](https://github.com/windmill-labs/windmill/commit/2b865c0694d79ce6477e5f14a077b73837007500))
|
||||
* enrich OTEL spans with job_kind, trigger_kind, trigger, created_by, and script_hash ([#8718](https://github.com/windmill-labs/windmill/issues/8718)) ([7bf6ac2](https://github.com/windmill-labs/windmill/commit/7bf6ac2b694fc829327248ff2480c20c97e03e48))
|
||||
* split DB health endpoint and add slow query controls ([#8725](https://github.com/windmill-labs/windmill/issues/8725)) ([01e39d9](https://github.com/windmill-labs/windmill/commit/01e39d9cd1b841d085bcc28a578654a5486cf76e))
|
||||
|
||||
## [1.674.2](https://github.com/windmill-labs/windmill/compare/v1.674.1...v1.674.2) (2026-04-04)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* enforce RLS on $var: resolution in AI proxy (GHSA-jwg4-v3cj-rvfm) ([#8713](https://github.com/windmill-labs/windmill/issues/8713)) ([ff8e39c](https://github.com/windmill-labs/windmill/commit/ff8e39c69b1438defcaabd9d4906e7adafa7010c))
|
||||
* SSRF via X-Resource-Path header in AI proxy endpoint ([#8712](https://github.com/windmill-labs/windmill/issues/8712)) ([f394e67](https://github.com/windmill-labs/windmill/commit/f394e674f22af13bb77915f33aa1e8de402b6fe1))
|
||||
|
||||
## [1.674.1](https://github.com/windmill-labs/windmill/compare/v1.674.0...v1.674.1) (2026-04-04)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* create pg connection for cloud-hosted jobs instead of panicking ([#8710](https://github.com/windmill-labs/windmill/issues/8710)) ([aff95c3](https://github.com/windmill-labs/windmill/commit/aff95c33b2fd4c248dfaf595b8d18a6dbc50f0e6))
|
||||
|
||||
## [1.674.0](https://github.com/windmill-labs/windmill/compare/v1.673.0...v1.674.0) (2026-04-03)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add application-level heartbeat support for websocket triggers ([#8686](https://github.com/windmill-labs/windmill/issues/8686)) ([5b7fa63](https://github.com/windmill-labs/windmill/commit/5b7fa63bf1800313e9b82465b8a4399a48634371))
|
||||
* add Azure Key Vault as secret storage backend ([#8704](https://github.com/windmill-labs/windmill/issues/8704)) ([dcd615f](https://github.com/windmill-labs/windmill/commit/dcd615fdc3c66ec2a8e39c01f8142a7e7c82f534))
|
||||
* add http/protobuf support for OTEL exporters ([#8702](https://github.com/windmill-labs/windmill/issues/8702)) ([0aea49f](https://github.com/windmill-labs/windmill/commit/0aea49f9607d5cbb5bcfa3068a179c9b7bf9afd6))
|
||||
* add optional labels to scripts, flows, apps, schedules, triggers ([#8609](https://github.com/windmill-labs/windmill/issues/8609)) ([c4c9ef5](https://github.com/windmill-labs/windmill/commit/c4c9ef5fd7b41052b08ee941725434e8ca4ac970))
|
||||
* add powershell common parameters support ([#8683](https://github.com/windmill-labs/windmill/issues/8683)) ([0317d58](https://github.com/windmill-labs/windmill/commit/0317d5891cfcfbde7b04795c034c088e933ee3d0))
|
||||
* sql.raw in Typescript client ([#8706](https://github.com/windmill-labs/windmill/issues/8706)) ([ce290f6](https://github.com/windmill-labs/windmill/commit/ce290f68db866c07b30c97c2c0b3e39fee0a26d8))
|
||||
* Support .ducklake() and .datatable() in agent workers ([#8697](https://github.com/windmill-labs/windmill/issues/8697)) ([fda68a7](https://github.com/windmill-labs/windmill/commit/fda68a72e5dfcded2350d1ff33ca4c695ab337b7))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* add secretKeyRef support for jwt_secret and rsa_keys ([#8698](https://github.com/windmill-labs/windmill/issues/8698)) ([ba21470](https://github.com/windmill-labs/windmill/commit/ba214709b94f9467738e66b016331e97ac7d5d10))
|
||||
* align script push metadata warning with generated locks ([#8690](https://github.com/windmill-labs/windmill/issues/8690)) ([6656b46](https://github.com/windmill-labs/windmill/commit/6656b46f10408e1c15961a72cde4c13b5c5b3923))
|
||||
* debounce S3 proxy logs ([#8694](https://github.com/windmill-labs/windmill/issues/8694)) ([a3073ad](https://github.com/windmill-labs/windmill/commit/a3073ad8244efd9043e27f6731f7b53dbda662c1))
|
||||
* dedicated worker dispatch, cross-workspace deps, UI improvements ([#8689](https://github.com/windmill-labs/windmill/issues/8689)) ([bffa61e](https://github.com/windmill-labs/windmill/commit/bffa61e33f2305bbeb79a2c91989a47baa7dff31))
|
||||
* gate relock_skip tests on private feature and update ee-repo-ref ([#8703](https://github.com/windmill-labs/windmill/issues/8703)) ([adc9fe7](https://github.com/windmill-labs/windmill/commit/adc9fe722d8511a5914d81faac40af757e7f5e3f))
|
||||
* hide deprecated cli metadata commands ([#8699](https://github.com/windmill-labs/windmill/issues/8699)) ([b960598](https://github.com/windmill-labs/windmill/commit/b96059843168c072f24072f93fecd80431e5d4cf))
|
||||
* optimize S3 proxy performance ([#8685](https://github.com/windmill-labs/windmill/issues/8685)) ([0cfa462](https://github.com/windmill-labs/windmill/commit/0cfa462c379e887fdb5ad5e3bbff7798648d4e91))
|
||||
* pipeline DISCARD ALL with first query on cached pg connections ([#8707](https://github.com/windmill-labs/windmill/issues/8707)) ([6d58d1a](https://github.com/windmill-labs/windmill/commit/6d58d1a74d1e69b163210a795502a7b3931001b5))
|
||||
* resolve schedule update deadlock ([#8701](https://github.com/windmill-labs/windmill/issues/8701)) ([27ca417](https://github.com/windmill-labs/windmill/commit/27ca417201c99cf6fe0ae5b52a63c0395033e196))
|
||||
* support raw app deployment history ([#8657](https://github.com/windmill-labs/windmill/issues/8657)) ([f234df9](https://github.com/windmill-labs/windmill/commit/f234df97ec3cdc480ee9d403370a3512496b024b))
|
||||
* use pre-aggregated stats for telemetry job usage queries ([#8688](https://github.com/windmill-labs/windmill/issues/8688)) ([cdf3c29](https://github.com/windmill-labs/windmill/commit/cdf3c29664e4142c0f4487c07e585d1af3f97f91))
|
||||
|
||||
## [1.673.0](https://github.com/windmill-labs/windmill/compare/v1.672.0...v1.673.0) (2026-04-02)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add endpoint to restart workers in a worker group ([#8659](https://github.com/windmill-labs/windmill/issues/8659)) ([f0437eb](https://github.com/windmill-labs/windmill/commit/f0437eba1925a9aa4c430008027d637a0c89ee39))
|
||||
* add Entra ID (Azure Workload Identity) database auth ([#8526](https://github.com/windmill-labs/windmill/issues/8526)) ([6a5cfbc](https://github.com/windmill-labs/windmill/commit/6a5cfbc159a0ad7925fd7ce5eefc8eaa21bbb70b))
|
||||
* add LIMIT_WINDOWS_TO_1CU env var for Windows worker memory limits ([#8681](https://github.com/windmill-labs/windmill/issues/8681)) ([d2d6810](https://github.com/windmill-labs/windmill/commit/d2d6810db954114f3333853bd3476cb8fc735f92))
|
||||
* restore bun for dedicated workers, fix dispatch & serialization, cross-workspace deps ([#8645](https://github.com/windmill-labs/windmill/issues/8645)) ([619ebb6](https://github.com/windmill-labs/windmill/commit/619ebb65ce8dce8264add31c3147919802a8286a))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* add HMAC signature verification to Slack interactive callback endpoint ([#8611](https://github.com/windmill-labs/windmill/issues/8611)) ([55e8a5c](https://github.com/windmill-labs/windmill/commit/55e8a5cff1f185b1dbd332d37b877972efa1ed7d))
|
||||
* correct raw app flow inputs ([#8667](https://github.com/windmill-labs/windmill/issues/8667)) ([28c0730](https://github.com/windmill-labs/windmill/commit/28c073056c65d4ed1600e39679497e5af964347f))
|
||||
* pass selected language to AI agent when generating flow scripts ([#8680](https://github.com/windmill-labs/windmill/issues/8680)) ([381011a](https://github.com/windmill-labs/windmill/commit/381011a4a8e48454e9c146c64db502293e646b99))
|
||||
* poll for preview results to avoid undici headers timeout ([#8682](https://github.com/windmill-labs/windmill/issues/8682)) ([ff5fa9f](https://github.com/windmill-labs/windmill/commit/ff5fa9f64fe4aaf33e06b20f02373894b5df0f95))
|
||||
* pre-fix trigger edited_by for superadmins not in workspace ([#8669](https://github.com/windmill-labs/windmill/issues/8669)) ([350ffdc](https://github.com/windmill-labs/windmill/commit/350ffdce297ba5b84f9dd247eede6da0c6b0956c))
|
||||
* resolve race condition where flow sync push reverts to stale version ([#8673](https://github.com/windmill-labs/windmill/issues/8673)) ([d569e9e](https://github.com/windmill-labs/windmill/commit/d569e9e29c588243a90b1cd25f866efb0d178640))
|
||||
* respect disabled fields in JSON input mode ([#8663](https://github.com/windmill-labs/windmill/issues/8663)) ([7fd0bf9](https://github.com/windmill-labs/windmill/commit/7fd0bf974d2ba2644bb01dd5e9ddc84749e166f5))
|
||||
* Run typed pg queries in a single protocol conversation ([#8679](https://github.com/windmill-labs/windmill/issues/8679)) ([8581a33](https://github.com/windmill-labs/windmill/commit/8581a3300d056040b7e3ab77d629c74f034c9c97))
|
||||
* sanitize MCP tool schemas for JSON Schema draft 2020-12 compliance ([#8666](https://github.com/windmill-labs/windmill/issues/8666)) ([8c3c97f](https://github.com/windmill-labs/windmill/commit/8c3c97f7a670d47019cc666219f8187f48499672))
|
||||
* skip generate-metadata confirmation prompt in non-interactive CI ([#8678](https://github.com/windmill-labs/windmill/issues/8678)) ([39af1b7](https://github.com/windmill-labs/windmill/commit/39af1b75afc8458f85dec4fe51dfaed3d0cb000d))
|
||||
* strip f/ prefix from folder paths when deploying from workspace forks ([#8662](https://github.com/windmill-labs/windmill/issues/8662)) ([7ab0ea5](https://github.com/windmill-labs/windmill/commit/7ab0ea581d349fbfdb56d22cf9903a90efa045bb))
|
||||
* support branch-specific folder.meta.yaml in missing-meta check ([#8661](https://github.com/windmill-labs/windmill/issues/8661)) ([c87a6a0](https://github.com/windmill-labs/windmill/commit/c87a6a0f2c1346bf5e21f128d32d89bdca039243))
|
||||
* validate rd redirect on login with same rules as logout ([#8655](https://github.com/windmill-labs/windmill/issues/8655)) ([bcce627](https://github.com/windmill-labs/windmill/commit/bcce62738791a4e9b9f4dbc64731eef163230172))
|
||||
|
||||
## [1.672.0](https://github.com/windmill-labs/windmill/compare/v1.671.0...v1.672.0) (2026-04-01)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add R language support ([#8263](https://github.com/windmill-labs/windmill/issues/8263)) ([a46aa64](https://github.com/windmill-labs/windmill/commit/a46aa641f9d72809c52a0eb11a877a0f2d587c32))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* approval page freeze, stale state, and missing approval link ([#8653](https://github.com/windmill-labs/windmill/issues/8653)) ([7069202](https://github.com/windmill-labs/windmill/commit/70692021909443b86ed61fa621fe49f28742fb54))
|
||||
|
||||
## [1.671.0](https://github.com/windmill-labs/windmill/compare/v1.670.0...v1.671.0) (2026-03-31)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add configurable preview job tag override in default tags settings ([#8649](https://github.com/windmill-labs/windmill/issues/8649)) ([da8886b](https://github.com/windmill-labs/windmill/commit/da8886be8575dd925b6d24c55ab379bc6984c5f8))
|
||||
* improve CLI flow log streaming and job inspection ([#8644](https://github.com/windmill-labs/windmill/issues/8644)) ([6c3c971](https://github.com/windmill-labs/windmill/commit/6c3c971af5aa1362632ee0deeddf91b8bc47c853))
|
||||
* support hub flows in raw app runnables ([#8627](https://github.com/windmill-labs/windmill/issues/8627)) ([040a199](https://github.com/windmill-labs/windmill/commit/040a199685cea5c99c944bacb5584a381d6ec829))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* return default_args/enums in approval info and fix subflow resume buttons ([#8648](https://github.com/windmill-labs/windmill/issues/8648)) ([852c59e](https://github.com/windmill-labs/windmill/commit/852c59efbb04510e5e6f99919707effcf6769a2f))
|
||||
|
||||
## [1.670.0](https://github.com/windmill-labs/windmill/compare/v1.669.1...v1.670.0) (2026-03-31)
|
||||
|
||||
|
||||
|
||||
@@ -26,6 +26,7 @@ Open-source platform for internal tools, workflows, API integrations, background
|
||||
- **DB**: `psql postgres://postgres:changeme@localhost:5432/windmill`
|
||||
- **Login**: `admin@windmill.dev` / `changeme`
|
||||
- **Instance settings**: navigate to `/#superadmin-settings`
|
||||
- **Migrations**: use `cargo sqlx migrate add -r <name>` from `backend/` to create new migrations (never generate timestamps manually)
|
||||
|
||||
## Banned Patterns
|
||||
|
||||
|
||||
10
Dockerfile
10
Dockerfile
@@ -162,11 +162,19 @@ ENV PATH /usr/local/bin:/root/.local/bin:/tmp/.local/bin:$PATH
|
||||
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends netbase tzdata ca-certificates wget curl jq unzip build-essential unixodbc xmlsec1 software-properties-common tini \
|
||||
&& apt-get install -y --no-install-recommends netbase tzdata ca-certificates wget curl jq unzip build-essential unixodbc xmlsec1 software-properties-common tini gnupg lsb-release \
|
||||
&& if echo "$features" | grep -q "ee"; then apt-get install -y --no-install-recommends libsasl2-modules-gssapi-mit krb5-user; fi \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install latest PostgreSQL client (pg_dump) from official PostgreSQL apt repository
|
||||
RUN curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-archive-keyring.gpg \
|
||||
&& echo "deb [signed-by=/usr/share/keyrings/postgresql-archive-keyring.gpg] https://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y --no-install-recommends postgresql-client \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN if [ "$WITH_GIT" = "true" ]; then \
|
||||
apt-get update -y \
|
||||
&& apt-get install -y git \
|
||||
|
||||
2
ai_evals/.gitignore
vendored
Normal file
2
ai_evals/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
.env
|
||||
results/
|
||||
172
ai_evals/AGENTS.md
Normal file
172
ai_evals/AGENTS.md
Normal file
@@ -0,0 +1,172 @@
|
||||
# AI Evals Authoring Guide
|
||||
|
||||
This folder contains black-box benchmark cases for:
|
||||
|
||||
- `flow`
|
||||
- `app`
|
||||
- `script`
|
||||
- `cli`
|
||||
|
||||
The goal is to test the current production prompts and guidance with realistic user requests, not to test one exact implementation shape.
|
||||
|
||||
## Core rules
|
||||
|
||||
1. Write prompts like a real user request.
|
||||
2. Prefer behavior, inputs, constraints, and outcomes over internal implementation details.
|
||||
3. Keep deterministic validation narrow and hard.
|
||||
4. Put semantic expectations in `judgeChecklist`.
|
||||
5. Use `expected` fixtures only when exact structure really matters.
|
||||
|
||||
## Prompt writing
|
||||
|
||||
Prompts should sound like something a user would naturally ask.
|
||||
|
||||
Good:
|
||||
|
||||
- "Create a flow that routes support requests based on customer tier."
|
||||
- "Add a reset button that sets the counter back to 0."
|
||||
- "Create a flow that reuses the existing greeting script instead of duplicating the logic."
|
||||
|
||||
Bad:
|
||||
|
||||
- "Use `branchone` with 3 branches and a default branch."
|
||||
- "Create a `rawscript` step with this exact topology."
|
||||
- "This is a benchmark harness."
|
||||
|
||||
Do not write prompts as if the user knows Windmill internals unless the case is explicitly testing a power-user workflow.
|
||||
|
||||
## Flow-specific rules
|
||||
|
||||
This is the main principle for flow cases:
|
||||
|
||||
- flow prompts should read like requests from a user who does not know the product internals
|
||||
- the user should ask for behavior, not for `branchone`, `branchall`, `rawscript`, `preprocessor_module`, `failure_module`, exact graph topology, or other internal constructs
|
||||
|
||||
That means:
|
||||
|
||||
- creation cases should describe the business behavior and expected result
|
||||
- modification cases may mention existing step names, because the user can see the current flow
|
||||
- only mention special Windmill constructs when the case is explicitly about those constructs
|
||||
|
||||
Examples:
|
||||
|
||||
- acceptable creation prompt:
|
||||
"Create a purchase approval flow that pauses for approval and asks the approver for a comment."
|
||||
- avoid:
|
||||
"Create a suspend step with one required event and a resume form."
|
||||
|
||||
For flow cases, do not fail a case just because the model chose a different valid topology.
|
||||
|
||||
## App-specific rules
|
||||
|
||||
App prompts should focus on user-visible behavior:
|
||||
|
||||
- what the UI should let the user do
|
||||
- what should persist
|
||||
- what backend behavior is needed
|
||||
|
||||
Avoid prompting in terms of React structure, component names, or implementation unless the case is specifically about editing an existing app.
|
||||
|
||||
## CLI-specific rules
|
||||
|
||||
CLI prompts can be more explicit about paths and file names because real CLI users often do specify them.
|
||||
|
||||
Still, avoid benchmark phrasing. The prompt should read like a repo task, not a harness instruction.
|
||||
|
||||
When relevant, ask the assistant to tell the user which `wmill` commands to run next. That is part of the benchmarked behavior.
|
||||
|
||||
## Deterministic validation
|
||||
|
||||
Use deterministic validation only for hard failures such as:
|
||||
|
||||
- missing required files
|
||||
- unexpected extra files when the prompt says not to create them
|
||||
- syntax errors
|
||||
- unresolved flow refs
|
||||
- missing required special modules or suspend config
|
||||
- obvious artifact corruption
|
||||
|
||||
Do not use deterministic validation to enforce one preferred implementation for broad creation tasks.
|
||||
|
||||
Examples of bad hard checks:
|
||||
|
||||
- exact step topology for a creation flow
|
||||
- exact branch structure when the prompt only asked for routing behavior
|
||||
- exact input shape when multiple reasonable shapes are acceptable
|
||||
|
||||
## Judge checklist
|
||||
|
||||
Every non-trivial case should have a `judgeChecklist`.
|
||||
|
||||
The checklist should capture:
|
||||
|
||||
- the user-visible behavior that must be present
|
||||
- important constraints
|
||||
- key completion criteria
|
||||
|
||||
The checklist should not duplicate low-level implementation details unless they are truly required by the task.
|
||||
|
||||
Good checklist items:
|
||||
|
||||
- "the flow calculates the order total with 8% tax"
|
||||
- "the app persists recipes appropriately for a raw Windmill app"
|
||||
- "the flow reuses the existing workspace script instead of rewriting the logic"
|
||||
|
||||
Bad checklist items:
|
||||
|
||||
- "uses `branchone`"
|
||||
- "contains a `rawscript` node"
|
||||
|
||||
## When to use `expected`
|
||||
|
||||
Use `expected` fixtures when the case is structure-sensitive, for example:
|
||||
|
||||
- exact file creation
|
||||
- exact script content
|
||||
- modification cases where a specific file must change in a specific way
|
||||
- cases where preserving an existing structure is part of the requirement
|
||||
|
||||
Do not use a full `expected` artifact as the semantic oracle for broad creation tasks when multiple valid outputs should pass.
|
||||
|
||||
## When to use `initial`
|
||||
|
||||
Use `initial` when the benchmark is about:
|
||||
|
||||
- editing an existing artifact
|
||||
- reusing existing workspace assets
|
||||
- preserving existing behavior while adding a change
|
||||
|
||||
If the case is greenfield, prefer no `initial`.
|
||||
|
||||
## Case design ladder
|
||||
|
||||
Prefer suites that get gradually harder:
|
||||
|
||||
1. trivial create case
|
||||
2. realistic create case
|
||||
3. reuse-existing-assets case
|
||||
4. modification case
|
||||
5. refactor case
|
||||
6. edge-case or niche product behavior
|
||||
|
||||
The last cases in a suite should cover unusual or product-specific behavior.
|
||||
|
||||
## Anti-patterns
|
||||
|
||||
Avoid these:
|
||||
|
||||
- benchmark framing in prompts
|
||||
- over-specified internal topology for creation tasks
|
||||
- judge checklists that just restate implementation details
|
||||
- deterministic validation that encodes one preferred solution
|
||||
- fixtures that are so minimal or brittle that they create false negatives
|
||||
|
||||
## Before adding a case
|
||||
|
||||
Ask:
|
||||
|
||||
1. Would a real user plausibly write this prompt?
|
||||
2. If the model solves it in a different valid way, would the case still pass?
|
||||
3. Are the hard deterministic checks only catching objectively broken output?
|
||||
4. Does the `judgeChecklist` describe the real success criteria?
|
||||
5. If this case fails, will the reason be understandable from the saved artifacts?
|
||||
1
ai_evals/CLAUDE.md
Normal file
1
ai_evals/CLAUDE.md
Normal file
@@ -0,0 +1 @@
|
||||
@AGENTS.md
|
||||
197
ai_evals/README.md
Normal file
197
ai_evals/README.md
Normal file
@@ -0,0 +1,197 @@
|
||||
# AI Evals
|
||||
|
||||
Small benchmark runner for the four Windmill AI generation modes:
|
||||
|
||||
- `cli`
|
||||
- `flow`
|
||||
- `script`
|
||||
- `app`
|
||||
|
||||
The benchmark always tests the current production prompts, tools, and guidance in this checkout.
|
||||
|
||||
Each attempt runs:
|
||||
|
||||
1. the real production path
|
||||
2. deterministic validation
|
||||
3. LLM judging
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
cd ai_evals
|
||||
bun install
|
||||
```
|
||||
|
||||
Frontend modes (`flow`, `script`, `app`) also require the frontend dependencies to be installed (run from the repository root):
|
||||
|
||||
```bash
|
||||
cd frontend
|
||||
bun install
|
||||
```
|
||||
|
||||
## Commands
|
||||
|
||||
List model aliases:
|
||||
|
||||
```bash
|
||||
cd ai_evals
|
||||
bun run cli -- models
|
||||
```
|
||||
|
||||
List cases:
|
||||
|
||||
```bash
|
||||
cd ai_evals
|
||||
bun run cli -- cases
|
||||
bun run cli -- cases flow
|
||||
```
|
||||
|
||||
Run benchmarks:
|
||||
|
||||
```bash
|
||||
cd ai_evals
|
||||
bun run cli -- run flow
|
||||
bun run cli -- run flow flow-test4-order-processing-loop --model opus
|
||||
bun run cli -- run flow flow-test0-sum-two-numbers --models haiku,opus,4o
|
||||
bun run cli -- run flow flow-test0-sum-two-numbers --runs 3 --verbose
|
||||
bun run cli -- run flow --record
|
||||
WMILL_AI_EVAL_BACKEND_URL=http://127.0.0.1:8000 bun run cli -- run flow --backend-validation preview
|
||||
bun run cli -- run cli bun-hello-script
|
||||
```
|
||||
|
||||
Public CLI surface:
|
||||
|
||||
- `models`
|
||||
- `cases [mode]`
|
||||
- `run <mode> [caseIds...]`
|
||||
|
||||
`run` options:
|
||||
|
||||
- `--runs <n>`: repeat each case `n` times
|
||||
- `--output <path>`: custom result JSON path
|
||||
- `--model <alias>`: choose the model under test
|
||||
- `--models <a,b,c>`: run the same cases sequentially against several model aliases
|
||||
- `--verbose`: stream assistant output for frontend runs
|
||||
- `--record`: append a compact tracked summary line to `ai_evals/history/<mode>.jsonl` for full-suite runs only
|
||||
- `--backend-validation <mode>`: optional backend smoke validation (`off` or `preview`) for `script` and `flow` evals
|
||||
|
||||
## Models
|
||||
|
||||
Use `bun run cli -- models` to see the current aliases.
|
||||
|
||||
Today:
|
||||
|
||||
- `haiku`
|
||||
- `sonnet`
|
||||
- `opus`
|
||||
- `4o`
|
||||
- `gemini-flash`
|
||||
- `gemini-pro`
|
||||
- `gemini-3-flash-preview`
|
||||
- `gemini-3.1-pro-preview`
|
||||
|
||||
Notes:
|
||||
|
||||
- the command also prints accepted alias spellings such as `gpt-4o`, `claude-opus-4.6`, and `claude-haiku-4.5`
|
||||
- frontend modes (`flow`, `script`, `app`) can use Anthropic, OpenAI, and Gemini-backed aliases
|
||||
- `cli` mode always uses the Anthropic agent SDK, so only Anthropic aliases are valid there
|
||||
- the judge model is separate and currently defaults to `claude-sonnet-4-6`
|
||||
|
||||
## Case Format
|
||||
|
||||
Cases live in one YAML file per mode under `ai_evals/cases/`.
|
||||
|
||||
Minimal shape:
|
||||
|
||||
```yaml
|
||||
- id: flow-test0-sum-two-numbers
|
||||
prompt: |-
|
||||
Create a flow that takes two numbers, `a` and `b`, and returns their sum.
|
||||
initial: ai_evals/fixtures/...
|
||||
expected: ai_evals/fixtures/...
|
||||
```
|
||||
|
||||
Optional fields:
|
||||
|
||||
- `initial`: starting state fixture
|
||||
- `expected`: expected artifact fixture
|
||||
- `validate`: extra deterministic validation rules
|
||||
- `runtime.backendPreview`: optional real backend preview config for smoke validation
|
||||
|
||||
For `flow` mode, `validate` can express requirements such as:
|
||||
|
||||
- accepted input schema shapes
|
||||
- required `results.*` reference validity
|
||||
- required module/code/input characteristics
|
||||
|
||||
For `flow` mode, an `initial` fixture can also include a benchmark workspace catalog of
|
||||
existing scripts and flows. That lets the real `search_workspace` and
|
||||
`get_runnable_details` tools discover reusable workspace runnables during evals.
|
||||
|
||||
If `--backend-validation preview` is enabled:
|
||||
|
||||
- `script` evals run a real backend script preview in an isolated temp workspace
|
||||
- `flow` evals run a real backend flow preview only for cases that define `runtime.backendPreview`
|
||||
- `flow` cases with `initial.workspace` fixtures seed those scripts and flows into the preview workspace before preview
|
||||
- when `WMILL_AI_EVAL_BACKEND_WORKSPACE` is set, `ai_evals` treats that workspace as a dedicated test workspace, clears managed eval assets under `f/evals/*` before each preview run, and then reseeds the current case fixtures
|
||||
|
||||
Supported backend validation env vars:
|
||||
|
||||
- `WMILL_AI_EVAL_BACKEND_VALIDATION=preview`
|
||||
- `WMILL_AI_EVAL_BACKEND_URL=http://127.0.0.1:8000`
|
||||
- `WMILL_AI_EVAL_BACKEND_EMAIL=admin@windmill.dev`
|
||||
- `WMILL_AI_EVAL_BACKEND_PASSWORD=changeme`
|
||||
- `WMILL_AI_EVAL_BACKEND_WORKSPACE=integration-tests` to reuse an existing workspace on CE installs with low workspace limits
|
||||
- `WMILL_AI_EVAL_KEEP_WORKSPACES=1`
|
||||
- `WMILL_AI_EVAL_WORKSPACE_PREFIX=ai-evals`
|
||||
|
||||
## Results And Artifacts
|
||||
|
||||
Every run writes:
|
||||
|
||||
- a summary JSON under `ai_evals/results/`
|
||||
- generated artifacts in a sibling directory
|
||||
|
||||
If `--record` is used, the CLI also appends one compact JSON line to:
|
||||
|
||||
- `ai_evals/history/flow.jsonl`
|
||||
- `ai_evals/history/script.jsonl`
|
||||
- `ai_evals/history/app.jsonl`
|
||||
- `ai_evals/history/cli.jsonl`
|
||||
|
||||
Each recorded line contains:
|
||||
|
||||
- run metadata (`createdAt`, `gitSha`, `mode`, `runModel`, `judgeModel`)
|
||||
- suite totals (`caseCount`, `attemptCount`, `passedAttempts`, `passRate`, `averageDurationMs`, `averageJudgeScore`)
|
||||
- average token usage (`averageTokenUsagePerAttempt`)
|
||||
- per-case metrics under `cases[]` (`averageDurationMs`, `averageJudgeScore`, `averageTokenUsagePerAttempt`, pass rate)
|
||||
- `failedCaseIds`
|
||||
|
||||
Example:
|
||||
|
||||
- summary: `ai_evals/results/2026-04-09T09-40-33.051Z__flow.json`
|
||||
- artifacts: `ai_evals/results/2026-04-09T09-40-33.051Z__flow/`
|
||||
|
||||
Typical artifacts by mode:
|
||||
|
||||
- `flow`: `flow.json`
|
||||
- `script`: `script.json` plus the generated script file
|
||||
- `app`: `app.json` plus frontend/backend files
|
||||
- `cli`: `assistant-output.txt` plus generated workspace files
|
||||
- backend-validated attempts also include `backend-preview.json`
|
||||
|
||||
## Layout
|
||||
|
||||
- `cases/`: one YAML file per mode
|
||||
- `fixtures/`: initial and expected fixtures
|
||||
- `core/`: shared loading, model resolution, validation, judging, and result writing
|
||||
- `modes/`: one runner per mode
|
||||
- `history/`: optional tracked pass-rate history written by `run --record`, one JSONL file per mode
|
||||
- `results/`: local benchmark output and artifacts
|
||||
|
||||
## Notes
|
||||
|
||||
- Frontend modes reuse the production frontend chat code through the Vitest bridge.
|
||||
- CLI mode creates an isolated workspace, writes the current checkout guidance into it, and benchmarks the real skills / `AGENTS.md` flow.
|
||||
- Frontend progress streams live while the benchmark is running.
|
||||
- Deterministic validators should stay focused on real correctness constraints, not one exact implementation shape.
|
||||
72
ai_evals/adapters/cli/runtime.test.ts
Normal file
72
ai_evals/adapters/cli/runtime.test.ts
Normal file
@@ -0,0 +1,72 @@
|
||||
import { describe, expect, it } from "bun:test";
|
||||
import {
|
||||
anthropicUsageToBenchmarkTokenUsage,
|
||||
extractCliResultTokenUsage,
|
||||
} from "./runtime";
|
||||
|
||||
// Unit tests for converting Anthropic-style usage counters into benchmark token usage.
describe("anthropicUsageToBenchmarkTokenUsage", () => {
  it("includes cache tokens in prompt usage", () => {
    expect(
      anthropicUsageToBenchmarkTokenUsage({
        input_tokens: 120,
        output_tokens: 45,
        cache_creation_input_tokens: 30,
        cache_read_input_tokens: 5,
      })
    ).toEqual({
      // prompt = 120 input + 30 cache creation + 5 cache read
      prompt: 155,
      completion: 45,
      // total = prompt + completion
      total: 200,
    });
  });

  it("returns null when usage is absent", () => {
    expect(anthropicUsageToBenchmarkTokenUsage(null)).toBeNull();
  });
});
|
||||
|
||||
// Unit tests for reading token usage out of a CLI "result" event.
describe("extractCliResultTokenUsage", () => {
  it("reads aggregate usage from the SDK result event", () => {
    expect(
      extractCliResultTokenUsage({
        type: "result",
        usage: {
          input_tokens: 400,
          output_tokens: 120,
          cache_creation_input_tokens: 50,
          cache_read_input_tokens: 25,
        },
      })
    ).toEqual({
      // prompt = 400 input + 50 cache creation + 25 cache read
      prompt: 475,
      completion: 120,
      total: 595,
    });
  });

  it("falls back to modelUsage when aggregate usage is unavailable", () => {
    expect(
      extractCliResultTokenUsage({
        type: "result",
        modelUsage: {
          opus: {
            inputTokens: 200,
            outputTokens: 60,
            cacheCreationInputTokens: 10,
            cacheReadInputTokens: 5,
          },
          haiku: {
            inputTokens: 80,
            outputTokens: 20,
            cacheCreationInputTokens: 0,
            cacheReadInputTokens: 15,
          },
        },
      })
    ).toEqual({
      // Counters are summed across every model entry:
      // prompt = (200 + 10 + 5) + (80 + 0 + 15), completion = 60 + 20
      prompt: 310,
      completion: 80,
      total: 390,
    });
  });
});
|
||||
199
ai_evals/adapters/cli/runtime.ts
Normal file
199
ai_evals/adapters/cli/runtime.ts
Normal file
@@ -0,0 +1,199 @@
|
||||
import { query, type Options } from "@anthropic-ai/claude-agent-sdk";
|
||||
import { join } from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
import { getCliEvalModel, resolveEvalModel, type CliEvalModelConfig } from "../../core/models";
|
||||
import type { BenchmarkTokenUsage } from "../../core/types";
|
||||
|
||||
/** One recorded tool call made during a prompt run. */
export interface ToolInvocation {
  /** Name of the tool that was invoked. */
  tool: string;
  /** Arguments passed to the tool, kept as an untyped key/value bag. */
  input: Record<string, unknown>;
  /** When the call was observed. NOTE(review): presumably epoch milliseconds — confirm against the producer. */
  timestamp: number;
}
|
||||
|
||||
/** Everything captured from a single prompt run; the resolved value of `runPromptAndCapture`. */
export interface PromptRunResult {
  /** Tool calls recorded during the run. */
  toolsUsed: ToolInvocation[];
  /** Names of the skills invoked during the run. */
  skillsInvoked: string[];
  /** Text output collected from the run. */
  output: string;
  /** Duration of the run in milliseconds. */
  durationMs: number;
  /** Number of assistant messages seen during the run. */
  assistantMessageCount: number;
  /** Token usage for the run, or `null` when no usage was reported. */
  tokenUsage: BenchmarkTokenUsage | null;
}
|
||||
|
||||
/**
 * Snake_case usage counters as they appear on a result message's `usage` field.
 * Every counter is optional and nullable; consumers treat a missing value as 0.
 */
interface AnthropicUsageLike {
  input_tokens?: number | null;
  output_tokens?: number | null;
  cache_creation_input_tokens?: number | null;
  cache_read_input_tokens?: number | null;
}
|
||||
|
||||
/**
 * CamelCase per-model usage counters, used as the values of a result message's
 * `modelUsage` record. Every counter is optional and nullable; consumers treat
 * a missing value as 0.
 */
interface AnthropicModelUsageLike {
  inputTokens?: number | null;
  outputTokens?: number | null;
  cacheCreationInputTokens?: number | null;
  cacheReadInputTokens?: number | null;
}
|
||||
|
||||
/**
 * Loose shape of a streamed message as far as token accounting is concerned.
 * Only messages whose `type` is `"result"` are inspected for usage.
 */
interface CliResultMessageLike {
  /** Message discriminator. */
  type?: string;
  /** Aggregate usage counters for the whole run, when reported. */
  usage?: AnthropicUsageLike | null;
  /** Usage counters keyed by model name; used as a fallback when `usage` is absent. */
  modelUsage?: Record<string, AnthropicModelUsageLike> | null;
}
|
||||
|
||||
/** Filesystem path of the directory three levels above this module's directory. */
const REPO_ROOT = fileURLToPath(new URL("../../../", import.meta.url));
/** Model configuration used by `runPromptAndCapture` when the caller does not pass one. */
export const DEFAULT_CLI_EVAL_MODEL: CliEvalModelConfig = getCliEvalModel(resolveEvalModel("cli"));
|
||||
|
||||
/**
 * Returns the path of the auto-generated skills directory.
 *
 * @returns `<REPO_ROOT>/system_prompts/auto-generated/skills`
 */
export function getGeneratedSkillsSource(): string {
  return join(REPO_ROOT, "system_prompts", "auto-generated", "skills");
}
|
||||
|
||||
/**
 * Converts Anthropic-style usage counters into the benchmark's token usage shape.
 *
 * Prompt tokens include cache-creation and cache-read input tokens in addition
 * to plain input tokens. Missing or `null` counters count as 0.
 *
 * @param usage - Usage counters from a message, or `null`/`undefined` when none were reported.
 * @returns The prompt/completion/total breakdown, or `null` when `usage` is absent.
 */
export function anthropicUsageToBenchmarkTokenUsage(
  usage: AnthropicUsageLike | null | undefined
): BenchmarkTokenUsage | null {
  if (!usage) {
    return null;
  }

  const prompt =
    (usage.input_tokens ?? 0) +
    (usage.cache_creation_input_tokens ?? 0) +
    (usage.cache_read_input_tokens ?? 0);
  const completion = usage.output_tokens ?? 0;

  return {
    prompt,
    completion,
    total: prompt + completion,
  };
}
|
||||
|
||||
/**
 * Pulls token usage out of a CLI stream message.
 *
 * Only object messages whose `type` is `"result"` are considered. The aggregate
 * `usage` field wins when it is present; otherwise the counters of every object
 * entry in `modelUsage` are summed.
 *
 * @param message Any value received from the CLI message stream.
 * @returns The token usage, or `null` when the message is not a result message
 *   or carries no usable usage information.
 */
export function extractCliResultTokenUsage(message: unknown): BenchmarkTokenUsage | null {
  const isObjectMessage = Boolean(message) && typeof message === "object";
  if (!isObjectMessage) {
    return null;
  }

  const candidate = message as CliResultMessageLike;
  if (candidate.type !== "result") {
    return null;
  }

  // Preferred source: the aggregate usage counters.
  const aggregate = anthropicUsageToBenchmarkTokenUsage(candidate.usage);
  if (aggregate) {
    return aggregate;
  }

  // Fallback source: usage counters broken down per `modelUsage` entry.
  const breakdown = candidate.modelUsage;
  if (!breakdown || typeof breakdown !== "object") {
    return null;
  }

  const entries = Object.values(breakdown).filter(
    (entry) => Boolean(entry) && typeof entry === "object"
  );
  if (entries.length === 0) {
    return null;
  }

  let prompt = 0;
  let completion = 0;
  for (const entry of entries) {
    prompt +=
      (entry.inputTokens ?? 0) +
      (entry.cacheCreationInputTokens ?? 0) +
      (entry.cacheReadInputTokens ?? 0);
    completion += entry.outputTokens ?? 0;
  }

  return { prompt, completion, total: prompt + completion };
}
|
||||
|
||||
/**
 * Runs `prompt` through `query` and records what the agent did.
 *
 * While streaming messages it collects every `tool_use` block, the `skill`
 * argument of each `Skill` tool call, all assistant text, the final result
 * string and the token usage reported by the result message.
 *
 * @param prompt Prompt text handed to `query`.
 * @param cwd Working directory passed through in the query options.
 * @param maxTurns Turn limit passed through in the query options (default 3).
 * @param modelConfig Model configuration; only its `model` field is used here.
 * @returns The captured tools, skills, output, duration, message count and token usage.
 */
export async function runPromptAndCapture(
  prompt: string,
  cwd: string,
  maxTurns: number = 3,
  modelConfig: CliEvalModelConfig = DEFAULT_CLI_EVAL_MODEL
): Promise<PromptRunResult> {
  const toolsUsed: ToolInvocation[] = [];
  const skillsInvoked: string[] = [];
  let output = "";
  let assistantMessageCount = 0;
  let tokenUsage: BenchmarkTokenUsage | null = null;
  const startedAt = Date.now();

  const options: Options = {
    cwd,
    model: modelConfig.model,
    maxTurns,
    settingSources: ["project"],
    allowedTools: ["Skill", "Read", "Glob", "Grep", "Bash", "Write", "Edit"]
  };

  for await (const message of query({ prompt, options })) {
    if (message.type === "result") {
      const { result } = message as { result?: string };
      // Keep the previously captured usage when this message carries none.
      tokenUsage = extractCliResultTokenUsage(message) ?? tokenUsage;
      if (typeof result === "string") {
        output += result;
      }
      continue;
    }

    if (message.type !== "assistant") {
      continue;
    }

    assistantMessageCount += 1;
    const blocks = message.message?.content;
    if (!Array.isArray(blocks)) {
      continue;
    }

    for (const block of blocks) {
      if (block.type === "text") {
        output += block.text;
        continue;
      }
      if (block.type !== "tool_use") {
        continue;
      }

      toolsUsed.push({
        tool: block.name,
        input: block.input as Record<string, unknown>,
        timestamp: Date.now()
      });

      const isSkillCall =
        block.name === "Skill" && typeof block.input === "object" && block.input !== null;
      if (!isSkillCall) {
        continue;
      }

      const { skill } = block.input as { skill?: string };
      if (skill) {
        skillsInvoked.push(skill);
      }
    }
  }

  const durationMs = Date.now() - startedAt;
  return {
    toolsUsed,
    skillsInvoked,
    output,
    durationMs,
    assistantMessageCount,
    tokenUsage,
  };
}
|
||||
|
||||
/**
 * Reports whether a run invoked the given skill.
 *
 * A recorded skill matches when it equals `skillName` or merely contains it as
 * a substring.
 *
 * @param result Captured run to inspect.
 * @param skillName Skill name (or fragment) to look for.
 */
export function wasSkillInvoked(result: PromptRunResult, skillName: string): boolean {
  for (const invoked of result.skillsInvoked) {
    if (invoked === skillName || invoked.includes(skillName)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Reports whether a run called the tool with exactly the given name.
 *
 * @param result Captured run to inspect.
 * @param toolName Tool name to look for (exact match).
 */
export function wasToolUsed(result: PromptRunResult, toolName: string): boolean {
  for (const invocation of result.toolsUsed) {
    if (invocation.tool === toolName) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Formats a model configuration as a `provider:model` label.
 *
 * @param modelConfig Configuration whose `provider` and `model` are rendered.
 */
export function formatCliRunModelLabel(modelConfig: CliEvalModelConfig): string {
  const { provider, model } = modelConfig;
  return `${provider}:${model}`;
}
|
||||
|
||||
/**
 * Collects the inputs of every call made to one tool during a run.
 *
 * @param result Captured run to inspect.
 * @param toolName Tool name to select (exact match).
 * @returns The matching inputs in invocation order; empty when the tool was never called.
 */
export function getToolInputs(
  result: PromptRunResult,
  toolName: string
): Record<string, unknown>[] {
  const inputs: Record<string, unknown>[] = [];
  for (const invocation of result.toolsUsed) {
    if (invocation.tool === toolName) {
      inputs.push(invocation.input);
    }
  }
  return inputs;
}
|
||||
246
ai_evals/adapters/frontend/backendPreview.test.ts
Normal file
246
ai_evals/adapters/frontend/backendPreview.test.ts
Normal file
@@ -0,0 +1,246 @@
|
||||
import { afterEach, describe, expect, it } from 'bun:test'
|
||||
import type { BackendValidationSettings } from '../../core/backendValidation'
|
||||
import { BackendPreviewClient } from './backendPreview'
|
||||
|
||||
// Untouched fetch implementation; each test installs a stub and this is put back afterwards.
const ORIGINAL_FETCH = globalThis.fetch

afterEach(() => {
  globalThis.fetch = ORIGINAL_FETCH
})

describe('BackendPreviewClient', () => {
  it('updates an existing seeded script on path conflict and waits for deployment', async () => {
    const calls: Array<{ url: string; init?: RequestInit }> = []
    // Scripted replies, consumed in request order.
    const replies = [
      textResponse(200, 'token'),
      textResponse(200, ''),
      textResponse(400, 'Path conflict for f/evals/add_two_numbers with non-archived hash 123'),
      jsonResponse(200, { hash: '123' }),
      textResponse(200, '456'),
      jsonResponse(200, { lock: 'script.lock', lock_error_logs: null })
    ]
    globalThis.fetch = mockFetch(calls, ...replies)

    const settings = buildSettings({ baseUrl: 'http://backend.test/script-upsert' })
    const preview = new BackendPreviewClient(settings)

    await preview.createScript({
      workspaceId: 'test',
      path: 'f/evals/add_two_numbers',
      summary: 'Add two numbers',
      content: 'export async function main(a: number, b: number) { return a + b }',
      language: 'bun'
    })

    const seenUrls = calls.map((entry) => entry.url)
    expect(seenUrls).toEqual([
      'http://backend.test/script-upsert/api/auth/login',
      'http://backend.test/script-upsert/api/w/test/folders/create',
      'http://backend.test/script-upsert/api/w/test/scripts/create',
      'http://backend.test/script-upsert/api/w/test/scripts/get/p/f/evals/add_two_numbers',
      'http://backend.test/script-upsert/api/w/test/scripts/create',
      'http://backend.test/script-upsert/api/w/test/scripts/deployment_status/h/456'
    ])

    // The fifth request is the second create call, which must reference the existing hash.
    const secondCreate = calls[4]
    expect(secondCreate.init?.method).toBe('POST')
    const secondCreateBody = JSON.parse(String(secondCreate.init?.body))
    expect(secondCreateBody).toMatchObject({
      path: 'f/evals/add_two_numbers',
      parent_hash: '123',
      language: 'bun'
    })
  })

  it('updates an existing seeded flow on create conflict', async () => {
    const calls: Array<{ url: string; init?: RequestInit }> = []
    const replies = [
      textResponse(200, 'token'),
      textResponse(200, ''),
      textResponse(400, 'Flow f/evals/add_numbers_flow already exists'),
      textResponse(200, '')
    ]
    globalThis.fetch = mockFetch(calls, ...replies)

    const settings = buildSettings({ baseUrl: 'http://backend.test/flow-upsert' })
    const preview = new BackendPreviewClient(settings)

    await preview.createFlow({
      workspaceId: 'test',
      path: 'f/evals/add_numbers_flow',
      summary: 'Add numbers',
      value: { modules: [] }
    })

    const seenUrls = calls.map((entry) => entry.url)
    expect(seenUrls).toEqual([
      'http://backend.test/flow-upsert/api/auth/login',
      'http://backend.test/flow-upsert/api/w/test/folders/create',
      'http://backend.test/flow-upsert/api/w/test/flows/create',
      'http://backend.test/flow-upsert/api/w/test/flows/update/f/evals/add_numbers_flow'
    ])

    const flowUpdate = calls[3]
    expect(flowUpdate.init?.method).toBe('POST')
    const flowUpdateBody = JSON.parse(String(flowUpdate.init?.body))
    expect(flowUpdateBody).toMatchObject({
      path: 'f/evals/add_numbers_flow',
      value: { modules: [] }
    })
  })

  it('serializes shared-workspace validations inside the overridden workspace', async () => {
    // Canned replies selected by URL suffix; anything else is a test failure.
    const canned: Array<{ suffix: string; reply: () => Response }> = [
      { suffix: '/api/auth/login', reply: () => textResponse(200, 'token') },
      { suffix: '/api/workspaces/exists', reply: () => textResponse(200, 'true') },
      { suffix: '/api/w/shared-preview/flows/list_paths', reply: () => jsonResponse(200, []) },
      { suffix: '/api/w/shared-preview/scripts/list_paths', reply: () => jsonResponse(200, []) }
    ]
    globalThis.fetch = async (input) => {
      const url = String(input)
      const route = canned.find((candidate) => url.endsWith(candidate.suffix))
      if (!route) {
        throw new Error(`Unexpected fetch: ${url}`)
      }
      return route.reply()
    }

    const preview = new BackendPreviewClient(
      buildSettings({
        baseUrl: 'http://backend.test/shared-lock',
        workspaceOverride: 'shared-preview'
      })
    )

    const timeline: string[] = []
    let unblockFirst: (() => void) | undefined
    let signalFirstStarted: (() => void) | undefined
    const firstHasStarted = new Promise<void>((resolve) => {
      signalFirstStarted = resolve
    })

    // The first body parks itself until the test releases it explicitly.
    const firstRun = preview.withWorkspace('flow-test1', 1, async () => {
      timeline.push('first:start')
      signalFirstStarted?.()
      await new Promise<void>((resolve) => {
        unblockFirst = resolve
      })
      timeline.push('first:end')
    })

    const secondRun = preview.withWorkspace('flow-test2', 1, async () => {
      timeline.push('second:start')
      timeline.push('second:end')
    })

    await firstHasStarted
    expect(timeline).toEqual(['first:start'])

    unblockFirst?.()
    await Promise.all([firstRun, secondRun])

    expect(timeline).toEqual(['first:start', 'first:end', 'second:start', 'second:end'])
  })

  it('clears managed shared-workspace assets before preview runs', async () => {
    const calls: Array<{ url: string; init?: RequestInit }> = []
    const replies = [
      textResponse(200, 'token'),
      textResponse(200, 'true'),
      jsonResponse(200, ['f/evals/old_subflow', 'u/admin/keep_flow']),
      textResponse(200, ''),
      jsonResponse(200, ['f/evals/old_script', 'f/shared/keep_script']),
      textResponse(200, '')
    ]
    globalThis.fetch = mockFetch(calls, ...replies)

    const preview = new BackendPreviewClient(
      buildSettings({
        baseUrl: 'http://backend.test/shared-cleanup',
        workspaceOverride: 'shared-preview'
      })
    )

    await preview.withWorkspace('flow-test1', 1, async () => undefined)

    const seenUrls = calls.map((entry) => entry.url)
    expect(seenUrls).toEqual([
      'http://backend.test/shared-cleanup/api/auth/login',
      'http://backend.test/shared-cleanup/api/workspaces/exists',
      'http://backend.test/shared-cleanup/api/w/shared-preview/flows/list_paths',
      'http://backend.test/shared-cleanup/api/w/shared-preview/flows/delete/f/evals/old_subflow',
      'http://backend.test/shared-cleanup/api/w/shared-preview/scripts/list_paths',
      'http://backend.test/shared-cleanup/api/w/shared-preview/scripts/delete/p/f/evals/old_script'
    ])
  })

  it('retries login after a cached login failure', async () => {
    const calls: Array<{ url: string; init?: RequestInit }> = []
    const replies = [
      textResponse(503, 'backend starting'),
      textResponse(200, 'token'),
      textResponse(200, 'true'),
      jsonResponse(200, []),
      jsonResponse(200, [])
    ]
    globalThis.fetch = mockFetch(calls, ...replies)

    const preview = new BackendPreviewClient(
      buildSettings({
        baseUrl: 'http://backend.test/login-retry',
        workspaceOverride: 'shared-preview'
      })
    )

    const failedAttempt = preview.withWorkspace('flow-test1', 1, async () => undefined)
    await expect(failedAttempt).rejects.toThrow('login for backend validation failed')

    const retriedAttempt = preview.withWorkspace('flow-test1', 1, async () => 'ok')
    await expect(retriedAttempt).resolves.toBe('ok')

    const loginCalls = calls.filter(
      (entry) => entry.url === 'http://backend.test/login-retry/api/auth/login'
    )
    expect(loginCalls).toHaveLength(2)
  })
})
|
||||
|
||||
/**
 * Builds backend validation settings for a test.
 *
 * @param overrides Fields that replace the defaults below.
 * @returns The defaults merged with `overrides` (overrides win).
 */
function buildSettings(
  overrides: Partial<BackendValidationSettings> = {}
): BackendValidationSettings {
  const defaults: BackendValidationSettings = {
    mode: 'preview',
    baseUrl: 'http://backend.test/default',
    email: 'admin@windmill.dev',
    password: 'changeme',
    keepWorkspaces: true,
    workspacePrefix: 'ai-evals',
    pollIntervalMs: 1,
    maxWaitMs: 50
  }
  return { ...defaults, ...overrides }
}
|
||||
|
||||
/**
 * Creates a `fetch` stand-in that replays scripted responses in order.
 *
 * @param requests Array that receives one `{ url, init }` record per call.
 * @param responses Responses handed out one per call, first to last.
 * @returns A fetch-compatible function that throws `Unexpected fetch: <url>`
 *   once the scripted responses are used up.
 */
function mockFetch(
  requests: Array<{ url: string; init?: RequestInit }>,
  ...responses: Response[]
): typeof fetch {
  // Private copy so the caller's array is never consumed.
  const pending = Array.from(responses)

  return async (input, init) => {
    const url = String(input)
    requests.push({ url, init })

    const scripted = pending.shift()
    if (!scripted) {
      throw new Error(`Unexpected fetch: ${url}`)
    }
    return scripted
  }
}
|
||||
|
||||
/**
 * Builds a `Response` whose body is `body` serialized as JSON.
 *
 * @param status HTTP status code of the response.
 * @param body Value passed to `JSON.stringify`.
 */
function jsonResponse(status: number, body: unknown): Response {
  const headers = { 'Content-Type': 'application/json' }
  const serialized = JSON.stringify(body)
  return new Response(serialized, { status, headers })
}
|
||||
|
||||
/**
 * Builds a `Response` with a plain string body.
 *
 * @param status HTTP status code of the response.
 * @param body Body text, used verbatim.
 */
function textResponse(status: number, body: string): Response {
  const init: ResponseInit = { status }
  return new Response(body, init)
}
|
||||
502
ai_evals/adapters/frontend/backendPreview.ts
Normal file
502
ai_evals/adapters/frontend/backendPreview.ts
Normal file
@@ -0,0 +1,502 @@
|
||||
import { randomUUID } from 'node:crypto'
|
||||
import type { BackendValidationSettings } from '../../core/backendValidation'
|
||||
|
||||
/** Body of the `get_result_maybe` polling endpoint, as far as this module reads it. */
interface CompletedJobResultMaybe {
  /** True once the job has finished; polling continues while false. */
  completed: boolean
  /** Job result, copied into the returned `CompletedPreviewJob`. */
  result: unknown
  /** Success flag; coerced with `Boolean(...)` when read. */
  success?: boolean
  started?: boolean
}
|
||||
|
||||
/** Body of the script `deployment_status` endpoint, as far as this module reads it. */
interface ScriptDeploymentStatus {
  /** Any non-nullish value is treated as "deployment finished". */
  lock?: unknown
  /** A non-empty value is treated as a deployment failure. */
  lock_error_logs?: string | null
}
|
||||
|
||||
/** Outcome of a finished preview job as returned by `BackendPreviewClient`. */
export interface CompletedPreviewJob {
  /** Job id returned when the preview was started. */
  id: string
  /** Success flag from the polling response, coerced to a boolean. */
  success: boolean
  /** Result value from the polling response. */
  result: unknown
  /** `logs` of the completed job when it is a string, otherwise `null`. */
  logs?: string | null
  /** Full completed-job payload as returned by the backend. */
  raw: Record<string, unknown>
}
|
||||
|
||||
// In-flight or resolved login promises, keyed by `${baseUrl}|${email}`.
const tokenCache: Map<string, Promise<string>> = new Map()

// Tail of the pending work chain for each shared workspace id.
const sharedWorkspaceQueue: Map<string, Promise<void>> = new Map()

// Only paths under these prefixes are deleted when a shared workspace is cleaned.
const managedSharedWorkspacePrefixes: readonly string[] = ['f/evals/']
|
||||
|
||||
/**
 * HTTP client for the backend used to validate generated scripts and flows.
 *
 * It provisions (or reuses) a workspace, seeds scripts and flows into it, and
 * runs script/flow preview jobs, polling until they complete. Every request is
 * authenticated with a bearer token obtained from the login endpoint and cached
 * per `baseUrl` + `email`.
 */
export class BackendPreviewClient {
  constructor(private readonly settings: BackendValidationSettings) {}

  /**
   * Runs `body` against a workspace prepared for one eval attempt.
   *
   * With `settings.workspaceOverride` set, that workspace is reused: runs are
   * serialized per workspace id and managed assets are deleted before `body`
   * starts. Otherwise a fresh workspace id is generated, and the workspace is
   * deleted afterwards unless `settings.keepWorkspaces` is set.
   *
   * @param caseId Eval case id, used to derive a generated workspace id.
   * @param attempt Attempt number, used to derive a generated workspace id.
   * @param body Work to run; receives the workspace id.
   * @returns Whatever `body` resolves to.
   */
  async withWorkspace<T>(
    caseId: string,
    attempt: number,
    body: (workspaceId: string) => Promise<T>
  ): Promise<T> {
    const { workspaceOverride, workspacePrefix } = this.settings
    const isShared = Boolean(workspaceOverride)
    const workspaceId = workspaceOverride ?? buildWorkspaceId(workspacePrefix, caseId, attempt)

    const runInWorkspace = async (): Promise<T> => {
      await this.ensureWorkspace(workspaceId)
      if (isShared) {
        await this.clearManagedSharedWorkspaceAssets(workspaceId)
      }

      try {
        return await body(workspaceId)
      } finally {
        const disposable = !this.settings.keepWorkspaces && !isShared
        if (disposable) {
          // Best-effort cleanup: a failed delete must not mask the outcome of `body`.
          await this.deleteWorkspace(workspaceId).catch(() => undefined)
        }
      }
    }

    if (!isShared) {
      return await runInWorkspace()
    }
    return await withSharedWorkspaceLock(workspaceId, runInWorkspace)
  }

  /**
   * Creates a script and waits for its deployment to finish.
   *
   * When creation fails with a conflict message, the existing script's hash is
   * fetched and the create call is repeated with `parent_hash` set to it.
   *
   * @throws Error when creation fails for a non-conflict reason, when the
   *   follow-up call fails, or when deployment fails or times out.
   */
  async createScript(input: {
    workspaceId: string
    path: string
    summary: string
    description?: string
    schema?: Record<string, unknown>
    content: string
    language: string
  }): Promise<void> {
    const { workspaceId, path } = input
    await this.ensureFolderForPath(workspaceId, path)

    const payload = {
      path,
      summary: input.summary,
      description: input.description ?? '',
      content: input.content,
      schema: input.schema ?? { type: 'object', properties: {}, required: [] },
      is_template: false,
      language: input.language,
      kind: 'script'
    }
    const createEndpoint = `/w/${encodeURIComponent(workspaceId)}/scripts/create`

    const created = await this.postJson(createEndpoint, payload)
    if (created.ok) {
      const hash = (await created.text()).trim()
      await this.waitForScriptDeployment(workspaceId, path, hash)
      return
    }

    const failure = await created.text()
    if (!isConflictMessage(failure)) {
      throw new Error(`create script ${path} failed: ${created.status} ${created.statusText} - ${failure}`)
    }

    // The path is taken: create again, this time on top of the existing script's hash.
    const existing = await this.getScriptByPath(workspaceId, path)
    const parentHash = readStringField(existing, 'hash', `script ${path}`)
    const updated = await this.postJson(createEndpoint, { ...payload, parent_hash: parentHash })
    await expectOk(updated, `update script ${path}`)
    const updatedHash = (await updated.text()).trim()
    await this.waitForScriptDeployment(workspaceId, path, updatedHash)
  }

  /**
   * Creates a flow, falling back to the update endpoint when creation fails
   * with a conflict message.
   *
   * @throws Error when creation fails for a non-conflict reason or the update fails.
   */
  async createFlow(input: {
    workspaceId: string
    path: string
    summary: string
    description?: string
    schema?: Record<string, unknown>
    value: Record<string, unknown>
  }): Promise<void> {
    const { workspaceId, path } = input
    await this.ensureFolderForPath(workspaceId, path)

    const payload = {
      path,
      summary: input.summary,
      description: input.description ?? '',
      schema: input.schema ?? { type: 'object', properties: {}, required: [] },
      value: input.value
    }
    const workspaceBase = `/w/${encodeURIComponent(workspaceId)}`

    const created = await this.postJson(`${workspaceBase}/flows/create`, payload)
    if (created.ok) {
      return
    }

    const failure = await created.text()
    if (!isConflictMessage(failure)) {
      throw new Error(`create flow ${path} failed: ${created.status} ${created.statusText} - ${failure}`)
    }

    const updated = await this.postJson(`${workspaceBase}/flows/update/${path}`, payload)
    await expectOk(updated, `update flow ${path}`)
  }

  /**
   * Starts a script preview job and waits for it to complete.
   *
   * @returns The completed job, including its result and logs.
   * @throws Error when the job cannot be started or does not finish in time.
   */
  async runScriptPreview(input: {
    workspaceId: string
    content: string
    args: Record<string, unknown>
    language: string
    path?: string
    timeoutSeconds?: number
  }): Promise<CompletedPreviewJob> {
    const endpoint = withQuery(`/w/${encodeURIComponent(input.workspaceId)}/jobs/run/preview`, {
      timeout: input.timeoutSeconds
    })
    const started = await this.postJson(endpoint, {
      content: input.content,
      args: input.args,
      language: input.language,
      path: input.path
    })

    await expectOk(started, 'start script preview')
    const jobId = (await started.text()).trim()
    return await this.waitForCompletedJob(input.workspaceId, jobId)
  }

  /**
   * Starts a flow preview job and waits for it to complete.
   *
   * @returns The completed job, including its result and logs.
   * @throws Error when the job cannot be started or does not finish in time.
   */
  async runFlowPreview(input: {
    workspaceId: string
    value: Record<string, unknown>
    args: Record<string, unknown>
    timeoutSeconds?: number
    path?: string
  }): Promise<CompletedPreviewJob> {
    const endpoint = withQuery(`/w/${encodeURIComponent(input.workspaceId)}/jobs/run/preview_flow`, {
      timeout: input.timeoutSeconds
    })
    const started = await this.postJson(endpoint, {
      value: input.value,
      args: input.args,
      path: input.path
    })

    await expectOk(started, 'start flow preview')
    const jobId = (await started.text()).trim()
    return await this.waitForCompletedJob(input.workspaceId, jobId)
  }

  /** Creates the workspace unless the backend reports that it already exists. */
  private async ensureWorkspace(workspaceId: string): Promise<void> {
    const existence = await this.postJson('/workspaces/exists', { id: workspaceId })
    await expectOk(existence, `check workspace ${workspaceId}`)

    const alreadyExists = (await existence.text()).trim() === 'true'
    if (alreadyExists) {
      return
    }

    const creation = await this.postJson('/workspaces/create', {
      id: workspaceId,
      name: workspaceId
    })
    try {
      await expectOk(creation, `create workspace ${workspaceId}`)
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error)
      if (!message.includes('maximum number of workspaces')) {
        throw error
      }
      // Workspace quota reached: point the user at the shared-workspace escape hatch.
      throw new Error(
        `${message}. Reuse an existing workspace with WMILL_AI_EVAL_BACKEND_WORKSPACE=<workspace-id>.`
      )
    }
  }

  /** Deletes the workspace; throws when the backend rejects the request. */
  private async deleteWorkspace(workspaceId: string): Promise<void> {
    const endpoint = `/workspaces/delete/${encodeURIComponent(workspaceId)}`
    const response = await this.request(endpoint, { method: 'DELETE' })
    await expectOk(response, `delete workspace ${workspaceId}`)
  }

  /**
   * Creates the folder that `path` lives in, when the path has one. An
   * "already exists" failure from the backend is treated as success.
   */
  private async ensureFolderForPath(workspaceId: string, path: string): Promise<void> {
    const folderName = extractFolderName(path)
    if (!folderName) {
      return
    }

    const response = await this.postJson(`/w/${encodeURIComponent(workspaceId)}/folders/create`, {
      name: folderName
    })
    if (response.ok) {
      return
    }

    const message = await response.text()
    const alreadyThere = message.toLowerCase().includes('already exists')
    if (!alreadyThere) {
      throw new Error(`Failed to create folder ${folderName}: ${message}`)
    }
  }

  /**
   * Polls the job until it is reported complete, then fetches the completed
   * job record. Gives up after `settings.maxWaitMs`.
   */
  private async waitForCompletedJob(
    workspaceId: string,
    jobId: string
  ): Promise<CompletedPreviewJob> {
    const deadline = Date.now() + this.settings.maxWaitMs
    const completedBase = `/w/${encodeURIComponent(workspaceId)}/jobs_u/completed`
    const jobSegment = encodeURIComponent(jobId)

    while (Date.now() < deadline) {
      const probe = await this.request(
        `${completedBase}/get_result_maybe/${jobSegment}?get_started=false`
      )
      await expectOk(probe, `poll job ${jobId}`)
      const status = (await probe.json()) as CompletedJobResultMaybe

      if (!status.completed) {
        await this.pause()
        continue
      }

      const detail = await this.request(`${completedBase}/get/${jobSegment}`)
      await expectOk(detail, `get completed job ${jobId}`)
      const raw = (await detail.json()) as Record<string, unknown>

      return {
        id: jobId,
        success: Boolean(status.success),
        result: status.result,
        // Anything that is not a string (including a missing field) is reported as null.
        logs: typeof raw.logs === 'string' ? raw.logs : null,
        raw
      }
    }

    throw new Error(`Timed out waiting for preview job ${jobId} to complete`)
  }

  /** Fetches the script stored at `path` as an untyped record. */
  private async getScriptByPath(workspaceId: string, path: string): Promise<Record<string, unknown>> {
    const response = await this.request(`/w/${encodeURIComponent(workspaceId)}/scripts/get/p/${path}`)
    await expectOk(response, `get script ${path}`)
    return (await response.json()) as Record<string, unknown>
  }

  /**
   * Deletes every flow, then every script, whose path is under a managed
   * prefix. Deletions run one at a time, in listing order.
   */
  private async clearManagedSharedWorkspaceAssets(workspaceId: string): Promise<void> {
    const flowPaths = await this.listFlowPaths(workspaceId)
    for (const flowPath of flowPaths) {
      if (isManagedSharedWorkspacePath(flowPath)) {
        await this.deleteFlowByPath(workspaceId, flowPath)
      }
    }

    const scriptPaths = await this.listScriptPaths(workspaceId)
    for (const scriptPath of scriptPaths) {
      if (isManagedSharedWorkspacePath(scriptPath)) {
        await this.deleteScriptByPath(workspaceId, scriptPath)
      }
    }
  }

  /** Lists the paths of all flows in the workspace. */
  private async listFlowPaths(workspaceId: string): Promise<string[]> {
    return await this.listPaths(workspaceId, 'flows')
  }

  /** Lists the paths of all scripts in the workspace. */
  private async listScriptPaths(workspaceId: string): Promise<string[]> {
    return await this.listPaths(workspaceId, 'scripts')
  }

  /** Shared implementation of the `list_paths` call for flows and scripts. */
  private async listPaths(workspaceId: string, kind: 'flows' | 'scripts'): Promise<string[]> {
    const response = await this.request(`/w/${encodeURIComponent(workspaceId)}/${kind}/list_paths`)
    await expectOk(response, `list ${kind} in workspace ${workspaceId}`)
    return (await response.json()) as string[]
  }

  /** Deletes the flow at `path`. */
  private async deleteFlowByPath(workspaceId: string, path: string): Promise<void> {
    const endpoint = `/w/${encodeURIComponent(workspaceId)}/flows/delete/${path}`
    const response = await this.request(endpoint, { method: 'DELETE' })
    await expectOk(response, `delete flow ${path}`)
  }

  /** Deletes the script at `path` (the endpoint is called with POST). */
  private async deleteScriptByPath(workspaceId: string, path: string): Promise<void> {
    const endpoint = `/w/${encodeURIComponent(workspaceId)}/scripts/delete/p/${path}`
    const response = await this.request(endpoint, { method: 'POST' })
    await expectOk(response, `delete script ${path}`)
  }

  /**
   * Polls the deployment status of a script hash until a lock is reported,
   * failing on reported lock errors and giving up after `settings.maxWaitMs`.
   */
  private async waitForScriptDeployment(
    workspaceId: string,
    path: string,
    hash: string
  ): Promise<void> {
    const deadline = Date.now() + this.settings.maxWaitMs
    const statusEndpoint = `/w/${encodeURIComponent(workspaceId)}/scripts/deployment_status/h/${encodeURIComponent(hash)}`

    while (Date.now() < deadline) {
      const response = await this.request(statusEndpoint)
      await expectOk(response, `check deployment status for script ${path}`)
      const status = (await response.json()) as ScriptDeploymentStatus

      if (status.lock != null) {
        return
      }
      if (status.lock_error_logs) {
        throw new Error(`Script deployment failed for ${path}: ${status.lock_error_logs}`)
      }
      await this.pause()
    }

    throw new Error(`Timed out waiting for script ${path} (${hash}) to deploy`)
  }

  /** Sends a POST request with `payload` serialized as a JSON body. */
  private async postJson(path: string, payload: unknown): Promise<Response> {
    return await this.request(path, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload)
    })
  }

  /** Waits for `settings.pollIntervalMs` between polling attempts. */
  private async pause(): Promise<void> {
    await new Promise((resolve) => setTimeout(resolve, this.settings.pollIntervalMs))
  }

  /**
   * Sends an authenticated request to `${baseUrl}/api${path}`. Caller-supplied
   * headers are merged after, and therefore override, the Authorization header.
   */
  private async request(path: string, init?: RequestInit): Promise<Response> {
    const token = await this.getToken()
    const headers = {
      Authorization: `Bearer ${token}`,
      ...(init?.headers ?? {})
    }
    return await fetch(`${this.settings.baseUrl}/api${path}`, { ...init, headers })
  }

  /**
   * Returns the cached login token for this `baseUrl` + `email`, logging in on
   * first use. A failed login is evicted from the cache so a later call retries.
   */
  private async getToken(): Promise<string> {
    const cacheKey = `${this.settings.baseUrl}|${this.settings.email}`

    const cached = tokenCache.get(cacheKey)
    if (cached) {
      return await cached
    }

    const pending: Promise<string> = this.login().catch((error) => {
      // Only evict our own entry; a newer login attempt may already have replaced it.
      if (tokenCache.get(cacheKey) === pending) {
        tokenCache.delete(cacheKey)
      }
      throw error
    })
    tokenCache.set(cacheKey, pending)
    return await pending
  }

  /** Logs in with the configured credentials and returns the trimmed response body as the token. */
  private async login(): Promise<string> {
    const credentials = {
      email: this.settings.email,
      password: this.settings.password
    }
    const response = await fetch(`${this.settings.baseUrl}/api/auth/login`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(credentials)
    })
    await expectOk(response, 'login for backend validation')
    return (await response.text()).trim()
  }
}
|
||||
|
||||
/**
 * Runs `body` after all earlier work queued for the same workspace id has finished.
 *
 * Each call appends itself to the per-workspace chain in `sharedWorkspaceQueue`
 * and removes the map entry again if it is still the tail when it finishes.
 * A failure in an earlier run does not prevent later runs from starting.
 *
 * @param workspaceId Key identifying the shared workspace.
 * @param body Work to run once it is this call's turn.
 * @returns Whatever `body` resolves to; rejections from `body` propagate.
 */
async function withSharedWorkspaceLock<T>(workspaceId: string, body: () => Promise<T>): Promise<T> {
  const predecessor = sharedWorkspaceQueue.get(workspaceId) ?? Promise.resolve()

  let unlock: (() => void) | undefined
  const held = new Promise<void>((resolve) => {
    unlock = resolve
  })

  // The new tail settles only after the predecessor is done AND this run has released.
  const ticket = predecessor.catch(() => undefined).then(() => held)
  sharedWorkspaceQueue.set(workspaceId, ticket)

  await predecessor.catch(() => undefined)

  try {
    return await body()
  } finally {
    unlock?.()
    const isStillTail = sharedWorkspaceQueue.get(workspaceId) === ticket
    if (isStillTail) {
      sharedWorkspaceQueue.delete(workspaceId)
    }
  }
}
|
||||
|
||||
/**
 * Builds a workspace id of the form `<prefix>-<case-slug>-a<attempt>-<random>`.
 *
 * The slug is the lower-cased case id with every run of characters outside
 * `[a-z0-9-]` replaced by `-`, leading/trailing dashes removed, cut to 30
 * characters, and replaced by `case` when nothing is left. The random part is
 * the first 8 characters of a UUID.
 */
function buildWorkspaceId(prefix: string, caseId: string, attempt: number): string {
  const lowered = caseId.toLowerCase()
  const dashed = lowered.replace(/[^a-z0-9-]+/g, '-')
  const trimmed = dashed.replace(/^-+|-+$/g, '')
  const slug = trimmed.slice(0, 30) || 'case'

  const randomPart = randomUUID().slice(0, 8)
  return [prefix, slug, `a${attempt}`, randomPart].join('-')
}
|
||||
|
||||
/**
 * Returns the name of the folder that owns a folder-scoped path.
 *
 * Folder-scoped paths have the shape `f/<folder>/<item...>`; the folder is the
 * single segment right after `f/`, and any further segments belong to the item
 * path. Joining every intermediate segment (for example `evals/sub` for
 * `f/evals/sub/item`) would yield a name that is not a folder name.
 *
 * @param path Script or flow path.
 * @returns The folder name, or `null` when the path is not folder-scoped, has
 *   no item segment after the folder, or has an empty folder segment.
 */
function extractFolderName(path: string): string | null {
  const segments = path.split('/')
  // Need at least `f`, the folder segment, and one item segment.
  if (segments[0] !== 'f' || segments.length < 3) {
    return null
  }
  const folder = segments[1]
  return folder ? folder : null
}
|
||||
|
||||
/**
 * Appends a query string built from `params` to `path`.
 *
 * Entries whose value is `undefined` are skipped; the remaining values are
 * stringified and URL-encoded. When nothing is left, `path` is returned as is.
 */
function withQuery(
  path: string,
  params: Record<string, string | number | undefined>
): string {
  const query = new URLSearchParams()
  Object.entries(params).forEach(([key, value]) => {
    if (value !== undefined) {
      query.set(key, String(value))
    }
  })

  const encoded = query.toString()
  if (!encoded) {
    return path
  }
  return `${path}?${encoded}`
}
|
||||
|
||||
/**
 * Resolves when `response.ok` is true and throws otherwise.
 *
 * @param response Response to check; its body is consumed only on failure.
 * @param context Short description of the attempted operation, used as the message prefix.
 * @throws Error `<context> failed: <status> <statusText> - <body>`.
 */
async function expectOk(response: Response, context: string): Promise<void> {
  if (!response.ok) {
    const { status, statusText } = response
    const detail = await response.text()
    throw new Error(`${context} failed: ${status} ${statusText} - ${detail}`)
  }
}
|
||||
|
||||
/**
 * Reads a required, non-empty string field from an untyped record.
 *
 * @param value Record to read from.
 * @param field Name of the field.
 * @param context Description of the record, used in the error message.
 * @returns The field value.
 * @throws Error `<context> is missing string field <field>` when the field is
 *   absent, not a string, or empty.
 */
function readStringField(
  value: Record<string, unknown>,
  field: string,
  context: string
): string {
  const raw = value[field]
  const isUsable = typeof raw === 'string' && raw.length > 0
  if (!isUsable) {
    throw new Error(`${context} is missing string field ${field}`)
  }
  return raw as string
}
|
||||
|
||||
/**
 * Tells whether an error body describes an "item already exists" situation.
 *
 * The check is case-insensitive and looks for either `already exists` or
 * `path conflict` anywhere in the message.
 */
function isConflictMessage(message: string): boolean {
  const haystack = message.toLowerCase()
  const markers = ['already exists', 'path conflict']
  return markers.some((marker) => haystack.includes(marker))
}
|
||||
|
||||
/**
 * Tells whether `path` starts with one of `managedSharedWorkspacePrefixes`,
 * i.e. whether it may be deleted when a shared workspace is cleaned.
 */
function isManagedSharedWorkspacePath(path: string): boolean {
  for (const prefix of managedSharedWorkspacePrefixes) {
    if (path.startsWith(prefix)) {
      return true
    }
  }
  return false
}
|
||||
93
ai_evals/adapters/frontend/benchmarkRunner.ts
Normal file
93
ai_evals/adapters/frontend/benchmarkRunner.ts
Normal file
@@ -0,0 +1,93 @@
|
||||
import { loadSelectedCases } from "../../core/cases";
|
||||
import { resolveBackendValidationSettings } from "../../core/backendValidation";
|
||||
import {
|
||||
formatRunModelLabel,
|
||||
getFrontendEvalModel,
|
||||
resolveEvalModel,
|
||||
} from "../../core/models";
|
||||
import { buildRunResult } from "../../core/results";
|
||||
import { runSuite } from "../../core/runSuite";
|
||||
import type { BenchmarkRunResult, ModeRunner } from "../../core/types";
|
||||
import { emitFrontendBenchmarkProgress } from "./progress";
|
||||
import { createAppModeRunner } from "../../modes/app";
|
||||
import { createFlowModeRunner } from "../../modes/flow";
|
||||
import { createScriptModeRunner } from "../../modes/script";
|
||||
import { DEFAULT_JUDGE_MODEL } from "../../core/judge";
|
||||
|
||||
/** Benchmark modes accepted by the frontend benchmark runner. */
export type FrontendBenchmarkMode =
  | "flow"
  | "app"
  | "script";
|
||||
|
||||
/**
 * Runs a frontend benchmark configured entirely through environment variables.
 *
 * Variables read (all prefixed `WMILL_FRONTEND_AI_EVAL_`):
 * - `MODE`: required; one of the supported benchmark modes.
 * - `CASE_IDS`: optional JSON string array of case ids passed to the case loader.
 * - `RUNS`: required positive integer.
 * - `PROGRESS`: `"1"` forwards progress events to `emitFrontendBenchmarkProgress`.
 * - `VERBOSE`: `"1"` enables verbose output and forces a concurrency of 1.
 * - `MODEL`: passed to `resolveEvalModel` together with the mode.
 * - `BACKEND_VALIDATION`: passed to `resolveBackendValidationSettings`.
 *
 * @returns The aggregated result built from every case result of the suite.
 * @throws Error when the mode, case ids or run count are invalid.
 */
export async function runFrontendBenchmarkFromEnv(): Promise<BenchmarkRunResult> {
  const env = process.env;

  // Parse and validate the configuration first so bad input fails before any work starts.
  const mode = parseMode(env.WMILL_FRONTEND_AI_EVAL_MODE);
  const caseIds = parseOptionalJsonStringArray(env.WMILL_FRONTEND_AI_EVAL_CASE_IDS);
  const runs = parsePositiveInteger(env.WMILL_FRONTEND_AI_EVAL_RUNS, "WMILL_FRONTEND_AI_EVAL_RUNS");
  const progressEnabled = env.WMILL_FRONTEND_AI_EVAL_PROGRESS === "1";
  const verbose = env.WMILL_FRONTEND_AI_EVAL_VERBOSE === "1";
  const model = resolveEvalModel(mode, env.WMILL_FRONTEND_AI_EVAL_MODEL);
  const backendValidation = resolveBackendValidationSettings({
    evalMode: mode,
    requestedMode: env.WMILL_FRONTEND_AI_EVAL_BACKEND_VALIDATION,
  });

  const selectedCases = await loadSelectedCases(mode, caseIds);
  const modeRunner = getModeRunner(mode, getFrontendEvalModel(model), backendValidation);
  const runModel = formatRunModelLabel(mode, model);
  const judgeModel = DEFAULT_JUDGE_MODEL;

  const caseResults = await runSuite({
    modeRunner,
    cases: selectedCases,
    runs,
    runModel,
    judgeModel,
    // Verbose runs are executed one at a time; otherwise the suite picks its own default.
    concurrency: verbose ? 1 : undefined,
    verbose,
    onProgress: progressEnabled ? (event) => emitFrontendBenchmarkProgress(event) : undefined,
  });

  return buildRunResult({ mode, runs, runModel, judgeModel, caseResults });
}
|
||||
|
||||
/**
 * Creates the mode runner that matches the requested benchmark mode.
 *
 * @param mode - Benchmark mode to build a runner for.
 * @param model - Frontend eval model handed to the runner factory.
 * @param backendValidation - Backend validation settings; used by the flow and
 *   script runners, not by the app runner.
 * @returns The runner created by the factory for `mode`.
 * @throws Error when `mode` is not one of the supported modes at runtime.
 */
function getModeRunner(
  mode: FrontendBenchmarkMode,
  model: ReturnType<typeof getFrontendEvalModel>,
  backendValidation: ReturnType<typeof resolveBackendValidationSettings>
): ModeRunner<any, any, any> {
  switch (mode) {
    case "flow":
      return createFlowModeRunner(model, backendValidation);
    case "app":
      return createAppModeRunner(model);
    case "script":
      return createScriptModeRunner(model, backendValidation);
    default: {
      // Compile-time exhaustiveness check: adding a mode without a case fails to type-check.
      // At runtime this fails loudly instead of silently returning undefined.
      const unsupported: never = mode;
      throw new Error(`Unsupported frontend benchmark mode: ${String(unsupported)}`);
    }
  }
}
|
||||
|
||||
/**
 * Validates a raw mode string.
 *
 * @param value - Raw value, typically from an environment variable.
 * @returns The value unchanged when it is `"flow"`, `"app"` or `"script"`.
 * @throws Error for any other value, including `undefined`.
 */
function parseMode(value: string | undefined): FrontendBenchmarkMode {
  switch (value) {
    case "flow":
    case "app":
    case "script":
      return value;
    default:
      throw new Error(`Unsupported frontend benchmark mode: ${String(value)}`);
  }
}
|
||||
|
||||
/**
 * Parses the optional `WMILL_FRONTEND_AI_EVAL_CASE_IDS` value.
 *
 * @param value - Raw value; `undefined` or an empty string means "not set".
 * @returns The parsed string array, or an empty array when the value is not set.
 * @throws Error when the value is not valid JSON or is not an array made up
 *   only of strings. Both failures name the environment variable.
 */
function parseOptionalJsonStringArray(value: string | undefined): string[] {
  if (!value) {
    return [];
  }

  const invalidMessage = "WMILL_FRONTEND_AI_EVAL_CASE_IDS must be a JSON string array";

  let parsed: unknown;
  try {
    parsed = JSON.parse(value);
  } catch (error: unknown) {
    // Report malformed JSON with the variable name rather than a bare SyntaxError.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`${invalidMessage}: ${reason}`);
  }

  if (!Array.isArray(parsed) || parsed.some((entry) => typeof entry !== "string")) {
    throw new Error(invalidMessage);
  }
  return parsed;
}
|
||||
|
||||
/**
 * Converts a raw string to a positive integer.
 *
 * @param value - Raw value; converted with `Number(...)`.
 * @param envName - Name used in the error message.
 * @returns The numeric value when it is an integer greater than zero.
 * @throws Error when the value is missing, not an integer, or not positive.
 */
function parsePositiveInteger(value: string | undefined, envName: string): number {
  const candidate = Number(value);
  const isPositiveInteger = Number.isInteger(candidate) && candidate > 0;
  if (isPositiveInteger) {
    return candidate;
  }
  throw new Error(`${envName} must be a positive integer`);
}
|
||||
92
ai_evals/adapters/frontend/core/app/appEvalRunner.ts
Normal file
92
ai_evals/adapters/frontend/core/app/appEvalRunner.ts
Normal file
@@ -0,0 +1,92 @@
|
||||
import { mkdtemp } from 'fs/promises'
|
||||
import { tmpdir } from 'os'
|
||||
import { join } from 'path'
|
||||
import type {
|
||||
AppFiles,
|
||||
BackendRunnable,
|
||||
AppAIChatHelpers
|
||||
} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
|
||||
import {
|
||||
getAppTools,
|
||||
prepareAppSystemMessage,
|
||||
prepareAppUserMessage
|
||||
} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
|
||||
import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
|
||||
import { createAppFileHelpers } from './fileHelpers'
|
||||
import { runEval } from '../shared'
|
||||
import type { AIProvider } from '$lib/gen/types.gen'
|
||||
import type { ModeRunContext } from '../../../../core/types'
|
||||
import type { TokenUsage } from '../shared/types'
|
||||
|
||||
/** Outcome of a single app-mode evaluation run. */
export interface AppEvalResult {
  /** Whether the run was reported as successful. */
  success: boolean
  /** Error message of the run, if any. */
  error?: string
  /** App files produced by the run. */
  files: AppFiles
  /** Number of assistant messages (iterations) in the run. */
  assistantMessageCount: number
  /** Number of tool calls made during the run. */
  toolCallCount: number
  /** Names of the tools that were called. */
  toolsUsed: string[]
  /** Token usage reported for the run. */
  tokenUsage: TokenUsage
}
|
||||
|
||||
/** Optional settings for an app-mode evaluation run. */
export interface AppEvalOptions {
  /** Frontend files present before the run, keyed by path. */
  initialFrontend?: Record<string, string>
  /** Backend runnables present before the run, keyed by runnable key. */
  initialBackend?: Record<string, BackendRunnable>
  /** Model identifier to use. */
  model?: string
  /** AI provider to use. */
  provider?: AIProvider
  /** Upper bound on the number of iterations. */
  maxIterations?: number
  /** Directory used as the on-disk workspace for the run. */
  workspaceRoot?: string
  /** Callbacks invoked while assistant messages are streamed. */
  runContext?: ModeRunContext
}
|
||||
|
||||
/**
 * Runs one app-mode evaluation against an on-disk workspace.
 *
 * Uses `options.workspaceRoot` as the workspace, or a fresh temporary
 * directory when none is given. The workspace is seeded with the initial
 * frontend files and backend runnables, the evaluation runs with the
 * production app tools and prompts, and the helpers' `cleanup` is always
 * awaited afterwards, whether the run succeeded or threw.
 *
 * @param userPrompt - Prompt given to the assistant.
 * @param apiKey - API key forwarded to the evaluation runner.
 * @param options - Optional initial state, model settings and stream callbacks.
 * @returns The final app files together with run statistics.
 */
export async function runAppEval(
  userPrompt: string,
  apiKey: string,
  options?: AppEvalOptions
): Promise<AppEvalResult> {
  const workspaceRoot =
    options?.workspaceRoot ?? (await mkdtemp(join(tmpdir(), 'wmill-frontend-app-benchmark-')))
  const fileHelpers = await createAppFileHelpers(
    options?.initialFrontend ?? {},
    options?.initialBackend ?? {},
    workspaceRoot
  )
  const { helpers } = fileHelpers

  try {
    const systemMessage = prepareAppSystemMessage()
    const tools = getAppTools() as ProductionTool<AppAIChatHelpers>[]
    const model = options?.model ?? 'claude-haiku-4-5-20251001'
    const userMessage = prepareAppUserMessage(userPrompt, helpers.getSelectedContext())
    const runContext = options?.runContext

    const { output, success, error, iterations, toolCallsCount, toolsCalled, tokenUsage } =
      await runEval({
        userPrompt,
        systemMessage,
        userMessage,
        tools,
        helpers,
        apiKey,
        getOutput: fileHelpers.getFiles,
        onAssistantMessageStart: runContext?.onAssistantMessageStart,
        onAssistantToken: runContext?.onAssistantChunk,
        onAssistantMessageEnd: runContext?.onAssistantMessageEnd,
        options: {
          maxIterations: options?.maxIterations,
          model,
          workspace: workspaceRoot,
          provider: options?.provider
        }
      })

    return {
      files: output,
      success,
      error,
      assistantMessageCount: iterations,
      toolCallCount: toolCallsCount,
      toolsUsed: toolsCalled,
      tokenUsage
    }
  } finally {
    await fileHelpers.cleanup()
  }
}
|
||||
@@ -1,4 +1,8 @@
|
||||
import type { AppFiles, BackendRunnable, InlineScript } from '../../app/core'
|
||||
import type {
|
||||
AppFiles,
|
||||
BackendRunnable,
|
||||
InlineScript
|
||||
} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
|
||||
|
||||
/**
|
||||
* Backend runnable metadata stored in meta.json files.
|
||||
255
ai_evals/adapters/frontend/core/app/fileHelpers.ts
Normal file
255
ai_evals/adapters/frontend/core/app/fileHelpers.ts
Normal file
@@ -0,0 +1,255 @@
|
||||
import { mkdir, rm, writeFile } from 'fs/promises'
|
||||
import { dirname, join } from 'path'
|
||||
import type {
|
||||
AppAIChatHelpers,
|
||||
AppFiles,
|
||||
BackendRunnable,
|
||||
DataTableSchema,
|
||||
LintResult,
|
||||
SelectedContext
|
||||
} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
|
||||
|
||||
/**
 * Builds a lint result with zero errors and warnings and empty
 * frontend/backend buckets. Every call returns fresh objects.
 */
function createEmptyLintResult(): LintResult {
  const emptyBuckets = () => ({ frontend: {}, backend: {} })
  return {
    errorCount: 0,
    warningCount: 0,
    errors: emptyBuckets(),
    warnings: emptyBuckets()
  }
}
|
||||
|
||||
/**
 * Writes one frontend file under `<workspaceRoot>/frontend`.
 *
 * Does nothing when no workspace root is given. A single leading `/` in `path`
 * is dropped so the file lands inside the `frontend` directory; missing parent
 * directories are created.
 */
async function writeFrontendFile(
  workspaceRoot: string | undefined,
  path: string,
  content: string
): Promise<void> {
  if (workspaceRoot) {
    const target = join(workspaceRoot, 'frontend', path.replace(/^\//, ''))
    await mkdir(dirname(target), { recursive: true })
    await writeFile(target, content, 'utf8')
  }
}
|
||||
|
||||
/**
 * Removes one frontend file from `<workspaceRoot>/frontend`.
 *
 * Does nothing when no workspace root is given. A single leading `/` in `path`
 * is dropped, and a file that does not exist is ignored (`force: true`).
 */
async function removeFrontendFile(workspaceRoot: string | undefined, path: string): Promise<void> {
  if (workspaceRoot) {
    const target = join(workspaceRoot, 'frontend', path.replace(/^\//, ''))
    await rm(target, { force: true })
  }
}
|
||||
|
||||
/**
 * Writes one backend runnable to `<workspaceRoot>/backend/<key>`.
 *
 * Does nothing when no workspace root is given. An inline runnable with a
 * script gets its code written to `main.py` (language `python3`) or `main.ts`
 * (any other language) and its language recorded in `meta.json`. Any other
 * runnable gets only its type, plus its path when set, recorded in `meta.json`.
 */
async function writeBackendRunnable(
  workspaceRoot: string | undefined,
  key: string,
  runnable: BackendRunnable
): Promise<void> {
  if (!workspaceRoot) {
    return
  }

  const targetDir = join(workspaceRoot, 'backend', key)
  await mkdir(targetDir, { recursive: true })

  const inlineScript = runnable.type === 'inline' ? runnable.inlineScript : undefined
  let meta: { name: string; language?: string; type?: string; path?: string }

  if (inlineScript) {
    meta = { name: runnable.name, language: inlineScript.language }
    const scriptFileName = inlineScript.language === 'python3' ? 'main.py' : 'main.ts'
    await writeFile(join(targetDir, scriptFileName), inlineScript.content, 'utf8')
  } else {
    meta = { name: runnable.name, type: runnable.type }
    if (runnable.path) {
      meta.path = runnable.path
    }
  }

  await writeFile(join(targetDir, 'meta.json'), JSON.stringify(meta, null, 2) + '\n', 'utf8')
}
|
||||
|
||||
/**
 * Removes the directory `<workspaceRoot>/backend/<key>` recursively.
 *
 * Does nothing when no workspace root is given; a directory that does not
 * exist is ignored (`force: true`).
 */
async function removeBackendRunnable(workspaceRoot: string | undefined, key: string): Promise<void> {
  if (workspaceRoot) {
    const runnableDir = join(workspaceRoot, 'backend', key)
    await rm(runnableDir, { recursive: true, force: true })
  }
}
|
||||
|
||||
/**
 * Writes the datatable list to `<workspaceRoot>/datatables.json` as
 * two-space-indented JSON followed by a newline. Does nothing when no
 * workspace root is given.
 */
async function persistDatatables(
  workspaceRoot: string | undefined,
  datatables: DataTableSchema[]
): Promise<void> {
  if (workspaceRoot) {
    const serialized = JSON.stringify(datatables, null, 2) + '\n'
    await writeFile(join(workspaceRoot, 'datatables.json'), serialized, 'utf8')
  }
}
|
||||
|
||||
/**
 * Creates in-memory app chat helpers that are mirrored to an on-disk workspace.
 *
 * The in-memory frontend/backend maps are the source of truth; every mutation
 * is also written under `workspaceRoot` when one is given. Disk work is
 * serialized through a single queue so that writes, deletes and snapshot
 * re-syncs reach the disk in the order they were requested, and so that
 * `cleanup` never removes the workspace while a write is still pending.
 *
 * @param initialFrontend - Frontend files to seed, keyed by path.
 * @param initialBackend - Backend runnables to seed, keyed by runnable key.
 * @param workspaceRoot - Directory to mirror into; nothing is written when omitted.
 * @returns The chat helpers, accessors returning shallow copies of the current
 *   state, a `cleanup` function that removes the workspace, and the workspace
 *   directory (`null` when none was given).
 */
export async function createAppFileHelpers(
  initialFrontend: Record<string, string> = {},
  initialBackend: Record<string, BackendRunnable> = {},
  workspaceRoot?: string
): Promise<{
  helpers: AppAIChatHelpers
  getFiles: () => AppFiles
  getFrontend: () => Record<string, string>
  getBackend: () => Record<string, BackendRunnable>
  cleanup: () => Promise<void>
  workspaceDir: string | null
}> {
  let frontend = { ...initialFrontend }
  let backend = { ...initialBackend }
  let snapshotId = 0
  const snapshots = new Map<
    number,
    { frontend: Record<string, string>; backend: Record<string, BackendRunnable> }
  >()
  const datatables: DataTableSchema[] = []

  // Tail of the disk-work queue. It never rejects, so one failed task does not
  // block the tasks queued after it.
  let diskQueue: Promise<void> = Promise.resolve()

  /** Queues a disk task behind all earlier ones; the returned promise reports its outcome. */
  function enqueueDiskWork(task: () => Promise<void>): Promise<void> {
    const run = diskQueue.then(task)
    diskQueue = run.catch(() => undefined)
    return run
  }

  /** Queues a disk task for a synchronous helper; failures are logged instead of left unhandled. */
  function mirrorInBackground(description: string, task: () => Promise<void>): void {
    enqueueDiskWork(task).catch((error: unknown) => {
      console.warn(`Failed to ${description} in benchmark workspace:`, error)
    })
  }

  /** Returns shallow copies of the current frontend and backend maps. */
  function currentFiles(): AppFiles {
    return {
      frontend: { ...frontend },
      backend: { ...backend }
    }
  }

  /** Records a table under its datatable entry, creating the entry or schema when missing. */
  function registerTable(datatableName: string, schemaName: string, tableName: string): void {
    const existing = datatables.find((entry) => entry.datatable_name === datatableName)
    if (existing) {
      existing.schemas[schemaName] ??= {}
      existing.schemas[schemaName][tableName] ??= {}
    } else {
      datatables.push({
        datatable_name: datatableName,
        schemas: {
          [schemaName]: {
            [tableName]: {}
          }
        }
      })
    }
  }

  // Seed the workspace with the initial state before any helper can be called.
  for (const [path, content] of Object.entries(frontend)) {
    await writeFrontendFile(workspaceRoot, path, content)
  }
  for (const [key, runnable] of Object.entries(backend)) {
    await writeBackendRunnable(workspaceRoot, key, runnable)
  }
  await persistDatatables(workspaceRoot, datatables)

  const helpers: AppAIChatHelpers = {
    listFrontendFiles: () => Object.keys(frontend),
    getFrontendFile: (path: string) => frontend[path],
    getFrontendFiles: () => ({ ...frontend }),
    setFrontendFile: (path: string, content: string) => {
      frontend[path] = content
      mirrorInBackground(`write frontend file ${path}`, () =>
        writeFrontendFile(workspaceRoot, path, content)
      )
      return createEmptyLintResult()
    },
    deleteFrontendFile: (path: string) => {
      delete frontend[path]
      mirrorInBackground(`remove frontend file ${path}`, () =>
        removeFrontendFile(workspaceRoot, path)
      )
    },
    listBackendRunnables: () =>
      Object.entries(backend).map(([key, runnable]) => ({
        key,
        name: runnable.name
      })),
    getBackendRunnable: (key: string) => backend[key],
    getBackendRunnables: () => ({ ...backend }),
    setBackendRunnable: async (key: string, runnable: BackendRunnable) => {
      backend[key] = runnable
      // Awaited so that a failed write still rejects the caller's promise.
      await enqueueDiskWork(() => writeBackendRunnable(workspaceRoot, key, runnable))
      return createEmptyLintResult()
    },
    deleteBackendRunnable: (key: string) => {
      delete backend[key]
      mirrorInBackground(`remove backend runnable ${key}`, () =>
        removeBackendRunnable(workspaceRoot, key)
      )
    },
    getFiles: currentFiles,
    getSelectedContext: (): SelectedContext => ({ type: 'none' }),
    snapshot: () => {
      const id = ++snapshotId
      snapshots.set(id, currentFiles())
      return id
    },
    revertToSnapshot: (id: number) => {
      const snapshot = snapshots.get(id)
      if (!snapshot) {
        return
      }
      frontend = { ...snapshot.frontend }
      backend = { ...snapshot.backend }
      mirrorInBackground('re-sync workspace after snapshot revert', syncWorkspace)
    },
    lint: () => createEmptyLintResult(),
    getDatatables: async () => structuredClone(datatables),
    getAvailableDatatableNames: () => datatables.map((datatable) => datatable.datatable_name),
    // The SQL itself is not executed; the call is echoed back as a successful result.
    execDatatableSql: async (
      datatableName: string,
      sql: string,
      newTable?: { schema: string; name: string }
    ) => {
      if (newTable) {
        // Merge into an existing datatable entry instead of adding a duplicate one.
        registerTable(datatableName, newTable.schema, newTable.name)
        await enqueueDiskWork(() => persistDatatables(workspaceRoot, datatables))
      }
      return {
        success: true,
        result: [
          {
            datatableName,
            sql
          }
        ]
      }
    },
    addTableToWhitelist: (datatableName: string, schemaName: string, tableName: string) => {
      registerTable(datatableName, schemaName, tableName)
      mirrorInBackground('persist datatables', () => persistDatatables(workspaceRoot, datatables))
    }
  }

  /** Rewrites the frontend and backend directories and datatables.json from the in-memory state. */
  async function syncWorkspace(): Promise<void> {
    if (!workspaceRoot) {
      return
    }
    await rm(join(workspaceRoot, 'frontend'), { recursive: true, force: true })
    await rm(join(workspaceRoot, 'backend'), { recursive: true, force: true })
    for (const [path, content] of Object.entries(frontend)) {
      await writeFrontendFile(workspaceRoot, path, content)
    }
    for (const [key, runnable] of Object.entries(backend)) {
      await writeBackendRunnable(workspaceRoot, key, runnable)
    }
    await persistDatatables(workspaceRoot, datatables)
  }

  return {
    helpers,
    getFiles: currentFiles,
    getFrontend: () => ({ ...frontend }),
    getBackend: () => ({ ...backend }),
    cleanup: async () => {
      // Let pending disk work finish first so it cannot re-create files after removal.
      await diskQueue
      if (workspaceRoot) {
        await rm(workspaceRoot, { recursive: true, force: true })
      }
    },
    workspaceDir: workspaceRoot ?? null
  }
}
|
||||
169
ai_evals/adapters/frontend/core/flow/fileHelpers.ts
Normal file
169
ai_evals/adapters/frontend/core/flow/fileHelpers.ts
Normal file
@@ -0,0 +1,169 @@
|
||||
import { mkdir, rm, writeFile } from 'fs/promises'
|
||||
import { dirname, join } from 'path'
|
||||
import type { FlowModule, InputTransform } from '../../../../../frontend/src/lib/gen'
|
||||
import type { ExtendedOpenFlow } from '../../../../../frontend/src/lib/components/flows/types'
|
||||
import type { FlowAIChatHelpers } from '../../../../../frontend/src/lib/components/copilot/chat/flow/core'
|
||||
import type { ScriptLintResult } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
|
||||
import { getSubModules } from '../../../../../frontend/src/lib/components/flows/flowExplorer'
|
||||
import {
|
||||
createInlineScriptSession
|
||||
} from '../../../../../frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils'
|
||||
import {
|
||||
applyFlowJsonUpdate,
|
||||
getFlowModuleById,
|
||||
updateRawScriptModuleContent
|
||||
} from '../../../../../frontend/src/lib/components/copilot/chat/flow/helperUtils'
|
||||
import {
|
||||
registerBenchmarkWorkspace,
|
||||
registerBenchmarkWorkspaceRunnables,
|
||||
unregisterBenchmarkWorkspaceRunnables,
|
||||
createBenchmarkCompletedJob,
|
||||
type BenchmarkWorkspaceFlow,
|
||||
type BenchmarkWorkspaceScript
|
||||
} from '../../mockBackend'
|
||||
|
||||
/** Lint result with no errors and no warnings. */
const EMPTY_SCRIPT_LINT_RESULT: ScriptLintResult = { errorCount: 0, warningCount: 0, errors: [], warnings: [] }
|
||||
|
||||
/** Workspace runnables registered with the mock backend for a flow benchmark run. */
export interface FlowWorkspaceFixtures {
  /** Flows available in the benchmark workspace. */
  flows?: BenchmarkWorkspaceFlow[]
  /** Scripts available in the benchmark workspace. */
  scripts?: BenchmarkWorkspaceScript[]
}
|
||||
|
||||
/**
 * Creates in-memory flow chat helpers that are mirrored to `<workspaceRoot>/flow.json`.
 *
 * The flow starts from deep copies of the given modules. When a workspace root
 * is given, the flow is written to disk right away and after every code or
 * flow-JSON update, the workspace is registered with the mock backend, and the
 * optional fixtures are registered as workspace runnables.
 *
 * @param initialModules - Modules the flow starts with.
 * @param initialSchema - Flow input schema; an empty object schema is used when omitted.
 * @param initialPreprocessorModule - Optional preprocessor module.
 * @param initialFailureModule - Optional failure module.
 * @param workspaceRoot - Directory to mirror into; nothing is written when omitted.
 * @param workspaceFixtures - Runnables to register for the workspace.
 * @returns The chat helpers, accessors for the live flow and its modules, a
 *   `cleanup` function, and the workspace directory (`null` when none was given).
 */
export async function createFlowFileHelpers(
  initialModules: FlowModule[] = [],
  initialSchema?: Record<string, any>,
  initialPreprocessorModule?: FlowModule,
  initialFailureModule?: FlowModule,
  workspaceRoot?: string,
  workspaceFixtures?: FlowWorkspaceFixtures
): Promise<{
  helpers: FlowAIChatHelpers
  getFlow: () => ExtendedOpenFlow
  getModules: () => FlowModule[]
  cleanup: () => Promise<void>
  workspaceDir: string | null
}> {
  const flow: ExtendedOpenFlow = {
    value: {
      modules: structuredClone(initialModules),
      preprocessor_module: structuredClone(initialPreprocessorModule),
      failure_module: structuredClone(initialFailureModule)
    },
    summary: '',
    schema: initialSchema ?? {
      $schema: 'https://json-schema.org/draft/2020-12/schema',
      properties: {},
      required: [],
      type: 'object'
    }
  }
  const inlineScriptSession = createInlineScriptSession()
  const flowFilePath = workspaceRoot ? join(workspaceRoot, 'flow.json') : null

  /** Writes the current flow to flow.json; a no-op without a workspace. */
  const persistFlow = async (): Promise<void> => {
    if (flowFilePath) {
      await mkdir(dirname(flowFilePath), { recursive: true })
      await writeFile(flowFilePath, JSON.stringify(flow, null, 2) + '\n', 'utf8')
    }
  }

  /** Summarizes a mock test run: the requested args and the ids of the flow's modules. */
  const describeTestRun = (args?: Record<string, any>) => ({
    requestedArgs: args ?? {},
    modules: flow.value.modules.map((module) => module.id),
    preprocessor_module: flow.value.preprocessor_module?.id ?? null,
    failure_module: flow.value.failure_module?.id ?? null
  })

  await persistFlow()

  if (workspaceRoot) {
    registerBenchmarkWorkspace(workspaceRoot)
    if (workspaceFixtures) {
      registerBenchmarkWorkspaceRunnables(workspaceRoot, workspaceFixtures)
    }
  }

  const helpers: FlowAIChatHelpers = {
    getFlowAndSelectedId: () => ({ flow, selectedId: '' }),
    getModules: (id?: string) => {
      if (!id) {
        return flow.value.modules
      }
      const parent = getFlowModuleById(flow, id)
      return parent ? getSubModules(parent).flat() : []
    },
    inlineScriptSession,
    // Snapshots and the review/selection UI hooks below are no-ops in this helper.
    setSnapshot: () => {},
    revertToSnapshot: () => {},
    setCode: async (id: string, code: string) => {
      updateRawScriptModuleContent(flow, id, code)
      inlineScriptSession.set(id, code)
      await persistFlow()
    },
    setFlowJson: async (
      modules: FlowModule[] | undefined,
      schema: Record<string, any> | undefined,
      preprocessorModule: FlowModule | null | undefined,
      failureModule: FlowModule | null | undefined
    ) => {
      applyFlowJsonUpdate(flow, inlineScriptSession, {
        modules,
        schema,
        preprocessorModule,
        failureModule
      })
      await persistFlow()
    },
    getFlowInputsSchema: async () => flow.schema ?? {},
    updateExprsToSet: (_id: string, _inputTransforms: Record<string, InputTransform>) => {},
    acceptAllModuleActions: () => {},
    rejectAllModuleActions: () => {},
    hasPendingChanges: () => false,
    selectStep: (_id: string) => {},
    // The flow is not executed: the run is recorded in test-run.json and a mocked job is returned.
    testFlow: async (args?: Record<string, any>) => {
      if (workspaceRoot) {
        await writeFile(
          join(workspaceRoot, 'test-run.json'),
          JSON.stringify(describeTestRun(args), null, 2) + '\n',
          'utf8'
        )
      }
      return createBenchmarkCompletedJob({
        workspace: workspaceRoot ?? 'benchmark',
        jobKind: 'flowpreview',
        result: { ...describeTestRun(args), mocked: true },
        logs: 'Mock benchmark flow test run completed successfully.'
      })
    },
    getLintErrors: async () => EMPTY_SCRIPT_LINT_RESULT
  }

  return {
    helpers,
    getFlow: () => flow,
    getModules: () => flow.value.modules,
    cleanup: async () => {
      if (workspaceRoot) {
        unregisterBenchmarkWorkspaceRunnables(workspaceRoot)
        await rm(workspaceRoot, { recursive: true, force: true })
      }
    },
    workspaceDir: workspaceRoot ?? null
  }
}
|
||||
107
ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts
Normal file
107
ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts
Normal file
@@ -0,0 +1,107 @@
|
||||
import { mkdtemp } from 'fs/promises'
|
||||
import { tmpdir } from 'os'
|
||||
import { join } from 'path'
|
||||
import type { FlowModule } from '$lib/gen'
|
||||
import type { AIProvider } from '$lib/gen/types.gen'
|
||||
import type { ExtendedOpenFlow } from '$lib/components/flows/types'
|
||||
import {
|
||||
flowTools,
|
||||
prepareFlowSystemMessage,
|
||||
prepareFlowUserMessage,
|
||||
type FlowAIChatHelpers
|
||||
} from '../../../../../frontend/src/lib/components/copilot/chat/flow/core'
|
||||
import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
|
||||
import { createFlowFileHelpers, type FlowWorkspaceFixtures } from './fileHelpers'
|
||||
import { runEval } from '../shared'
|
||||
import type { ModeRunContext } from '../../../../core/types'
|
||||
import type { TokenUsage } from '../shared/types'
|
||||
|
||||
/** Initial flow state used to seed a flow evaluation. */
export interface FlowFixture {
  /** Input schema of the flow. */
  schema?: Record<string, unknown>
  /** Flow definition. */
  value?: {
    /** Modules of the flow. */
    modules?: FlowModule[]
    /** Optional preprocessor module. */
    preprocessor_module?: FlowModule
    /** Optional failure module. */
    failure_module?: FlowModule
  }
}
|
||||
|
||||
/** Outcome of a single flow-mode evaluation run. */
export interface FlowEvalResult {
  /** Whether the run was reported as successful. */
  success: boolean
  /** Error message of the run, if any. */
  error?: string
  /** Flow produced by the run. */
  flow: ExtendedOpenFlow
  /** Number of assistant messages (iterations) in the run. */
  assistantMessageCount: number
  /** Number of tool calls made during the run. */
  toolCallCount: number
  /** Names of the tools that were called. */
  toolsUsed: string[]
  /** Token usage reported for the run. */
  tokenUsage: TokenUsage
}
|
||||
|
||||
/** Optional settings for a flow-mode evaluation run. */
export interface FlowEvalOptions {
  /** Flow state present before the run. */
  initialFlow?: FlowFixture
  /** Workspace runnables made available to the run. */
  workspaceFixtures?: FlowWorkspaceFixtures
  /** Model identifier to use. */
  model?: string
  /** AI provider to use. */
  provider?: AIProvider
  /** Upper bound on the number of iterations. */
  maxIterations?: number
  /** Directory used as the on-disk workspace for the run. */
  workspaceRoot?: string
  /** Callbacks invoked while assistant messages are streamed. */
  runContext?: ModeRunContext
}
|
||||
|
||||
/**
 * Runs one flow-mode evaluation against an on-disk workspace.
 *
 * Uses `options.workspaceRoot` as the workspace, or a fresh temporary
 * directory when none is given. The flow helpers are seeded from the initial
 * flow and fixtures, the evaluation runs with the production flow tools and
 * prompts, and the helpers' `cleanup` is always awaited afterwards, whether
 * the run succeeded or threw.
 *
 * @param userPrompt - Prompt given to the assistant.
 * @param apiKey - API key forwarded to the evaluation runner.
 * @param options - Optional initial state, model settings and stream callbacks.
 * @returns The final flow together with run statistics.
 */
export async function runFlowEval(
  userPrompt: string,
  apiKey: string,
  options?: FlowEvalOptions
): Promise<FlowEvalResult> {
  const workspaceRoot =
    options?.workspaceRoot ?? (await mkdtemp(join(tmpdir(), 'wmill-frontend-flow-benchmark-')))
  const initialFlow = options?.initialFlow
  const flowHelpers = await createFlowFileHelpers(
    initialFlow?.value?.modules ?? [],
    initialFlow?.schema,
    initialFlow?.value?.preprocessor_module,
    initialFlow?.value?.failure_module,
    workspaceRoot,
    options?.workspaceFixtures
  )
  const { helpers } = flowHelpers

  try {
    const systemMessage = prepareFlowSystemMessage()
    const tools = flowTools as ProductionTool<FlowAIChatHelpers>[]
    const model = options?.model ?? 'claude-haiku-4-5-20251001'
    const userMessage = prepareFlowUserMessage(
      userPrompt,
      helpers.getFlowAndSelectedId(),
      [],
      helpers.inlineScriptSession
    )
    const runContext = options?.runContext

    const { output, success, error, iterations, toolCallsCount, toolsCalled, tokenUsage } =
      await runEval({
        userPrompt,
        systemMessage,
        userMessage,
        tools,
        helpers,
        apiKey,
        getOutput: flowHelpers.getFlow,
        onAssistantMessageStart: runContext?.onAssistantMessageStart,
        onAssistantToken: runContext?.onAssistantChunk,
        onAssistantMessageEnd: runContext?.onAssistantMessageEnd,
        options: {
          maxIterations: options?.maxIterations,
          model,
          workspace: workspaceRoot,
          provider: options?.provider
        }
      })

    return {
      flow: output,
      success,
      error,
      assistantMessageCount: iterations,
      toolCallCount: toolCallsCount,
      toolsUsed: toolsCalled,
      tokenUsage
    }
  } finally {
    await flowHelpers.cleanup()
  }
}
|
||||
73
ai_evals/adapters/frontend/core/script/fileHelpers.ts
Normal file
73
ai_evals/adapters/frontend/core/script/fileHelpers.ts
Normal file
@@ -0,0 +1,73 @@
|
||||
import { mkdir, rm, writeFile } from 'fs/promises'
|
||||
import { dirname, join } from 'path'
|
||||
import type { ScriptLang } from '../../../../../frontend/src/lib/gen/types.gen'
|
||||
import type { ReviewChangesOpts } from '../../../../../frontend/src/lib/components/copilot/chat/monaco-adapter'
|
||||
import type { ScriptChatHelpers } from '../../../../../frontend/src/lib/components/copilot/chat/script/core'
|
||||
import { buildScriptLintResult } from './preview'
|
||||
import { registerBenchmarkWorkspace, unregisterBenchmarkWorkspace } from '../../mockBackend'
|
||||
|
||||
/** State of the script being edited during a script evaluation. */
export interface ScriptEvalState {
  /** Path of the script; also its location relative to the workspace root on disk. */
  path: string
  /** Language of the script. */
  lang: ScriptLang | 'bunnative'
  /** Current source code of the script. */
  code: string
  /** Arguments associated with the script. */
  args: Record<string, any>
}
|
||||
|
||||
/**
 * Creates in-memory script chat helpers that are mirrored to a file on disk.
 *
 * The script starts from a deep copy of `initialScript`. When a workspace root
 * is given, the code is written to `<workspaceRoot>/<script path>` right away
 * and after every applied change, and the workspace is registered with the
 * mock backend.
 *
 * @param initialScript - Script state to start from.
 * @param workspaceRoot - Directory to mirror into; nothing is written when omitted.
 * @returns The chat helpers, an accessor returning a deep copy of the current
 *   script, a `cleanup` function that unregisters and removes the workspace,
 *   and the workspace directory (`null` when none was given).
 */
export async function createScriptFileHelpers(
  initialScript: ScriptEvalState,
  workspaceRoot?: string
): Promise<{
  helpers: ScriptChatHelpers
  getScript: () => ScriptEvalState
  cleanup: () => Promise<void>
  workspaceDir: string | null
}> {
  let script = structuredClone(initialScript)
  // Computed once from the initial path.
  const scriptFilePath = workspaceRoot ? join(workspaceRoot, script.path) : null

  /** Writes the current code to the script file; a no-op without a workspace. */
  const persistScript = async (): Promise<void> => {
    if (scriptFilePath) {
      await mkdir(dirname(scriptFilePath), { recursive: true })
      await writeFile(scriptFilePath, script.code, 'utf8')
    }
  }

  await persistScript()

  if (workspaceRoot) {
    registerBenchmarkWorkspace(workspaceRoot)
  }

  const helpers: ScriptChatHelpers = {
    getScriptOptions: () => {
      const { code, lang, path, args } = script
      return { code, lang, path, args: structuredClone(args) }
    },
    applyCode: async (code: string, opts?: ReviewChangesOpts) => {
      // Revert requests are ignored; any other mode replaces the code.
      if (opts?.mode !== 'revert') {
        script = { ...script, code }
        await persistScript()
      }
    },
    getLintErrors: () => buildScriptLintResult(script.code, script.lang)
  }

  return {
    helpers,
    getScript: () => structuredClone(script),
    cleanup: async () => {
      if (!workspaceRoot) {
        return
      }
      unregisterBenchmarkWorkspace(workspaceRoot)
      await rm(workspaceRoot, { recursive: true, force: true })
    },
    workspaceDir: workspaceRoot ?? null
  }
}
|
||||
96
ai_evals/adapters/frontend/core/script/preview.ts
Normal file
96
ai_evals/adapters/frontend/core/script/preview.ts
Normal file
@@ -0,0 +1,96 @@
|
||||
import ts from 'typescript'
|
||||
import type { ScriptLang } from '../../../../../frontend/src/lib/gen/types.gen'
|
||||
import type { ScriptLintResult } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
|
||||
|
||||
/** Languages accepted by the script preview helpers: every `ScriptLang` plus `'bunnative'`. */
export type ScriptPreviewLanguage =
  | ScriptLang
  | 'bunnative'
|
||||
|
||||
/** Languages whose code is checked with the TypeScript compiler. */
const TS_LIKE_LANGUAGES = new Set<ScriptPreviewLanguage>([
  'bun',
  'deno',
  'nativets',
  'bunnative'
])

/** Languages parsed with the TypeScript script kind; currently the same members as `TS_LIKE_LANGUAGES`. */
const JS_LIKE_LANGUAGES = new Set<ScriptPreviewLanguage>([
  'bun',
  'deno',
  'nativets',
  'bunnative'
])
|
||||
|
||||
/**
 * Reports whether the code declares an exported `main` or `preprocessor`
 * function (optionally `async`) using a `function` declaration.
 */
function hasSupportedEntrypoint(code: string): boolean {
  const entrypointPatterns = [
    /export\s+(async\s+)?function\s+main\s*\(/,
    /export\s+(async\s+)?function\s+preprocessor\s*\(/
  ]
  return entrypointPatterns.some((pattern) => pattern.test(code))
}
|
||||
|
||||
/**
 * Returns the TypeScript compiler options used to check a script, or `null`
 * when the language is not one of the TS-like languages.
 */
function compilerOptionsForLanguage(lang: ScriptPreviewLanguage): ts.CompilerOptions | null {
  const isTsLike = TS_LIKE_LANGUAGES.has(lang)
  return isTsLike
    ? {
        target: ts.ScriptTarget.ES2022,
        module: ts.ModuleKind.ESNext,
        moduleResolution: ts.ModuleResolutionKind.Bundler,
        noEmit: true,
        allowJs: true,
        checkJs: false,
        strict: false,
        skipLibCheck: true
      }
    : null
}
|
||||
|
||||
/**
 * Converts a character offset into a 1-based line and column.
 *
 * Negative offsets are treated as 0; offsets past the end of the text point
 * just after the last character.
 */
function getLineAndColumn(sourceText: string, start: number): { line: number; column: number } {
  const head = sourceText.slice(0, Math.max(0, start))

  let line = 1
  for (const character of head) {
    if (character === '\n') {
      line += 1
    }
  }

  // lastIndexOf yields -1 when there is no newline, which makes the column `head.length + 1`.
  const column = head.length - head.lastIndexOf('\n')
  return { line, column }
}
|
||||
|
||||
/**
 * Builds a lint result for a script.
 *
 * For TS-like languages the code is transpiled with the TypeScript compiler:
 * every diagnostic it reports becomes an error, and one more error is added
 * when the code has no exported `main` or `preprocessor` function. Other
 * languages are not checked and always yield an empty result, because both
 * checks are specific to TypeScript/JavaScript syntax.
 *
 * @param code - Source code of the script.
 * @param lang - Language of the script.
 * @returns Error count and errors; warnings are always empty.
 */
export function buildScriptLintResult(
  code: string,
  lang: ScriptPreviewLanguage
): ScriptLintResult {
  const diagnostics: ScriptLintResult['errors'] = []
  const compilerOptions = compilerOptionsForLanguage(lang)

  if (compilerOptions) {
    // The parsed source file is used only to supply a `.ts` file name to the transpiler.
    const sourceFile = ts.createSourceFile(
      'script.ts',
      code,
      ts.ScriptTarget.ES2022,
      true,
      JS_LIKE_LANGUAGES.has(lang) ? ts.ScriptKind.TS : ts.ScriptKind.JS
    )
    const output = ts.transpileModule(code, {
      compilerOptions,
      fileName: sourceFile.fileName,
      reportDiagnostics: true
    })

    for (const diagnostic of output.diagnostics ?? []) {
      const start = diagnostic.start ?? 0
      const length = diagnostic.length ?? 1
      const { line, column } = getLineAndColumn(code, start)
      const message = ts.flattenDiagnosticMessageText(diagnostic.messageText, '\n')
      diagnostics.push({
        startLineNumber: line,
        startColumn: column,
        // The range is reported on the start line only.
        endLineNumber: line,
        endColumn: column + Math.max(1, length),
        message,
        severity: 8
      } as ScriptLintResult['errors'][number])
    }

    // The entrypoint patterns match `export function ...`, so the check applies to TS-like code only.
    if (!hasSupportedEntrypoint(code)) {
      diagnostics.push({
        startLineNumber: 1,
        startColumn: 1,
        endLineNumber: 1,
        endColumn: 1,
        message: 'Script must export a main or preprocessor function.',
        severity: 8
      } as ScriptLintResult['errors'][number])
    }
  }

  return {
    errorCount: diagnostics.length,
    warningCount: 0,
    errors: diagnostics,
    warnings: []
  }
}
|
||||
109
ai_evals/adapters/frontend/core/script/scriptEvalRunner.ts
Normal file
109
ai_evals/adapters/frontend/core/script/scriptEvalRunner.ts
Normal file
@@ -0,0 +1,109 @@
|
||||
import { mkdtemp } from 'fs/promises'
|
||||
import { tmpdir } from 'os'
|
||||
import { join } from 'path'
|
||||
import type { AIProvider, AIProviderModel, ScriptLang } from '$lib/gen/types.gen'
|
||||
import type { ContextElement } from '../../../../../frontend/src/lib/components/copilot/chat/context'
|
||||
import {
|
||||
prepareScriptSystemMessage,
|
||||
prepareScriptTools,
|
||||
prepareScriptUserMessage,
|
||||
type ScriptChatHelpers
|
||||
} from '../../../../../frontend/src/lib/components/copilot/chat/script/core'
|
||||
import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
|
||||
import { createScriptFileHelpers, type ScriptEvalState } from './fileHelpers'
|
||||
import { runEval } from '../shared'
|
||||
import type { ModeRunContext } from '../../../../core/types'
|
||||
import type { TokenUsage } from '../shared/types'
|
||||
|
||||
/**
 * Outcome of one script-mode eval run, as assembled by `runScriptEval`
 * from the generic runner result.
 */
export interface ScriptEvalResult {
	/** Copied from the runner result's `success` flag. */
	success: boolean
	/** Script state read back through the file helpers (`getScript`). */
	script: ScriptEvalState
	/** Error text forwarded from the runner result, when present. */
	error?: string
	/** Filled from the runner's `iterations` counter. */
	assistantMessageCount: number
	/** Filled from the runner's `toolCallsCount`. */
	toolCallCount: number
	/** Filled from the runner's `toolsCalled` list. */
	toolsUsed: string[]
	/** Token accounting forwarded unchanged from the runner. */
	tokenUsage: TokenUsage
}
|
||||
|
||||
/** Inputs accepted by `runScriptEval` in addition to the prompt and API key. */
export interface ScriptEvalOptions {
	/** Starting script; seeds the file helpers and its `lang` selects the system message and tools. */
	initialScript: ScriptEvalState
	/** Model id; `runScriptEval` falls back to 'claude-haiku-4-5-20251001' when omitted. */
	model?: string
	/** Forwarded to the generic runner as its iteration cap. */
	maxIterations?: number
	/** Explicit provider; when omitted it is inferred from the model id. */
	provider?: AIProvider
	/** Directory handed to the file helpers; a fresh temp directory is created when omitted. */
	workspaceRoot?: string
	/** Optional hooks; its assistant streaming callbacks are forwarded to the runner. */
	runContext?: ModeRunContext
}
|
||||
|
||||
/**
 * Pairs a model id with the provider that should serve it.
 *
 * An explicit `provider` always wins. Otherwise the provider is inferred from
 * the model id prefix: `claude*` -> anthropic, `gemini*` -> googleai, and
 * everything else -> openai.
 *
 * @param model Model id as passed by the caller.
 * @param provider Optional explicit provider override.
 * @returns The `{ provider, model }` pair used for prompts, tools and the runner.
 */
function resolveModelProvider(model: string, provider?: AIProvider): AIProviderModel {
	if (provider) {
		return { provider, model }
	}
	if (model.startsWith('claude')) {
		return { provider: 'anthropic', model }
	}
	// Gemini ids must map to googleai here as well: the resolved provider is passed
	// explicitly to runEval, so falling through to openai would skip the Gemini client setup.
	if (model.startsWith('gemini')) {
		return { provider: 'googleai', model }
	}
	return { provider: 'openai', model }
}
|
||||
|
||||
/**
 * Runs one script-mode eval: prepares the production system message, tools and
 * user message for the initial script's language, drives them through `runEval`,
 * and maps the raw runner result onto `ScriptEvalResult`.
 *
 * The file helpers' `cleanup()` is always awaited, whether the run succeeds or throws.
 *
 * @param userPrompt Prompt text for the assistant.
 * @param apiKey Provider API key forwarded to the runner.
 * @param options Initial script plus optional model/provider/workspace settings.
 */
export async function runScriptEval(
	userPrompt: string,
	apiKey: string,
	options: ScriptEvalOptions
): Promise<ScriptEvalResult> {
	// Use the caller's directory when given; otherwise create a dedicated temp directory.
	const workspaceRoot =
		options.workspaceRoot ?? (await mkdtemp(join(tmpdir(), 'wmill-frontend-script-benchmark-')))
	const { helpers, getScript, cleanup } = await createScriptFileHelpers(
		options.initialScript,
		workspaceRoot
	)

	try {
		const model = options.model ?? 'claude-haiku-4-5-20251001'
		const modelProvider = resolveModelProvider(model, options.provider)
		const scriptLang = options.initialScript.lang
		// No extra context elements are selected for eval runs.
		const selectedContext: ContextElement[] = []

		const systemMessage = prepareScriptSystemMessage(modelProvider, scriptLang, {})
		const tools = prepareScriptTools(
			modelProvider,
			scriptLang,
			selectedContext
		) as ProductionTool<ScriptChatHelpers>[]
		const userMessage = prepareScriptUserMessage(userPrompt, selectedContext)

		const rawResult = await runEval({
			userPrompt,
			systemMessage,
			userMessage,
			tools,
			helpers,
			apiKey,
			getOutput: getScript,
			onAssistantMessageStart: options.runContext?.onAssistantMessageStart,
			onAssistantToken: options.runContext?.onAssistantChunk,
			onAssistantMessageEnd: options.runContext?.onAssistantMessageEnd,
			options: {
				maxIterations: options.maxIterations,
				model,
				workspace: workspaceRoot,
				provider: modelProvider.provider
			}
		})

		const { output, success, error, iterations, toolCallsCount, toolsCalled, tokenUsage } =
			rawResult
		return {
			script: output,
			success,
			error,
			assistantMessageCount: iterations,
			toolCallCount: toolCallsCount,
			toolsUsed: toolsCalled,
			tokenUsage
		}
	} finally {
		await cleanup()
	}
}
|
||||
@@ -1,29 +1,19 @@
|
||||
import OpenAI from 'openai'
|
||||
import Anthropic from '@anthropic-ai/sdk'
|
||||
import type {
|
||||
ChatCompletionMessageParam,
|
||||
ChatCompletionSystemMessageParam
|
||||
} from 'openai/resources/chat/completions.mjs'
|
||||
import type { AIProvider, AIProviderModel } from '$lib/gen/types.gen'
|
||||
import type { TokenUsage, ToolCallDetail, EvalRunnerOptions } from './types'
|
||||
import type { Tool } from './baseVariants'
|
||||
import { runChatLoop, type ChatClients } from '../../chatLoop'
|
||||
import type { Tool as ProductionTool, ToolCallbacks } from '../../shared'
|
||||
|
||||
/**
|
||||
* Result from a single eval run (before domain-specific evaluation).
|
||||
*/
|
||||
export interface RawEvalResult<TOutput> {
|
||||
success: boolean
|
||||
output: TOutput
|
||||
error?: string
|
||||
tokenUsage: TokenUsage
|
||||
toolCallsCount: number
|
||||
toolsCalled: string[]
|
||||
toolCallDetails: ToolCallDetail[]
|
||||
iterations: number
|
||||
messages: ChatCompletionMessageParam[]
|
||||
}
|
||||
import type { AIProviderModel } from '$lib/gen/types.gen'
|
||||
import type { TokenUsage, ToolCallDetail, EvalRunnerOptions, RawEvalResult } from './types'
|
||||
import { runChatLoop, type ChatClients } from '../../../../../frontend/src/lib/components/copilot/chat/chatLoop'
|
||||
import type {
|
||||
Tool as ProductionTool,
|
||||
ToolCallbacks
|
||||
} from '../../../../../frontend/src/lib/components/copilot/chat/shared'
|
||||
import {
|
||||
createEvalClients,
|
||||
type FrontendEvalProvider,
|
||||
resolveEvalModelProvider
|
||||
} from './providerConfig'
|
||||
|
||||
/**
|
||||
* Parameters for running a base evaluation.
|
||||
@@ -38,7 +28,7 @@ export interface RunEvalParams<THelpers, TOutput> {
|
||||
/** Tool definitions for the LLM API (unused — derived from tools) */
|
||||
toolDefs?: unknown
|
||||
/** Full tool implementations for execution */
|
||||
tools: Tool<THelpers>[]
|
||||
tools: ProductionTool<THelpers>[]
|
||||
/** Domain-specific helpers for tool execution */
|
||||
helpers: THelpers
|
||||
/** API key for the provider */
|
||||
@@ -47,35 +37,9 @@ export interface RunEvalParams<THelpers, TOutput> {
|
||||
getOutput: () => TOutput
|
||||
/** Optional configuration */
|
||||
options?: EvalRunnerOptions
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates SDK clients for the given provider.
|
||||
*/
|
||||
function createEvalClients(provider: AIProvider, apiKey: string): ChatClients {
|
||||
if (provider === 'anthropic') {
|
||||
return {
|
||||
openai: new OpenAI({ apiKey: 'unused' }),
|
||||
anthropic: new Anthropic({ apiKey })
|
||||
}
|
||||
}
|
||||
return {
|
||||
openai: new OpenAI({ apiKey }),
|
||||
anthropic: new Anthropic({ apiKey: 'unused' })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolves model string to AIProviderModel.
|
||||
*/
|
||||
function resolveModelProvider(
|
||||
model: string,
|
||||
provider?: AIProvider
|
||||
): AIProviderModel {
|
||||
if (provider) return { provider, model }
|
||||
if (model.startsWith('claude')) return { provider: 'anthropic', model }
|
||||
if (model.startsWith('gpt') || model.startsWith('o')) return { provider: 'openai', model }
|
||||
return { provider: 'openai', model }
|
||||
onAssistantMessageStart?: () => void
|
||||
onAssistantToken?: (token: string) => void
|
||||
onAssistantMessageEnd?: () => void
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -92,16 +56,23 @@ export async function runEval<THelpers, TOutput>(
|
||||
helpers,
|
||||
apiKey,
|
||||
getOutput,
|
||||
options
|
||||
options,
|
||||
onAssistantMessageStart,
|
||||
onAssistantToken,
|
||||
onAssistantMessageEnd
|
||||
} = params
|
||||
let shouldEmitMessageStart = true
|
||||
|
||||
const model = options?.model ?? 'gpt-4o'
|
||||
const maxIterations = options?.maxIterations ?? 20
|
||||
const workspace = options?.workspace ?? 'test-workspace'
|
||||
const provider = options?.provider
|
||||
|
||||
const modelProvider = resolveModelProvider(model, provider)
|
||||
const clients = createEvalClients(modelProvider.provider, apiKey)
|
||||
const modelProvider = resolveEvalModelProvider(
|
||||
model,
|
||||
provider as FrontendEvalProvider | undefined
|
||||
) as AIProviderModel
|
||||
const clients = createEvalClients(modelProvider.provider, apiKey) as ChatClients
|
||||
|
||||
const messages: ChatCompletionMessageParam[] = [userMessage]
|
||||
let toolCallsCount = 0
|
||||
@@ -128,7 +99,7 @@ export async function runEval<THelpers, TOutput>(
|
||||
}
|
||||
return tool.fn(p)
|
||||
}
|
||||
})) as ProductionTool<THelpers>[]
|
||||
}))
|
||||
|
||||
// No-op callbacks for eval
|
||||
const callbacks: ToolCallbacks & {
|
||||
@@ -137,8 +108,19 @@ export async function runEval<THelpers, TOutput>(
|
||||
} = {
|
||||
setToolStatus: () => {},
|
||||
removeToolStatus: () => {},
|
||||
onNewToken: () => {},
|
||||
onMessageEnd: () => {}
|
||||
onNewToken: (token: string) => {
|
||||
if (shouldEmitMessageStart) {
|
||||
onAssistantMessageStart?.()
|
||||
shouldEmitMessageStart = false
|
||||
}
|
||||
onAssistantToken?.(token)
|
||||
},
|
||||
onMessageEnd: () => {
|
||||
if (!shouldEmitMessageStart) {
|
||||
onAssistantMessageEnd?.()
|
||||
}
|
||||
shouldEmitMessageStart = true
|
||||
}
|
||||
}
|
||||
|
||||
const abortController = new AbortController()
|
||||
@@ -161,7 +143,7 @@ export async function runEval<THelpers, TOutput>(
|
||||
return {
|
||||
success: true,
|
||||
output: getOutput(),
|
||||
tokenUsage: { prompt: 0, completion: 0, total: 0 },
|
||||
tokenUsage: result.tokenUsage,
|
||||
toolCallsCount,
|
||||
toolsCalled,
|
||||
toolCallDetails,
|
||||
3
ai_evals/adapters/frontend/core/shared/index.ts
Normal file
3
ai_evals/adapters/frontend/core/shared/index.ts
Normal file
@@ -0,0 +1,3 @@
|
||||
export type { TokenUsage, ToolCallDetail, EvalRunnerOptions, RawEvalResult } from './types'
|
||||
export type { RunEvalParams } from './baseEvalRunner'
|
||||
export { runEval } from './baseEvalRunner'
|
||||
@@ -0,0 +1,41 @@
|
||||
import { describe, expect, it } from "bun:test";
|
||||
import {
|
||||
buildOpenAICompatibleClientOptions,
|
||||
resolveEvalModelProvider,
|
||||
} from "./providerConfig";
|
||||
|
||||
describe("buildOpenAICompatibleClientOptions", () => {
|
||||
it("adds Gemini's OpenAI-compatible base URL and client header", () => {
|
||||
const options = buildOpenAICompatibleClientOptions("googleai", "gemini-test-key");
|
||||
|
||||
expect(options).toMatchObject({
|
||||
apiKey: "gemini-test-key",
|
||||
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
|
||||
defaultHeaders: {
|
||||
"x-goog-api-client": "windmill-ai-evals/1.0",
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("keeps the default OpenAI-compatible config for OpenAI", () => {
|
||||
expect(buildOpenAICompatibleClientOptions("openai", "openai-test-key")).toEqual({
|
||||
apiKey: "openai-test-key",
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveEvalModelProvider", () => {
|
||||
it("infers googleai from Gemini model ids", () => {
|
||||
expect(resolveEvalModelProvider("gemini-2.5-flash")).toEqual({
|
||||
provider: "googleai",
|
||||
model: "gemini-2.5-flash",
|
||||
});
|
||||
});
|
||||
|
||||
it("preserves an explicit provider", () => {
|
||||
expect(resolveEvalModelProvider("gemini-2.5-pro", "googleai")).toEqual({
|
||||
provider: "googleai",
|
||||
model: "gemini-2.5-pro",
|
||||
});
|
||||
});
|
||||
});
|
||||
71
ai_evals/adapters/frontend/core/shared/providerConfig.ts
Normal file
71
ai_evals/adapters/frontend/core/shared/providerConfig.ts
Normal file
@@ -0,0 +1,71 @@
|
||||
import Anthropic from "@anthropic-ai/sdk";
|
||||
import OpenAI from "openai";
|
||||
import type { FrontendEvalModelConfig } from "../../../../core/models";
|
||||
|
||||
export type FrontendEvalProvider = FrontendEvalModelConfig["provider"];
|
||||
|
||||
/** The pair of SDK clients produced by `createEvalClients`. */
export interface EvalClients {
  /** OpenAI SDK client; also used for OpenAI-compatible providers. */
  openai: OpenAI;
  /** Anthropic SDK client. */
  anthropic: Anthropic;
}
|
||||
|
||||
/** Result of `resolveEvalModelProvider`: a model id paired with its provider. */
export interface ResolvedEvalModelProvider {
  /** Provider that serves the model, explicit or inferred. */
  provider: FrontendEvalProvider;
  /** Model id, returned exactly as it was passed in. */
  model: string;
}
|
||||
|
||||
const GEMINI_OPENAI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/";
|
||||
const GEMINI_GOOG_API_CLIENT = "windmill-ai-evals/1.0";
|
||||
|
||||
/**
 * Builds the constructor options for the OpenAI SDK client.
 *
 * For `googleai` the options add Gemini's OpenAI-compatible base URL and the
 * `x-goog-api-client` header; every other provider gets only the API key.
 *
 * @param provider Any eval provider except anthropic.
 * @param apiKey API key placed in the returned options.
 */
export function buildOpenAICompatibleClientOptions(
  provider: Exclude<FrontendEvalProvider, "anthropic">,
  apiKey: string
): ConstructorParameters<typeof OpenAI>[0] {
  const targetsGemini = provider === "googleai";
  if (!targetsGemini) {
    return { apiKey };
  }

  const defaultHeaders = { "x-goog-api-client": GEMINI_GOOG_API_CLIENT };
  return { apiKey, baseURL: GEMINI_OPENAI_BASE_URL, defaultHeaders };
}
|
||||
|
||||
/**
 * Creates both SDK clients for an eval run.
 *
 * `EvalClients` requires both an OpenAI and an Anthropic client, so the client
 * for the provider that is not selected is built with the placeholder key "unused".
 *
 * @param provider Provider that receives the real API key.
 * @param apiKey API key for the selected provider.
 */
export function createEvalClients(
  provider: FrontendEvalProvider,
  apiKey: string
): EvalClients {
  const placeholderKey = "unused";

  if (provider === "anthropic") {
    const openai = new OpenAI({ apiKey: placeholderKey });
    const anthropic = new Anthropic({ apiKey });
    return { openai, anthropic };
  }

  // Every non-anthropic provider goes through the OpenAI SDK client.
  const openai = new OpenAI(buildOpenAICompatibleClientOptions(provider, apiKey));
  const anthropic = new Anthropic({ apiKey: placeholderKey });
  return { openai, anthropic };
}
|
||||
|
||||
/**
 * Resolves the provider for a model id.
 *
 * An explicit `provider` is returned untouched. Otherwise the provider is
 * inferred from the model id prefix: `claude*` -> anthropic, `gemini*` ->
 * googleai, and every other id (including `gpt*` and `o*`) -> openai.
 *
 * @param model Model id.
 * @param provider Optional explicit provider override.
 * @returns The `{ provider, model }` pair.
 */
export function resolveEvalModelProvider(
  model: string,
  provider?: FrontendEvalProvider
): ResolvedEvalModelProvider {
  if (provider) {
    return { provider, model };
  }
  if (model.startsWith("claude")) {
    return { provider: "anthropic", model };
  }
  if (model.startsWith("gemini")) {
    return { provider: "googleai", model };
  }
  // openai is the fallback for every remaining id, so no separate gpt/o prefix check is needed.
  return { provider: "openai", model };
}
|
||||
32
ai_evals/adapters/frontend/core/shared/types.ts
Normal file
32
ai_evals/adapters/frontend/core/shared/types.ts
Normal file
@@ -0,0 +1,32 @@
|
||||
import type { ChatCompletionMessageParam } from 'openai/resources/chat/completions.mjs'
|
||||
import type { AIProvider } from '$lib/gen/types.gen'
|
||||
|
||||
/** Token counts reported for an eval run. */
export interface TokenUsage {
	/** Prompt-side token count. */
	prompt: number
	/** Completion-side token count. */
	completion: number
	/** Total token count (presumably prompt + completion; confirm against the chat loop). */
	total: number
}
|
||||
|
||||
/** Record of one tool invocation: which tool, and with what arguments. */
export interface ToolCallDetail {
	/** Name of the tool. */
	name: string
	/** Arguments of the call, keyed by parameter name. */
	arguments: Record<string, unknown>
}
|
||||
|
||||
/** Optional knobs for `runEval`; each one falls back to a default inside the runner. */
export interface EvalRunnerOptions {
	/** Iteration cap; `runEval` uses 20 when omitted. */
	maxIterations?: number
	/** Model id; `runEval` uses 'gpt-4o' when omitted. */
	model?: string
	/** Workspace identifier; `runEval` uses 'test-workspace' when omitted. */
	workspace?: string
	/** Explicit provider; when omitted the runner infers it from the model id. */
	provider?: AIProvider
}
|
||||
|
||||
/**
 * Result of a single eval run, before any domain-specific evaluation.
 *
 * @typeParam TOutput Shape of the domain output returned by the caller's `getOutput`.
 */
export interface RawEvalResult<TOutput> {
	/** Whether the run completed successfully. */
	success: boolean
	/** Domain output captured via `getOutput`. */
	output: TOutput
	/** Error text, when the run reported one. */
	error?: string
	/** Token accounting for the run. */
	tokenUsage: TokenUsage
	/** Number of tool calls made. */
	toolCallsCount: number
	/** Names of the tools that were called. */
	toolsCalled: string[]
	/** Per-call name and arguments. */
	toolCallDetails: ToolCallDetail[]
	/** Iteration counter reported by the runner. */
	iterations: number
	/** Chat messages in OpenAI chat-completion format. */
	messages: ChatCompletionMessageParam[]
}
|
||||
270
ai_evals/adapters/frontend/mockBackend.ts
Normal file
270
ai_evals/adapters/frontend/mockBackend.ts
Normal file
@@ -0,0 +1,270 @@
|
||||
import { randomUUID } from 'node:crypto'
|
||||
import type { CompletedJob, Flow, Script } from '../../../frontend/src/lib/gen'
|
||||
import type { ScriptLang } from '../../../frontend/src/lib/gen/types.gen'
|
||||
import { buildScriptLintResult } from './core/script/preview'
|
||||
|
||||
const BENCHMARK_TIMESTAMP = '1970-01-01T00:00:00.000Z'
|
||||
|
||||
/** Fixture describing one script exposed by a mock benchmark workspace. */
export interface BenchmarkWorkspaceScript {
	/** Script path; also the input to the synthetic hash (`benchmark:<path>`). */
	path: string
	/** Short summary copied onto the built `Script`. */
	summary: string
	/** Optional description; the built `Script` uses '' when omitted. */
	description?: string
	/** Script language, copied onto the built `Script`. */
	language: Script['language']
	/** Optional argument schema; the built `Script` uses {} when omitted. */
	schema?: Record<string, unknown>
	/** Source code of the script. */
	content: string
}
|
||||
|
||||
/** Fixture describing one flow exposed by a mock benchmark workspace. */
export interface BenchmarkWorkspaceFlow {
	/** Flow path used for lookups. */
	path: string
	/** Short summary copied onto the built `Flow`. */
	summary: string
	/** Optional description; the built `Flow` uses '' when omitted. */
	description?: string
	/** Optional input schema; the built `Flow` uses {} when omitted. */
	schema?: Record<string, unknown>
	/** Flow definition, copied onto the built `Flow` as-is. */
	value: Flow['value']
}
|
||||
|
||||
/** Scripts and flows registered for one mock benchmark workspace. */
export interface BenchmarkWorkspaceRunnables {
	/** Script fixtures; treated as an empty list when omitted. */
	scripts?: BenchmarkWorkspaceScript[]
	/** Flow fixtures; treated as an empty list when omitted. */
	flows?: BenchmarkWorkspaceFlow[]
}
|
||||
|
||||
/** A `CompletedJob` carrying the literal `type: 'CompletedJob'` discriminator. */
type BenchmarkCompletedJob = CompletedJob & { type: 'CompletedJob' }

// Module-level, in-memory state of the mock backend.
// All three stores are emptied by resetBenchmarkMockBackend().

/** Names of the registered benchmark workspaces. */
const benchmarkWorkspaces = new Set<string>()

/** Runnables (scripts/flows) registered per workspace name. */
const benchmarkWorkspaceRunnables = new Map<string, BenchmarkWorkspaceRunnables>()

/** Completed mock jobs keyed by job id, each tagged with its owning workspace. */
const benchmarkJobs = new Map<string, { workspace: string; job: BenchmarkCompletedJob }>()
|
||||
|
||||
/** Empties every in-memory store of the mock backend: workspaces, runnables and jobs. */
export function resetBenchmarkMockBackend(): void {
	const stores = [benchmarkWorkspaces, benchmarkWorkspaceRunnables, benchmarkJobs]
	for (const store of stores) {
		store.clear()
	}
}
|
||||
|
||||
/**
 * Marks `workspace` as a known benchmark workspace.
 * No runnables are associated by this call.
 */
export function registerBenchmarkWorkspace(workspace: string): void {
	// Set semantics make repeated registration a no-op.
	benchmarkWorkspaces.add(workspace)
}
|
||||
|
||||
/**
 * Registers `workspace` and associates `runnables` with it, replacing any
 * runnables previously stored for the same workspace.
 */
export function registerBenchmarkWorkspaceRunnables(
	workspace: string,
	runnables: BenchmarkWorkspaceRunnables
): void {
	registerBenchmarkWorkspace(workspace)
	benchmarkWorkspaceRunnables.set(workspace, runnables)
}
|
||||
|
||||
/**
 * Forgets `workspace`: removes its registration, its runnables, and every
 * stored job that belongs to it.
 */
export function unregisterBenchmarkWorkspace(workspace: string): void {
	benchmarkWorkspaces.delete(workspace)
	benchmarkWorkspaceRunnables.delete(workspace)

	// Deleting the current entry while iterating a Map is well-defined in JavaScript.
	benchmarkJobs.forEach((entry, jobId) => {
		if (entry.workspace === workspace) {
			benchmarkJobs.delete(jobId)
		}
	})
}
|
||||
|
||||
/**
 * Alias of `unregisterBenchmarkWorkspace`: it removes the whole workspace
 * (registration, runnables and jobs), not just the runnables.
 */
export function unregisterBenchmarkWorkspaceRunnables(workspace: string): void {
	return unregisterBenchmarkWorkspace(workspace)
}
|
||||
|
||||
/** Reports whether `workspace` is currently registered as a benchmark workspace. */
export function hasBenchmarkWorkspace(workspace: string): boolean {
	const isRegistered = benchmarkWorkspaces.has(workspace)
	return isRegistered
}
|
||||
|
||||
/**
 * Lists the scripts of a benchmark workspace as `Script` objects.
 *
 * @returns `null` when no runnables are registered for `workspace`; otherwise
 * one built `Script` per fixture (an empty array when the workspace has none).
 */
export function listBenchmarkScripts(workspace: string): Script[] | null {
	const registered = benchmarkWorkspaceRunnables.get(workspace)
	return registered ? (registered.scripts ?? []).map(buildBenchmarkScript) : null
}
|
||||
|
||||
/**
 * Lists the flows of a benchmark workspace as `Flow` objects.
 *
 * @returns `null` when no runnables are registered for `workspace`; otherwise
 * one built `Flow` per fixture (an empty array when the workspace has none).
 */
export function listBenchmarkFlows(workspace: string): Flow[] | null {
	const registered = benchmarkWorkspaceRunnables.get(workspace)
	return registered ? (registered.flows ?? []).map(buildBenchmarkFlow) : null
}
|
||||
|
||||
/**
 * Looks up a script fixture by its exact path.
 *
 * @returns The built `Script`, or `null` when the workspace or the path is unknown.
 */
export function getBenchmarkScriptByPath(workspace: string, path: string): Script | null {
	const candidates = benchmarkWorkspaceRunnables.get(workspace)?.scripts ?? []
	for (const candidate of candidates) {
		if (candidate.path === path) {
			return buildBenchmarkScript(candidate)
		}
	}
	return null
}
|
||||
|
||||
/**
 * Looks up a script fixture by its synthetic hash, i.e. the value
 * `buildBenchmarkScriptHash` derives from the fixture's path.
 *
 * @returns The built `Script`, or `null` when the workspace or the hash is unknown.
 */
export function getBenchmarkScriptByHash(workspace: string, hash: string): Script | null {
	const candidates = benchmarkWorkspaceRunnables.get(workspace)?.scripts ?? []
	for (const candidate of candidates) {
		if (buildBenchmarkScriptHash(candidate.path) === hash) {
			return buildBenchmarkScript(candidate)
		}
	}
	return null
}
|
||||
|
||||
/**
 * Looks up a flow fixture by its exact path.
 *
 * @returns The built `Flow`, or `null` when the workspace or the path is unknown.
 */
export function getBenchmarkFlowByPath(workspace: string, path: string): Flow | null {
	const candidates = benchmarkWorkspaceRunnables.get(workspace)?.flows ?? []
	for (const candidate of candidates) {
		if (candidate.path === path) {
			return buildBenchmarkFlow(candidate)
		}
	}
	return null
}
|
||||
|
||||
/**
 * Records a synthetic, already-completed job for a benchmark workspace.
 *
 * The job gets a fresh `benchmark-job-<uuid>` id, a zero duration, and the
 * current time for all three of its timestamps. `success` defaults to `true`.
 *
 * @returns The id under which the job was stored.
 */
export function createBenchmarkCompletedJob(input: {
	workspace: string
	jobKind: CompletedJob['job_kind']
	success?: boolean
	result?: unknown
	logs?: string
	scriptPath?: string
	scriptHash?: string
	args?: Record<string, unknown>
}): string {
	const jobId = `benchmark-job-${randomUUID()}`
	// One timestamp is shared by created/started/completed: the mock job takes no time.
	const timestamp = new Date().toISOString()
	const { workspace, jobKind, scriptPath, scriptHash, args, result, logs } = input

	const job: BenchmarkCompletedJob = {
		type: 'CompletedJob',
		id: jobId,
		workspace_id: workspace,
		created_by: 'ai-evals',
		created_at: timestamp,
		started_at: timestamp,
		completed_at: timestamp,
		duration_ms: 0,
		success: input.success ?? true,
		script_path: scriptPath,
		script_hash: scriptHash,
		args,
		result,
		logs,
		canceled: false,
		job_kind: jobKind,
		permissioned_as: 'u/ai-evals',
		is_flow_step: false,
		is_skipped: false,
		email: 'ai-evals@local',
		visible_to_owner: true,
		tag: 'benchmark'
	}

	benchmarkJobs.set(jobId, { workspace, job })
	return jobId
}
|
||||
|
||||
/**
 * Fetches a stored mock job.
 *
 * @returns A deep copy (`structuredClone`) of the job, so callers cannot mutate
 * the stored record; `null` when the id is unknown or belongs to another workspace.
 */
export function getBenchmarkCompletedJob(
	workspace: string,
	jobId: string
): BenchmarkCompletedJob | null {
	const stored = benchmarkJobs.get(jobId)
	return stored && stored.workspace === workspace ? structuredClone(stored.job) : null
}
|
||||
|
||||
/**
 * Simulates a script preview run by linting the submitted code instead of executing it.
 *
 * Missing `content` is treated as '' and missing `language` as 'bun'. The
 * recorded job succeeds exactly when the lint result has zero errors; on
 * failure the job result lists each error's line and message.
 *
 * @returns The id of the recorded 'preview' job.
 */
export function runBenchmarkScriptPreview(input: {
	workspace: string
	requestBody: {
		content?: string
		language?: ScriptLang | 'bunnative'
		args?: Record<string, unknown>
		path?: string
	}
}): string {
	const { workspace, requestBody } = input
	const lint = buildScriptLintResult(requestBody.content ?? '', requestBody.language ?? 'bun')
	const passed = lint.errorCount === 0

	// Fields common to both the success and the failure payload.
	const common = { path: requestBody.path, args: requestBody.args ?? {} }
	const result = passed
		? { ...common, validated: true }
		: {
				...common,
				errorCount: lint.errorCount,
				errors: lint.errors.map((entry) => ({
					line: entry.startLineNumber,
					message: entry.message
				}))
			}

	return createBenchmarkCompletedJob({
		workspace,
		jobKind: 'preview',
		success: passed,
		scriptPath: requestBody.path,
		args: requestBody.args,
		result
	})
}
|
||||
|
||||
/**
 * Simulates running a flow by path. Nothing is executed: the recorded
 * 'flowpreview' job succeeds exactly when the flow exists in the benchmark workspace.
 *
 * @returns The id of the recorded job.
 */
export function runBenchmarkFlowByPath(input: {
	workspace: string
	path: string
	args?: Record<string, unknown>
}): string {
	const { workspace, path, args } = input
	const flowExists = getBenchmarkFlowByPath(workspace, path) !== null

	if (!flowExists) {
		return createBenchmarkCompletedJob({
			workspace,
			jobKind: 'flowpreview',
			success: false,
			args,
			result: {
				error: `Flow "${path}" not found in benchmark workspace`
			},
			logs: `Flow "${path}" not found in benchmark workspace.`
		})
	}

	return createBenchmarkCompletedJob({
		workspace,
		jobKind: 'flowpreview',
		success: true,
		args,
		result: {
			path,
			args: args ?? {},
			mocked: true
		},
		logs: 'Mock benchmark flow run completed successfully.'
	})
}
|
||||
|
||||
/** Derives the synthetic hash of a benchmark script: the path prefixed with `benchmark:`. */
function buildBenchmarkScriptHash(path: string): string {
	const HASH_PREFIX = 'benchmark:'
	return HASH_PREFIX + path
}
|
||||
|
||||
/**
 * Expands a script fixture into a full `Script` record.
 *
 * Fields the fixture does not carry are filled with fixed values: workspace and
 * author 'benchmark', the constant benchmark timestamp, a path-derived hash, and
 * neutral flags. `description` defaults to '' and `schema` to {}.
 */
function buildBenchmarkScript(script: BenchmarkWorkspaceScript): Script {
	const { path, summary, content, language } = script
	const description = script.description ?? ''
	const schema = script.schema ?? {}

	return {
		workspace_id: 'benchmark',
		hash: buildBenchmarkScriptHash(path),
		path,
		parent_hashes: [],
		summary,
		description,
		content,
		created_by: 'benchmark',
		created_at: BENCHMARK_TIMESTAMP,
		archived: false,
		schema,
		deleted: false,
		is_template: false,
		extra_perms: {},
		language,
		kind: 'script',
		starred: false,
		has_preprocessor: false,
		modules: null
	}
}
|
||||
|
||||
/**
 * Expands a flow fixture into a `Flow` record.
 *
 * `description` defaults to '' and `schema` to {}; editor and timestamp are
 * fixed to 'benchmark' and the constant benchmark timestamp. The object is
 * asserted to `Flow` rather than type-checked against it.
 */
function buildBenchmarkFlow(flow: BenchmarkWorkspaceFlow): Flow {
	const { path, summary, value } = flow
	const description = flow.description ?? ''
	const schema = flow.schema ?? {}

	return {
		path,
		summary,
		description,
		value,
		schema,
		edited_by: 'benchmark',
		edited_at: BENCHMARK_TIMESTAMP,
		archived: false,
		extra_perms: {}
	} as Flow
}
|
||||
133
ai_evals/adapters/frontend/progress.ts
Normal file
133
ai_evals/adapters/frontend/progress.ts
Normal file
@@ -0,0 +1,133 @@
|
||||
export type FrontendBenchmarkProgressSurface = 'flow' | 'app' | 'script'
|
||||
|
||||
/** Fields that identify one attempt of one case; present on every per-attempt event. */
type FrontendBenchmarkAttemptFields = {
	surface: FrontendBenchmarkProgressSurface
	caseId: string
	caseNumber: number
	totalCases: number
	attempt: number
	runs: number
}

/**
 * Progress events exchanged as JSON lines between the benchmark test process
 * and its parent. `type` is the discriminator.
 */
export type FrontendBenchmarkProgressEvent =
	// Emitted once per run with the overall plan.
	| {
			type: 'run-start'
			surface: FrontendBenchmarkProgressSurface
			totalCases: number
			runs: number
			concurrency: number
	  }
	| ({ type: 'attempt-start' } & FrontendBenchmarkAttemptFields)
	| ({
			type: 'attempt-finish'
			passed: boolean
			durationMs: number
			judgeScore: number | null
			error: string | null
	  } & FrontendBenchmarkAttemptFields)
	// Assistant streaming: start, zero or more chunks, end.
	| ({ type: 'assistant-message-start' } & FrontendBenchmarkAttemptFields)
	| ({ type: 'assistant-chunk'; chunk: string } & FrontendBenchmarkAttemptFields)
	| ({ type: 'assistant-message-end' } & FrontendBenchmarkAttemptFields)
|
||||
|
||||
export const FRONTEND_BENCHMARK_PROGRESS_PREFIX = 'WMILL_FRONTEND_AI_EVAL_PROGRESS '
|
||||
|
||||
/**
 * Writes one progress event to stderr as a single line: the progress prefix
 * followed by the event serialized as JSON.
 */
export function emitFrontendBenchmarkProgress(event: FrontendBenchmarkProgressEvent): void {
	const payload = JSON.stringify(event)
	process.stderr.write(FRONTEND_BENCHMARK_PROGRESS_PREFIX + payload + '\n')
}
|
||||
|
||||
/** Every `type` value a progress event may carry; used to reject unknown payloads. */
const KNOWN_FRONTEND_BENCHMARK_PROGRESS_TYPES: ReadonlySet<string> = new Set([
	'run-start',
	'attempt-start',
	'attempt-finish',
	'assistant-message-start',
	'assistant-chunk',
	'assistant-message-end'
])

/**
 * Decodes one line of output into a progress event.
 *
 * @param line A single line without its trailing newline.
 * @returns The decoded event, or `null` when the line lacks the progress
 * prefix, is not valid JSON, is not an object, or carries a `type` that is not
 * one of the known event types.
 */
export function parseFrontendBenchmarkProgressLine(
	line: string
): FrontendBenchmarkProgressEvent | null {
	if (!line.startsWith(FRONTEND_BENCHMARK_PROGRESS_PREFIX)) {
		return null
	}

	let parsed: unknown
	try {
		parsed = JSON.parse(line.slice(FRONTEND_BENCHMARK_PROGRESS_PREFIX.length))
	} catch {
		// A prefixed line with a malformed payload is treated as "not a progress line".
		return null
	}

	if (typeof parsed !== 'object' || parsed === null) {
		return null
	}

	// Only the discriminator is validated; the remaining fields are trusted as emitted.
	const type = (parsed as { type?: unknown }).type
	if (typeof type !== 'string' || !KNOWN_FRONTEND_BENCHMARK_PROGRESS_TYPES.has(type)) {
		return null
	}

	return parsed as FrontendBenchmarkProgressEvent
}
|
||||
|
||||
/**
 * Renders a progress event as a single human-readable status line.
 *
 * Assistant streaming events produce '' here. An event whose `type` is not
 * recognised at runtime also produces '' instead of `undefined`.
 *
 * @param event The event to render.
 * @returns The status line without a trailing newline.
 */
export function formatFrontendBenchmarkProgressEvent(
	event: FrontendBenchmarkProgressEvent
): string {
	switch (event.type) {
		case 'run-start': {
			const runsLabel = `run${event.runs === 1 ? '' : 's'}`
			return `Running ${event.surface}: ${event.totalCases} cases x ${event.runs} ${runsLabel}, concurrency ${event.concurrency}`
		}
		case 'attempt-start':
			return `${formatCasePrefix(event.caseNumber, event.totalCases)} ${event.caseId} attempt ${event.attempt}/${event.runs}...`
		case 'attempt-finish': {
			const verdict = event.passed ? 'pass' : 'fail'
			const parts = [
				`${formatCasePrefix(event.caseNumber, event.totalCases)} ${event.caseId} attempt ${event.attempt}/${event.runs} ${verdict}`,
				formatDuration(event.durationMs)
			]
			if (event.judgeScore !== null) {
				parts.push(`judge ${formatNumber(event.judgeScore)}`)
			}
			if (event.error) {
				// Keep the status on one line even when the error text is long or multi-line.
				parts.push(truncateSingleLine(event.error, 120))
			}
			return parts.join(' | ')
		}
		case 'assistant-message-start':
		case 'assistant-chunk':
		case 'assistant-message-end':
			return ''
		default:
			// Events are decoded from JSON without full validation, so an unknown `type`
			// can reach this point; return '' so callers never print "undefined".
			return ''
	}
}
|
||||
|
||||
/** Renders the `[case/total]` marker that prefixes per-case status lines. */
function formatCasePrefix(caseNumber: number, totalCases: number): string {
	return '[' + caseNumber + '/' + totalCases + ']'
}
|
||||
|
||||
/** Renders a millisecond duration in seconds with an `s` suffix. */
function formatDuration(durationMs: number): string {
	const seconds = durationMs / 1000
	return formatNumber(seconds) + 's'
}
|
||||
|
||||
/** Renders integers without decimals and every other number with one decimal place. */
function formatNumber(value: number): string {
	if (Number.isInteger(value)) {
		return String(value)
	}
	return value.toFixed(1)
}
|
||||
|
||||
/**
 * Collapses `value` onto one line and caps its length.
 *
 * Each run of whitespace (including newlines) becomes a single space and the
 * ends are trimmed. Text longer than `maxLength` is cut and ends with `...`,
 * so the result never exceeds `maxLength` characters. When `maxLength` is too
 * small to hold the ellipsis, the text is cut without it.
 *
 * @param value Text to normalise.
 * @param maxLength Maximum length of the returned string.
 */
function truncateSingleLine(value: string, maxLength: number): string {
	const ellipsis = '...'
	const normalized = value.replace(/\s+/g, ' ').trim()
	if (normalized.length <= maxLength) {
		return normalized
	}
	if (maxLength < ellipsis.length) {
		// No room for the ellipsis: honour the limit with a plain cut.
		return normalized.slice(0, Math.max(0, maxLength))
	}
	return `${normalized.slice(0, maxLength - ellipsis.length)}${ellipsis}`
}
|
||||
218
ai_evals/adapters/frontend/runtime.ts
Normal file
218
ai_evals/adapters/frontend/runtime.ts
Normal file
@@ -0,0 +1,218 @@
|
||||
import { spawn } from 'node:child_process'
|
||||
import { mkdtemp, readFile, rm } from 'node:fs/promises'
|
||||
import { tmpdir } from 'node:os'
|
||||
import path from 'node:path'
|
||||
import { fileURLToPath } from 'node:url'
|
||||
import {
|
||||
formatFrontendBenchmarkProgressEvent,
|
||||
parseFrontendBenchmarkProgressLine
|
||||
} from './progress'
|
||||
import type { BenchmarkRunResult } from '../../core/types'
|
||||
|
||||
const REPO_ROOT = fileURLToPath(new URL('../../../', import.meta.url))
|
||||
const FRONTEND_DIR = path.join(REPO_ROOT, 'frontend')
|
||||
const FRONTEND_BENCHMARK_TEST = '../ai_evals/adapters/frontend/vitestAdapter.test.ts'
|
||||
const FRONTEND_BENCHMARK_CONFIG = '../ai_evals/adapters/frontend/vitest.config.ts'
|
||||
|
||||
export type FrontendMode = 'flow' | 'app' | 'script'
|
||||
|
||||
/**
 * Runs the frontend benchmark in a child vitest process and returns its result.
 *
 * The run is configured through `WMILL_FRONTEND_AI_EVAL_*` environment
 * variables. The child writes its result as JSON to a file in a temporary
 * directory, which is read back here and removed afterwards in every case.
 *
 * @throws Error prefixed with "Frontend benchmark adapter failed:" when the
 * child fails or the result cannot be read or parsed.
 */
export async function runFrontendBenchmarkAdapter(input: {
	mode: FrontendMode
	caseIds: string[]
	runs: number
	model?: string
	verbose?: boolean
	backendValidation?: string
}): Promise<BenchmarkRunResult> {
	const tempDir = await mkdtemp(path.join(tmpdir(), 'wmill-frontend-benchmark-'))
	const outputPath = path.join(tempDir, 'result.json')

	try {
		const vitestBinary = path.join(FRONTEND_DIR, 'node_modules', '.bin', 'vitest')
		const vitestArgs = [
			'run',
			FRONTEND_BENCHMARK_TEST,
			'--project',
			'server',
			'--config',
			FRONTEND_BENCHMARK_CONFIG
		]
		// The child inherits the parent environment plus the run configuration.
		const env: NodeJS.ProcessEnv = {
			...process.env,
			BROWSERSLIST_IGNORE_OLD_DATA: '1',
			WMILL_FRONTEND_AI_EVAL_OUTPUT_PATH: outputPath,
			WMILL_FRONTEND_AI_EVAL_MODE: input.mode,
			WMILL_FRONTEND_AI_EVAL_CASE_IDS: JSON.stringify(input.caseIds),
			WMILL_FRONTEND_AI_EVAL_RUNS: String(input.runs),
			WMILL_FRONTEND_AI_EVAL_MODEL: input.model ?? "",
			WMILL_FRONTEND_AI_EVAL_PROGRESS: '1',
			WMILL_FRONTEND_AI_EVAL_VERBOSE: input.verbose ? '1' : '0',
			WMILL_FRONTEND_AI_EVAL_BACKEND_VALIDATION: input.backendValidation ?? ''
		}

		await runVitestBenchmark(vitestBinary, vitestArgs, { cwd: FRONTEND_DIR, env })

		return JSON.parse(await readFile(outputPath, 'utf8')) as BenchmarkRunResult
	} catch (error) {
		throw new Error(`Frontend benchmark adapter failed:\n${toErrorMessage(error)}`)
	} finally {
		await rm(tempDir, { recursive: true, force: true })
	}
}
|
||||
|
||||
/**
 * Spawns the benchmark command and waits for it to finish.
 *
 * stdout is buffered for error reporting. stderr is processed line by line
 * through `drainProgressLines`, which renders progress events and echoes other
 * output; the echoed text is also buffered for error reporting.
 *
 * @param command Executable to spawn.
 * @param args Arguments for the executable.
 * @param options Working directory and environment for the child.
 * @throws Error containing the exit status (code, or terminating signal) plus
 * the captured stdout and stderr when the child does not exit with code 0.
 */
async function runVitestBenchmark(
	command: string,
	args: string[],
	options: {
		cwd: string
		env: NodeJS.ProcessEnv
	}
): Promise<void> {
	const child = spawn(command, args, {
		cwd: options.cwd,
		env: options.env,
		stdio: ['ignore', 'pipe', 'pipe']
	})

	let stdout = ''
	let stderr = ''
	// Holds the trailing partial line of stderr until its newline arrives.
	let stderrLineBuffer = ''
	let assistantStreamOpen = false

	// Drains all complete lines from `text` and stores the updated streaming state.
	const consumeStderr = (text: string): void => {
		const drained = drainProgressLines(text, assistantStreamOpen)
		stderrLineBuffer = drained.remainder
		stderr += drained.passthrough
		assistantStreamOpen = drained.nextAssistantStreamOpen
	}

	child.stdout?.setEncoding('utf8')
	child.stdout?.on('data', (chunk: string) => {
		stdout += chunk
	})

	child.stderr?.setEncoding('utf8')
	child.stderr?.on('data', (chunk: string) => {
		consumeStderr(stderrLineBuffer + chunk)
	})

	await new Promise<void>((resolve, reject) => {
		child.once('error', reject)
		child.once('close', (code, signal) => {
			if (stderrLineBuffer.length > 0) {
				// Flush a final line that was not newline-terminated.
				consumeStderr(`${stderrLineBuffer}\n`)
			}

			if (code === 0) {
				if (assistantStreamOpen) {
					// Terminate an assistant stream that never received its end event.
					process.stderr.write('\n')
				}
				resolve()
				return
			}

			// `code` is null when the child was terminated by a signal.
			const status =
				code === null
					? `vitest was terminated by signal ${signal ?? 'unknown'}`
					: `vitest exited with code ${code}`
			reject(new Error([status, stdout, stderr].filter(Boolean).join('\n')))
		})
	})
}
|
||||
|
||||
/**
 * Consumes every complete line in `buffer` and routes it.
 *
 * - Assistant streaming events are rendered to stderr: a header line on
 *   message start, raw chunk text, and a closing newline on message end.
 * - Any other progress event is formatted and written to stderr as its own line.
 * - Lines matched by `shouldSuppressFrontendStderrLine` are dropped.
 * - All remaining lines are echoed to stderr and collected in `passthrough`.
 *
 * @param buffer Text that may end with a partial line.
 * @param initialAssistantStreamOpen Whether an assistant message was mid-stream before this call.
 * @returns `remainder` (the trailing text without a newline), `passthrough`
 * (the echoed non-progress lines), and the updated streaming flag.
 */
function drainProgressLines(
	buffer: string,
	initialAssistantStreamOpen: boolean
): {
	remainder: string
	passthrough: string
	nextAssistantStreamOpen: boolean
} {
	let remainder = buffer
	let passthrough = ''
	let assistantStreamOpen = initialAssistantStreamOpen

	while (true) {
		const newlineIndex = remainder.indexOf('\n')
		if (newlineIndex === -1) {
			return { remainder, passthrough, nextAssistantStreamOpen: assistantStreamOpen }
		}

		// Strip a trailing carriage return so CRLF output is handled like LF output.
		const line = remainder.slice(0, newlineIndex).replace(/\r$/, '')
		remainder = remainder.slice(newlineIndex + 1)

		const progressEvent = parseFrontendBenchmarkProgressLine(line)
		if (progressEvent) {
			if (progressEvent.type === 'assistant-message-start') {
				if (assistantStreamOpen) {
					// Close the previous, unterminated assistant stream first.
					process.stderr.write('\n')
				}
				process.stderr.write(
					`${formatCasePrefix(progressEvent.caseNumber, progressEvent.totalCases)} ${progressEvent.caseId} attempt ${progressEvent.attempt}/${progressEvent.runs} assistant:\n`
				)
				assistantStreamOpen = true
				continue
			}

			if (progressEvent.type === 'assistant-chunk') {
				// Chunks are written verbatim, without a newline, so the text streams inline.
				process.stderr.write(progressEvent.chunk)
				continue
			}

			if (progressEvent.type === 'assistant-message-end') {
				if (assistantStreamOpen) {
					process.stderr.write('\n')
				}
				assistantStreamOpen = false
				continue
			}

			// Any other progress event interrupts an open assistant stream.
			if (assistantStreamOpen) {
				process.stderr.write('\n')
				assistantStreamOpen = false
			}
			process.stderr.write(`${formatFrontendBenchmarkProgressEvent(progressEvent)}\n`)
			continue
		}

		if (shouldSuppressFrontendStderrLine(line)) {
			continue
		}

		passthrough += `${line}\n`
		process.stderr.write(`${line}\n`)
	}
}
|
||||
|
||||
/**
 * Builds the bracketed `[current/total]` label placed in front of per-case output.
 *
 * @param caseNumber - Number of the case being reported.
 * @param totalCases - Total number of cases in the run.
 * @returns The label, e.g. `[2/10]`.
 */
function formatCasePrefix(caseNumber: number, totalCases: number): string {
  const position = String(caseNumber)
  const total = String(totalCases)
  return '[' + position + '/' + total + ']'
}
|
||||
|
||||
/**
 * Decides whether a stderr line should be dropped instead of passed through.
 *
 * A line is suppressed when it starts with one of the known prefixes or
 * contains one of the known fragments listed below.
 *
 * @param line - One stderr line, without its trailing newline.
 * @returns `true` when the line should be suppressed.
 */
function shouldSuppressFrontendStderrLine(line: string): boolean {
  const suppressedPrefixes = [
    '[baseline-browser-mapping] ',
    'Browserslist: browsers data (caniuse-lite) is '
  ]
  const suppressedFragments = ['update-browserslist-db@latest', 'update-db#readme']

  if (suppressedPrefixes.some((prefix) => line.startsWith(prefix))) {
    return true
  }
  return suppressedFragments.some((fragment) => line.includes(fragment))
}
|
||||
|
||||
/**
 * Converts an arbitrary thrown value into a human-readable message.
 *
 * - `Error` instances yield their `message`.
 * - Other objects yield their string `message` property when they have one;
 *   otherwise their own string conversion, falling back to JSON when that
 *   conversion is only the uninformative default `[object Object]`.
 * - Primitives (including `null` and `undefined`) are converted with `String`.
 *
 * This function never throws: objects that cannot be stringified or serialised
 * (null-prototype objects, circular structures) yield their
 * `Object.prototype.toString` tag instead.
 *
 * @param error - The caught value; may be anything.
 * @returns A message describing `error`.
 */
function toErrorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message
  }

  if (typeof error === 'object' && error !== null) {
    // Error-like objects (e.g. rejected API payloads) often carry a message.
    const { message } = error as { message?: unknown }
    if (typeof message === 'string') {
      return message
    }

    try {
      const text = String(error)
      if (text !== '[object Object]') {
        // The object has a meaningful string conversion of its own; keep it.
        return text
      }
      // JSON.stringify returns undefined when toJSON yields undefined.
      return JSON.stringify(error) ?? text
    } catch {
      // String() throws for null-prototype objects; JSON.stringify throws for
      // circular structures and BigInt values.
      return Object.prototype.toString.call(error)
    }
  }

  return String(error)
}
|
||||
28
ai_evals/adapters/frontend/vitest.config.ts
Normal file
28
ai_evals/adapters/frontend/vitest.config.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
import { fileURLToPath } from 'node:url'
|
||||
import frontendConfig from '../../../frontend/vite.config.js'
|
||||
|
||||
// Absolute filesystem paths, resolved relative to this config file.
const FRONTEND_VITE_CONFIG_PATH = fileURLToPath(
  new URL('../../../frontend/vite.config.js', import.meta.url)
)
const FRONTEND_TEST_SETUP_PATH = fileURLToPath(
  new URL('../../../frontend/src/lib/test-setup.ts', import.meta.url)
)
const ADAPTER_TEST_PATH = fileURLToPath(new URL('./vitestAdapter.test.ts', import.meta.url))

// The single test project: it extends the frontend Vite config, runs in the
// `node` environment, includes only the adapter test file, and loads the
// frontend test setup file.
const serverProject = {
  extends: FRONTEND_VITE_CONFIG_PATH,
  test: {
    name: 'server',
    environment: 'node',
    include: [ADAPTER_TEST_PATH],
    setupFiles: [FRONTEND_TEST_SETUP_PATH]
  }
}

// Start from the frontend test options and replace `projects` with the one above.
const testOptions = {
  ...frontendConfig.test,
  projects: [serverProject]
}

// Frontend config with its `test` section swapped for the adapter-specific one.
const config = {
  ...frontendConfig,
  test: testOptions
}

export default config
|
||||
165
ai_evals/adapters/frontend/vitestAdapter.test.ts
Normal file
165
ai_evals/adapters/frontend/vitestAdapter.test.ts
Normal file
@@ -0,0 +1,165 @@
|
||||
import { expect, it, vi } from 'vitest'
|
||||
// @ts-ignore - Node.js fs/promises
|
||||
import { mkdir, writeFile } from 'fs/promises'
|
||||
// @ts-ignore - Node.js path
|
||||
import { dirname, resolve } from 'path'
|
||||
|
||||
// Replaces `monaco-editor` with a small stub: empty `editor`, `languages` and
// `KeyCode` namespaces, a `Uri.parse` that echoes its input back through
// `toString()`, and the numeric `MarkerSeverity` levels.
vi.mock('monaco-editor', () => {
  const Uri = {
    parse: (value: string) => ({ toString: () => value })
  }
  const MarkerSeverity = {
    Error: 8,
    Warning: 4,
    Info: 2,
    Hint: 1
  }
  return {
    editor: {},
    languages: {},
    KeyCode: {},
    Uri,
    MarkerSeverity
  }
})

// Stub for the TypeScript language features package: `getTypeScriptWorker`
// resolves to an async accessor that yields an empty object.
vi.mock('@codingame/monaco-vscode-standalone-typescript-language-features', () => {
  const getTypeScriptWorker = async () => async () => ({})
  return {
    getTypeScriptWorker,
    typescriptVersion: 'test'
  }
})

// Stub whose default export is a function returning an empty object.
vi.mock('@codingame/monaco-vscode-languages-service-override', () => {
  return {
    default: () => ({})
  }
})

// The vscode component module is replaced by an empty module.
vi.mock('$lib/components/vscode', () => {
  return {}
})
|
||||
|
||||
// Partially mocks `$lib/gen`: the real module is kept, but selected methods of
// ScriptService, FlowService and JobService are routed to the in-memory
// benchmark backend (`./mockBackend`) whenever the request targets a workspace
// for which `hasBenchmarkWorkspace` returns true. Requests for any other
// workspace fall through to the real service implementation.
vi.mock('$lib/gen', async () => {
  const actual = await vi.importActual<any>('$lib/gen')
  const {
    getBenchmarkCompletedJob,
    getBenchmarkFlowByPath,
    getBenchmarkScriptByHash,
    getBenchmarkScriptByPath,
    hasBenchmarkWorkspace,
    listBenchmarkFlows,
    listBenchmarkScripts,
    runBenchmarkFlowByPath,
    runBenchmarkScriptPreview
  } = await import('./mockBackend')

  /**
   * Wraps `target` in a Proxy that serves the members named in `overrides`
   * and forwards every other property read to `target`.
   */
  function wrapService<T extends object>(target: T, overrides: Record<string, unknown>): T {
    return new Proxy(target, {
      get(source, property, receiver) {
        // Own-property check on purpose: the `in` operator also matches keys
        // inherited from Object.prototype (`constructor`, `toString`, ...),
        // which would shadow the real service's members with the plain
        // object's ones.
        if (
          typeof property === 'string' &&
          Object.prototype.hasOwnProperty.call(overrides, property)
        ) {
          return overrides[property]
        }
        return Reflect.get(source, property, receiver)
      }
    })
  }

  return {
    ...actual,
    ScriptService: wrapService(actual.ScriptService, {
      listScripts: async (data: { workspace: string }) =>
        hasBenchmarkWorkspace(data.workspace)
          ? (listBenchmarkScripts(data.workspace) ?? [])
          : actual.ScriptService.listScripts(data),
      getScriptByPath: async (data: { workspace: string; path: string }) => {
        if (hasBenchmarkWorkspace(data.workspace)) {
          const script = getBenchmarkScriptByPath(data.workspace, data.path)
          if (!script) {
            throw new Error(`Script "${data.path}" not found in benchmark workspace`)
          }
          return script
        }
        return actual.ScriptService.getScriptByPath(data)
      },
      getScriptByHash: async (data: { workspace: string; hash: string }) => {
        if (hasBenchmarkWorkspace(data.workspace)) {
          const script = getBenchmarkScriptByHash(data.workspace, data.hash)
          if (!script) {
            throw new Error(`Script hash "${data.hash}" not found in benchmark workspace`)
          }
          return script
        }
        return actual.ScriptService.getScriptByHash(data)
      }
    }),
    FlowService: wrapService(actual.FlowService, {
      listFlows: async (data: { workspace: string }) =>
        hasBenchmarkWorkspace(data.workspace)
          ? (listBenchmarkFlows(data.workspace) ?? [])
          : actual.FlowService.listFlows(data),
      getFlowByPath: async (data: { workspace: string; path: string }) => {
        if (hasBenchmarkWorkspace(data.workspace)) {
          const flow = getBenchmarkFlowByPath(data.workspace, data.path)
          if (!flow) {
            throw new Error(`Flow "${data.path}" not found in benchmark workspace`)
          }
          return flow
        }
        return actual.FlowService.getFlowByPath(data)
      }
    }),
    JobService: wrapService(actual.JobService, {
      runScriptPreview: async (data: {
        workspace: string
        requestBody?: {
          content?: string
          language?: string
          args?: Record<string, unknown>
          path?: string
        }
      }) =>
        hasBenchmarkWorkspace(data.workspace)
          ? runBenchmarkScriptPreview({
              workspace: data.workspace,
              // The mock backend always receives a request body object.
              requestBody: data.requestBody ?? {}
            })
          : actual.JobService.runScriptPreview(data),
      runFlowByPath: async (data: {
        workspace: string
        path: string
        requestBody?: Record<string, unknown>
      }) =>
        hasBenchmarkWorkspace(data.workspace)
          ? runBenchmarkFlowByPath({
              workspace: data.workspace,
              path: data.path,
              // The request body is forwarded as the flow's arguments.
              args: data.requestBody
            })
          : actual.JobService.runFlowByPath(data),
      getJob: async (data: { workspace: string; id: string }) => {
        if (hasBenchmarkWorkspace(data.workspace)) {
          const job = getBenchmarkCompletedJob(data.workspace, data.id)
          if (!job) {
            throw new Error(`Job "${data.id}" not found in benchmark workspace`)
          }
          return job
        }
        return actual.JobService.getJob(data)
      }
    })
  }
})
|
||||
|
||||
// The benchmark only runs when an output path is supplied through the
// environment; without it the test is registered as skipped.
const benchmarkOutputPath = process.env.WMILL_FRONTEND_AI_EVAL_OUTPUT_PATH
const benchmarkIt = !benchmarkOutputPath ? it.skip : it

benchmarkIt(
  'runs the frontend benchmark adapter from environment input',
  async () => {
    // Reset the mock backend before the run; the `finally` below resets it
    // again afterwards, whether or not the run succeeded.
    const mockBackend = await import('./mockBackend')
    mockBackend.resetBenchmarkMockBackend()
    const benchmarkRunner = await import('./benchmarkRunner')

    try {
      const payload = await benchmarkRunner.runFrontendBenchmarkFromEnv()

      // Write the payload as pretty-printed JSON with a trailing newline,
      // creating the target directory if needed. This happens before the
      // assertion, so the file is written even when the assertion fails.
      const outputFile = resolve(benchmarkOutputPath!)
      const serializedPayload = `${JSON.stringify(payload, null, 2)}\n`
      await mkdir(dirname(outputFile), { recursive: true })
      await writeFile(outputFile, serializedPayload, 'utf8')

      expect(payload.cases.length).toBeGreaterThan(0)
    } finally {
      mockBackend.resetBenchmarkMockBackend()
    }
  },
  // Per-test timeout in milliseconds (10 minutes).
  600_000
)
|
||||
313
ai_evals/bun.lock
Normal file
313
ai_evals/bun.lock
Normal file
@@ -0,0 +1,313 @@
|
||||
{
|
||||
"lockfileVersion": 1,
|
||||
"configVersion": 1,
|
||||
"workspaces": {
|
||||
"": {
|
||||
"name": "windmill-ai-evals",
|
||||
"dependencies": {
|
||||
"@anthropic-ai/claude-agent-sdk": "^0.2.25",
|
||||
"@anthropic-ai/sdk": "^0.39.0",
|
||||
"commander": "^14.0.3",
|
||||
"openai": "^6.9.1",
|
||||
"yaml": "^2.8.3",
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/bun": "latest",
|
||||
"typescript": "^5.0.0",
|
||||
},
|
||||
},
|
||||
},
|
||||
"packages": {
|
||||
"@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.87", "", { "dependencies": { "@anthropic-ai/sdk": "^0.74.0", "@modelcontextprotocol/sdk": "^1.27.1" }, "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-WWmgBPxPhBOvNT0ujI8vPTI2lK+w5YEkEZ/y1mH0EDkK/0kBnxVJNhCtG5vnueiAViwLoUOFn66pbkDiivijdA=="],
|
||||
|
||||
"@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.39.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" } }, "sha512-eMyDIPRZbt1CCLErRCi3exlAvNkBtRe+kW5vvJyef93PmNr/clstYgHhtvmkxN82nlKgzyGPCyGxrm0JQ1ZIdg=="],
|
||||
|
||||
"@babel/runtime": ["@babel/runtime@7.29.2", "", {}, "sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g=="],
|
||||
|
||||
"@hono/node-server": ["@hono/node-server@1.19.12", "", { "peerDependencies": { "hono": "^4" } }, "sha512-txsUW4SQ1iilgE0l9/e9VQWmELXifEFvmdA1j6WFh/aFPj99hIntrSsq/if0UWyGVkmrRPKA1wCeP+UCr1B9Uw=="],
|
||||
|
||||
"@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.2.4" }, "os": "darwin", "cpu": "arm64" }, "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w=="],
|
||||
|
||||
"@img/sharp-darwin-x64": ["@img/sharp-darwin-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-x64": "1.2.4" }, "os": "darwin", "cpu": "x64" }, "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw=="],
|
||||
|
||||
"@img/sharp-libvips-darwin-arm64": ["@img/sharp-libvips-darwin-arm64@1.2.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g=="],
|
||||
|
||||
"@img/sharp-libvips-darwin-x64": ["@img/sharp-libvips-darwin-x64@1.2.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg=="],
|
||||
|
||||
"@img/sharp-libvips-linux-arm": ["@img/sharp-libvips-linux-arm@1.2.4", "", { "os": "linux", "cpu": "arm" }, "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A=="],
|
||||
|
||||
"@img/sharp-libvips-linux-arm64": ["@img/sharp-libvips-linux-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw=="],
|
||||
|
||||
"@img/sharp-libvips-linux-x64": ["@img/sharp-libvips-linux-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw=="],
|
||||
|
||||
"@img/sharp-libvips-linuxmusl-arm64": ["@img/sharp-libvips-linuxmusl-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw=="],
|
||||
|
||||
"@img/sharp-libvips-linuxmusl-x64": ["@img/sharp-libvips-linuxmusl-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg=="],
|
||||
|
||||
"@img/sharp-linux-arm": ["@img/sharp-linux-arm@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm": "1.2.4" }, "os": "linux", "cpu": "arm" }, "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw=="],
|
||||
|
||||
"@img/sharp-linux-arm64": ["@img/sharp-linux-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg=="],
|
||||
|
||||
"@img/sharp-linux-x64": ["@img/sharp-linux-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ=="],
|
||||
|
||||
"@img/sharp-linuxmusl-arm64": ["@img/sharp-linuxmusl-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg=="],
|
||||
|
||||
"@img/sharp-linuxmusl-x64": ["@img/sharp-linuxmusl-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q=="],
|
||||
|
||||
"@img/sharp-win32-arm64": ["@img/sharp-win32-arm64@0.34.5", "", { "os": "win32", "cpu": "arm64" }, "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g=="],
|
||||
|
||||
"@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.34.5", "", { "os": "win32", "cpu": "x64" }, "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw=="],
|
||||
|
||||
"@modelcontextprotocol/sdk": ["@modelcontextprotocol/sdk@1.29.0", "", { "dependencies": { "@hono/node-server": "^1.19.9", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.2.1", "express-rate-limit": "^8.2.1", "hono": "^4.11.4", "jose": "^6.1.3", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.1" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ=="],
|
||||
|
||||
"@types/bun": ["@types/bun@1.3.11", "", { "dependencies": { "bun-types": "1.3.11" } }, "sha512-5vPne5QvtpjGpsGYXiFyycfpDF2ECyPcTSsFBMa0fraoxiQyMJ3SmuQIGhzPg2WJuWxVBoxWJ2kClYTcw/4fAg=="],
|
||||
|
||||
"@types/node": ["@types/node@18.19.130", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg=="],
|
||||
|
||||
"@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="],
|
||||
|
||||
"abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="],
|
||||
|
||||
"accepts": ["accepts@2.0.0", "", { "dependencies": { "mime-types": "^3.0.0", "negotiator": "^1.0.0" } }, "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng=="],
|
||||
|
||||
"agentkeepalive": ["agentkeepalive@4.6.0", "", { "dependencies": { "humanize-ms": "^1.2.1" } }, "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ=="],
|
||||
|
||||
"ajv": ["ajv@8.18.0", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A=="],
|
||||
|
||||
"ajv-formats": ["ajv-formats@3.0.1", "", { "dependencies": { "ajv": "^8.0.0" } }, "sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ=="],
|
||||
|
||||
"asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="],
|
||||
|
||||
"body-parser": ["body-parser@2.2.2", "", { "dependencies": { "bytes": "^3.1.2", "content-type": "^1.0.5", "debug": "^4.4.3", "http-errors": "^2.0.0", "iconv-lite": "^0.7.0", "on-finished": "^2.4.1", "qs": "^6.14.1", "raw-body": "^3.0.1", "type-is": "^2.0.1" } }, "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA=="],
|
||||
|
||||
"bun-types": ["bun-types@1.3.11", "", { "dependencies": { "@types/node": "*" } }, "sha512-1KGPpoxQWl9f6wcZh57LvrPIInQMn2TQ7jsgxqpRzg+l0QPOFvJVH7HmvHo/AiPgwXy+/Thf6Ov3EdVn1vOabg=="],
|
||||
|
||||
"bytes": ["bytes@3.1.2", "", {}, "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg=="],
|
||||
|
||||
"call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="],
|
||||
|
||||
"call-bound": ["call-bound@1.0.4", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "get-intrinsic": "^1.3.0" } }, "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg=="],
|
||||
|
||||
"combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="],
|
||||
|
||||
"commander": ["commander@14.0.3", "", {}, "sha512-H+y0Jo/T1RZ9qPP4Eh1pkcQcLRglraJaSLoyOtHxu6AapkjWVCy2Sit1QQ4x3Dng8qDlSsZEet7g5Pq06MvTgw=="],
|
||||
|
||||
"content-disposition": ["content-disposition@1.0.1", "", {}, "sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q=="],
|
||||
|
||||
"content-type": ["content-type@1.0.5", "", {}, "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA=="],
|
||||
|
||||
"cookie": ["cookie@0.7.2", "", {}, "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w=="],
|
||||
|
||||
"cookie-signature": ["cookie-signature@1.2.2", "", {}, "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg=="],
|
||||
|
||||
"cors": ["cors@2.8.6", "", { "dependencies": { "object-assign": "^4", "vary": "^1" } }, "sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw=="],
|
||||
|
||||
"cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="],
|
||||
|
||||
"debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="],
|
||||
|
||||
"delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="],
|
||||
|
||||
"depd": ["depd@2.0.0", "", {}, "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw=="],
|
||||
|
||||
"dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="],
|
||||
|
||||
"ee-first": ["ee-first@1.1.1", "", {}, "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow=="],
|
||||
|
||||
"encodeurl": ["encodeurl@2.0.0", "", {}, "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg=="],
|
||||
|
||||
"es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="],
|
||||
|
||||
"es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="],
|
||||
|
||||
"es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="],
|
||||
|
||||
"es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="],
|
||||
|
||||
"escape-html": ["escape-html@1.0.3", "", {}, "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow=="],
|
||||
|
||||
"etag": ["etag@1.8.1", "", {}, "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg=="],
|
||||
|
||||
"event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="],
|
||||
|
||||
"eventsource": ["eventsource@3.0.7", "", { "dependencies": { "eventsource-parser": "^3.0.1" } }, "sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA=="],
|
||||
|
||||
"eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="],
|
||||
|
||||
"express": ["express@5.2.1", "", { "dependencies": { "accepts": "^2.0.0", "body-parser": "^2.2.1", "content-disposition": "^1.0.0", "content-type": "^1.0.5", "cookie": "^0.7.1", "cookie-signature": "^1.2.1", "debug": "^4.4.0", "depd": "^2.0.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "finalhandler": "^2.1.0", "fresh": "^2.0.0", "http-errors": "^2.0.0", "merge-descriptors": "^2.0.0", "mime-types": "^3.0.0", "on-finished": "^2.4.1", "once": "^1.4.0", "parseurl": "^1.3.3", "proxy-addr": "^2.0.7", "qs": "^6.14.0", "range-parser": "^1.2.1", "router": "^2.2.0", "send": "^1.1.0", "serve-static": "^2.2.0", "statuses": "^2.0.1", "type-is": "^2.0.1", "vary": "^1.1.2" } }, "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw=="],
|
||||
|
||||
"express-rate-limit": ["express-rate-limit@8.3.2", "", { "dependencies": { "ip-address": "10.1.0" }, "peerDependencies": { "express": ">= 4.11" } }, "sha512-77VmFeJkO0/rvimEDuUC5H30oqUC4EyOhyGccfqoLebB0oiEYfM7nwPrsDsBL1gsTpwfzX8SFy2MT3TDyRq+bg=="],
|
||||
|
||||
"fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="],
|
||||
|
||||
"fast-uri": ["fast-uri@3.1.0", "", {}, "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA=="],
|
||||
|
||||
"finalhandler": ["finalhandler@2.1.1", "", { "dependencies": { "debug": "^4.4.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "on-finished": "^2.4.1", "parseurl": "^1.3.3", "statuses": "^2.0.1" } }, "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA=="],
|
||||
|
||||
"form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="],
|
||||
|
||||
"form-data-encoder": ["form-data-encoder@1.7.2", "", {}, "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="],
|
||||
|
||||
"formdata-node": ["formdata-node@4.4.1", "", { "dependencies": { "node-domexception": "1.0.0", "web-streams-polyfill": "4.0.0-beta.3" } }, "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ=="],
|
||||
|
||||
"forwarded": ["forwarded@0.2.0", "", {}, "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow=="],
|
||||
|
||||
"fresh": ["fresh@2.0.0", "", {}, "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A=="],
|
||||
|
||||
"function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="],
|
||||
|
||||
"get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="],
|
||||
|
||||
"get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="],
|
||||
|
||||
"gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="],
|
||||
|
||||
"has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="],
|
||||
|
||||
"has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="],
|
||||
|
||||
"hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="],
|
||||
|
||||
"hono": ["hono@4.12.9", "", {}, "sha512-wy3T8Zm2bsEvxKZM5w21VdHDDcwVS1yUFFY6i8UobSsKfFceT7TOwhbhfKsDyx7tYQlmRM5FLpIuYvNFyjctiA=="],
|
||||
|
||||
"http-errors": ["http-errors@2.0.1", "", { "dependencies": { "depd": "~2.0.0", "inherits": "~2.0.4", "setprototypeof": "~1.2.0", "statuses": "~2.0.2", "toidentifier": "~1.0.1" } }, "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ=="],
|
||||
|
||||
"humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="],
|
||||
|
||||
"iconv-lite": ["iconv-lite@0.7.2", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw=="],
|
||||
|
||||
"inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="],
|
||||
|
||||
"ip-address": ["ip-address@10.1.0", "", {}, "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q=="],
|
||||
|
||||
"ipaddr.js": ["ipaddr.js@1.9.1", "", {}, "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g=="],
|
||||
|
||||
"is-promise": ["is-promise@4.0.0", "", {}, "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ=="],
|
||||
|
||||
"isexe": ["isexe@2.0.0", "", {}, "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="],
|
||||
|
||||
"jose": ["jose@6.2.2", "", {}, "sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ=="],
|
||||
|
||||
"json-schema-to-ts": ["json-schema-to-ts@3.1.1", "", { "dependencies": { "@babel/runtime": "^7.18.3", "ts-algebra": "^2.0.0" } }, "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g=="],
|
||||
|
||||
"json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="],
|
||||
|
||||
"json-schema-typed": ["json-schema-typed@8.0.2", "", {}, "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA=="],
|
||||
|
||||
"math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="],
|
||||
|
||||
"media-typer": ["media-typer@1.1.0", "", {}, "sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw=="],
|
||||
|
||||
"merge-descriptors": ["merge-descriptors@2.0.0", "", {}, "sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g=="],
|
||||
|
||||
"mime-db": ["mime-db@1.54.0", "", {}, "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ=="],
|
||||
|
||||
"mime-types": ["mime-types@3.0.2", "", { "dependencies": { "mime-db": "^1.54.0" } }, "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A=="],
|
||||
|
||||
"ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="],
|
||||
|
||||
"negotiator": ["negotiator@1.0.0", "", {}, "sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg=="],
|
||||
|
||||
"node-domexception": ["node-domexception@1.0.0", "", {}, "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ=="],
|
||||
|
||||
"node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="],
|
||||
|
||||
"object-assign": ["object-assign@4.1.1", "", {}, "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg=="],
|
||||
|
||||
"object-inspect": ["object-inspect@1.13.4", "", {}, "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew=="],
|
||||
|
||||
"on-finished": ["on-finished@2.4.1", "", { "dependencies": { "ee-first": "1.1.1" } }, "sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg=="],
|
||||
|
||||
"once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="],
|
||||
|
||||
"openai": ["openai@6.34.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-yEr2jdGf4tVFYG6ohmr3pF6VJuveP0EA/sS8TBx+4Eq5NT10alu5zg2dmxMXMgqpihRDQlFGpRt2XwsGj+Fyxw=="],
|
||||
|
||||
"parseurl": ["parseurl@1.3.3", "", {}, "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="],
|
||||
|
||||
"path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="],
|
||||
|
||||
"path-to-regexp": ["path-to-regexp@8.4.1", "", {}, "sha512-fvU78fIjZ+SBM9YwCknCvKOUKkLVqtWDVctl0s7xIqfmfb38t2TT4ZU2gHm+Z8xGwgW+QWEU3oQSAzIbo89Ggw=="],
|
||||
|
||||
"pkce-challenge": ["pkce-challenge@5.0.1", "", {}, "sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ=="],
|
||||
|
||||
"proxy-addr": ["proxy-addr@2.0.7", "", { "dependencies": { "forwarded": "0.2.0", "ipaddr.js": "1.9.1" } }, "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg=="],
|
||||
|
||||
"qs": ["qs@6.15.0", "", { "dependencies": { "side-channel": "^1.1.0" } }, "sha512-mAZTtNCeetKMH+pSjrb76NAM8V9a05I9aBZOHztWy/UqcJdQYNsf59vrRKWnojAT9Y+GbIvoTBC++CPHqpDBhQ=="],
|
||||
|
||||
"range-parser": ["range-parser@1.2.1", "", {}, "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg=="],
|
||||
|
||||
"raw-body": ["raw-body@3.0.2", "", { "dependencies": { "bytes": "~3.1.2", "http-errors": "~2.0.1", "iconv-lite": "~0.7.0", "unpipe": "~1.0.0" } }, "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA=="],
|
||||
|
||||
"require-from-string": ["require-from-string@2.0.2", "", {}, "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw=="],
|
||||
|
||||
"router": ["router@2.2.0", "", { "dependencies": { "debug": "^4.4.0", "depd": "^2.0.0", "is-promise": "^4.0.0", "parseurl": "^1.3.3", "path-to-regexp": "^8.0.0" } }, "sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ=="],
|
||||
|
||||
"safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="],
|
||||
|
||||
"send": ["send@1.2.1", "", { "dependencies": { "debug": "^4.4.3", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "fresh": "^2.0.0", "http-errors": "^2.0.1", "mime-types": "^3.0.2", "ms": "^2.1.3", "on-finished": "^2.4.1", "range-parser": "^1.2.1", "statuses": "^2.0.2" } }, "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ=="],
|
||||
|
||||
"serve-static": ["serve-static@2.2.1", "", { "dependencies": { "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "parseurl": "^1.3.3", "send": "^1.2.0" } }, "sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw=="],
|
||||
|
||||
"setprototypeof": ["setprototypeof@1.2.0", "", {}, "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw=="],
|
||||
|
||||
"shebang-command": ["shebang-command@2.0.0", "", { "dependencies": { "shebang-regex": "^3.0.0" } }, "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA=="],
|
||||
|
||||
"shebang-regex": ["shebang-regex@3.0.0", "", {}, "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A=="],
|
||||
|
||||
"side-channel": ["side-channel@1.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3", "side-channel-list": "^1.0.0", "side-channel-map": "^1.0.1", "side-channel-weakmap": "^1.0.2" } }, "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw=="],
|
||||
|
||||
"side-channel-list": ["side-channel-list@1.0.0", "", { "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3" } }, "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA=="],
|
||||
|
||||
"side-channel-map": ["side-channel-map@1.0.1", "", { "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", "get-intrinsic": "^1.2.5", "object-inspect": "^1.13.3" } }, "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA=="],
|
||||
|
||||
"side-channel-weakmap": ["side-channel-weakmap@1.0.2", "", { "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", "get-intrinsic": "^1.2.5", "object-inspect": "^1.13.3", "side-channel-map": "^1.0.1" } }, "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A=="],
|
||||
|
||||
"statuses": ["statuses@2.0.2", "", {}, "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw=="],
|
||||
|
||||
"toidentifier": ["toidentifier@1.0.1", "", {}, "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA=="],
|
||||
|
||||
"tr46": ["tr46@0.0.3", "", {}, "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="],
|
||||
|
||||
"ts-algebra": ["ts-algebra@2.0.0", "", {}, "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw=="],
|
||||
|
||||
"type-is": ["type-is@2.0.1", "", { "dependencies": { "content-type": "^1.0.5", "media-typer": "^1.1.0", "mime-types": "^3.0.0" } }, "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw=="],
|
||||
|
||||
"typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
|
||||
|
||||
"undici-types": ["undici-types@5.26.5", "", {}, "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="],
|
||||
|
||||
"unpipe": ["unpipe@1.0.0", "", {}, "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ=="],
|
||||
|
||||
"vary": ["vary@1.1.2", "", {}, "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg=="],
|
||||
|
||||
"web-streams-polyfill": ["web-streams-polyfill@4.0.0-beta.3", "", {}, "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug=="],
|
||||
|
||||
"webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="],
|
||||
|
||||
"whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", "webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="],
|
||||
|
||||
"which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
|
||||
|
||||
"wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="],
|
||||
|
||||
"yaml": ["yaml@2.8.3", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-AvbaCLOO2Otw/lW5bmh9d/WEdcDFdQp2Z2ZUH3pX9U2ihyUY0nvLv7J6TrWowklRGPYbB/IuIMfYgxaCPg5Bpg=="],
|
||||
|
||||
"zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="],
|
||||
|
||||
"zod-to-json-schema": ["zod-to-json-schema@3.25.2", "", { "peerDependencies": { "zod": "^3.25.28 || ^4" } }, "sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA=="],
|
||||
|
||||
"@anthropic-ai/claude-agent-sdk/@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.74.0", "", { "dependencies": { "json-schema-to-ts": "^3.1.1" }, "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" }, "optionalPeers": ["zod"], "bin": { "anthropic-ai-sdk": "bin/cli" } }, "sha512-srbJV7JKsc5cQ6eVuFzjZO7UR3xEPJqPamHFIe29bs38Ij2IripoAhC0S5NslNbaFUYqBKypmmpzMTpqfHEUDw=="],
|
||||
|
||||
"@types/node-fetch/@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="],
|
||||
|
||||
"bun-types/@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="],
|
||||
|
||||
"form-data/mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="],
|
||||
|
||||
"@types/node-fetch/@types/node/undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
|
||||
|
||||
"bun-types/@types/node/undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
|
||||
|
||||
"form-data/mime-types/mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="],
|
||||
}
|
||||
}
|
||||
93
ai_evals/cases/app.yaml
Normal file
93
ai_evals/cases/app.yaml
Normal file
@@ -0,0 +1,93 @@
|
||||
- id: app-test1-counter-create
|
||||
prompt: |-
|
||||
Create a simple counter app with increment and decrement buttons.
|
||||
judgeChecklist:
|
||||
- shows the current count in the UI
|
||||
- includes an increment button
|
||||
- includes a decrement button
|
||||
- clicking the buttons updates the count correctly
|
||||
|
||||
- id: app-test2-counter-reset
|
||||
prompt: |-
|
||||
Add a reset button that sets the counter back to 0
|
||||
initial: ai_evals/fixtures/frontend/app/initial/test1_counter_app
|
||||
judgeChecklist:
|
||||
- adds a reset control to the existing counter app
|
||||
- clicking reset sets the count back to 0
|
||||
- keeps the existing increment and decrement behavior working
|
||||
|
||||
- id: app-test3-shopping-cart-quantity
|
||||
prompt: |-
|
||||
Add a quantity selector (+ and - buttons) to each cart item so users can adjust quantities without removing and re-adding items
|
||||
initial: ai_evals/fixtures/frontend/app/initial/shopping_cart
|
||||
judgeChecklist:
|
||||
- each cart item has visible plus and minus quantity controls
|
||||
- users can increase quantity without re-adding the product
|
||||
- users can decrease quantity from the cart UI
|
||||
- cart totals stay in sync with quantity changes
|
||||
|
||||
- id: app-test4-shopping-cart-discount
|
||||
prompt: |-
|
||||
Add a discount code input field in the cart.
|
||||
When the code "SAVE10" is entered, apply a 10% discount to the total
|
||||
initial: ai_evals/fixtures/frontend/app/initial/shopping_cart
|
||||
judgeChecklist:
|
||||
- adds a discount code input to the cart
|
||||
- recognizes the code SAVE10
|
||||
- applies a 10 percent discount to the displayed total
|
||||
- keeps the rest of the cart behavior intact
|
||||
|
||||
- id: app-test5-file-manager-search
|
||||
prompt: |-
|
||||
Add a search bar in the toolbar that filters files and folders by name as the user types
|
||||
initial: ai_evals/fixtures/frontend/app/initial/file_manager
|
||||
judgeChecklist:
|
||||
- adds a search input in the toolbar
|
||||
- filters files and folders by name as the user types
|
||||
- updates the visible file list from the search query
|
||||
- keeps the rest of the file manager usable
|
||||
|
||||
- id: app-test6-file-manager-inline-rename
|
||||
prompt: |-
|
||||
Let users rename files and folders directly from the file list without leaving the page.
|
||||
initial: ai_evals/fixtures/frontend/app/initial/file_manager
|
||||
judgeChecklist:
|
||||
- adds a visible rename action or inline edit mode in the file list
|
||||
- lets users edit an item's name directly from the list
|
||||
- saves the renamed item through the app's existing rename behavior
|
||||
- refreshes the displayed name after a successful rename
|
||||
|
||||
- id: app-test7-file-manager-select-all
|
||||
prompt: |-
|
||||
Add a "Select All" checkbox in the file list header and individual checkboxes for each file.
|
||||
Add a "Delete Selected" button that appears when items are selected
|
||||
initial: ai_evals/fixtures/frontend/app/initial/file_manager
|
||||
judgeChecklist:
|
||||
- adds a select-all control in the file list header
|
||||
- adds per-item selection controls
|
||||
- shows a delete-selected action only when there is a selection
|
||||
- deleting selected items updates the visible list
|
||||
|
||||
- id: app-test8-inventory-tracker-create
|
||||
prompt: |-
|
||||
Create an inventory tracker app for a small store.
|
||||
Users should be able to add items with a name, sku, quantity, and price, search items by name or sku, and delete items.
|
||||
The inventory should persist between sessions.
|
||||
judgeChecklist:
|
||||
- includes a form to add inventory items with name, sku, quantity, and price
|
||||
- shows a list or table of saved inventory items
|
||||
- supports searching or filtering by name or sku
|
||||
- lets users delete existing inventory items
|
||||
- persists the inventory data appropriately for a raw Windmill app
|
||||
|
||||
- id: app-test9-recipe-book-create
|
||||
prompt: |-
|
||||
Create a recipe book app where users can add recipes with a name, ingredients list, and instructions.
|
||||
Include a search bar to filter recipes by name and the ability to delete recipes.
|
||||
Recipes should persist between sessions.
|
||||
judgeChecklist:
|
||||
- includes a form to add recipes with name, ingredients, and instructions
|
||||
- shows saved recipes in the app
|
||||
- supports searching recipes by name
|
||||
- lets users delete recipes
|
||||
- persists recipes appropriately for a raw Windmill app
|
||||
66
ai_evals/cases/cli.yaml
Normal file
66
ai_evals/cases/cli.yaml
Normal file
@@ -0,0 +1,66 @@
|
||||
- id: bun-hello-script
|
||||
prompt: |-
|
||||
Create a Windmill Bun script at `f/evals/hello.ts`.
|
||||
It should take a `name` input and return a greeting object like `{ greeting: "Hello, Alice!" }`.
|
||||
expected: ai_evals/fixtures/cli/expected/bun-hello-script
|
||||
judgeChecklist:
|
||||
- creates the requested Bun script at f/evals/hello.ts
|
||||
- takes a name input
|
||||
- returns an object containing the greeting
|
||||
|
||||
- id: bun-hello-flow
|
||||
prompt: |-
|
||||
Create a Windmill flow at `f/evals/hello__flow`.
|
||||
It should take a `name` input and return a greeting object like `{ greeting: "Hello, Alice!" }`.
|
||||
Put the step code in `hello.ts`.
|
||||
expected: ai_evals/fixtures/cli/expected/bun-hello-flow
|
||||
judgeChecklist:
|
||||
- creates the requested flow folder with flow.yaml and hello.ts
|
||||
- wires the name input into the flow step
|
||||
- returns the greeting object
|
||||
|
||||
- id: python-add-numbers-script
|
||||
prompt: |-
|
||||
Add a Windmill Python script at `f/evals/add_numbers.py`.
|
||||
It should take `a` and `b` as inputs and return `{ "total": a + b }`.
|
||||
expected: ai_evals/fixtures/cli/expected/python-add-numbers-script
|
||||
judgeChecklist:
|
||||
- creates the requested Python script at f/evals/add_numbers.py
|
||||
- takes `a` and `b` as inputs
|
||||
- returns an object with total equal to a plus b
|
||||
|
||||
- id: bun-hello-script-uppercase
|
||||
prompt: |-
|
||||
Update `f/evals/hello.ts` so it accepts an optional `uppercase` boolean.
|
||||
Keep returning `{ greeting: ... }`, but when `uppercase` is true the greeting should be uppercased before returning it.
|
||||
initial: ai_evals/fixtures/cli/initial/bun-hello-script-uppercase
|
||||
expected: ai_evals/fixtures/cli/expected/bun-hello-script-uppercase
|
||||
judgeChecklist:
|
||||
- updates the existing hello.ts file rather than creating a new script
|
||||
- accepts an optional uppercase boolean input
|
||||
- keeps returning an object with greeting
|
||||
- uppercases the greeting when uppercase is true
|
||||
|
||||
- id: bun-hello-flow-punctuation
|
||||
prompt: |-
|
||||
Update the existing flow in `f/evals/hello__flow` so it also accepts an optional `punctuation` input.
|
||||
The greeting should use that punctuation and default to `!` when it is missing.
|
||||
initial: ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation
|
||||
expected: ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation
|
||||
judgeChecklist:
|
||||
- updates the existing hello flow instead of creating a new one
|
||||
- adds an optional punctuation input to the flow
|
||||
- updates the step code so the returned greeting uses punctuation
|
||||
- defaults punctuation to an exclamation mark when omitted
|
||||
|
||||
- id: flow-reuse-existing-script
|
||||
prompt: |-
|
||||
There is already a reusable greeting script at `f/lib/format_greeting.ts`.
|
||||
Create a flow at `f/evals/reuse_greeting__flow` that takes a `name` input and reuses that existing script instead of duplicating the logic inline.
|
||||
initial: ai_evals/fixtures/cli/initial/flow-reuse-existing-script
|
||||
expected: ai_evals/fixtures/cli/expected/flow-reuse-existing-script
|
||||
judgeChecklist:
|
||||
- creates the requested flow at f/evals/reuse_greeting__flow
|
||||
- reuses the existing script from f/lib by path
|
||||
- does not duplicate the greeting logic in a new inline script
|
||||
- wires the name input into the reused script
|
||||
335
ai_evals/cases/flow.yaml
Normal file
335
ai_evals/cases/flow.yaml
Normal file
@@ -0,0 +1,335 @@
|
||||
- id: flow-test0-sum-two-numbers
|
||||
prompt: |-
|
||||
Create a flow that takes two numbers, `a` and `b`, and returns their sum.
|
||||
Keep it simple and use a single step named `sum_numbers`.
|
||||
expected: ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json
|
||||
runtime:
|
||||
backendPreview:
|
||||
args:
|
||||
a: 4
|
||||
b: 5
|
||||
judgeChecklist:
|
||||
- "the flow takes `a` and `b` as inputs"
|
||||
- "the main step is named `sum_numbers`"
|
||||
- the flow returns the sum of the two numbers
|
||||
|
||||
- id: flow-test1-reuse-existing-script
|
||||
prompt: |-
|
||||
I need a flow that adds two numbers.
|
||||
If there is already a script in the workspace that does that, reuse it instead of rewriting the logic.
|
||||
The flow should take `a` and `b` as inputs and use a single step named `sum_numbers`.
|
||||
initial: ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json
|
||||
expected: ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json
|
||||
runtime:
|
||||
backendPreview:
|
||||
args:
|
||||
a: 2
|
||||
b: 3
|
||||
judgeChecklist:
|
||||
- "the flow takes `a` and `b` as inputs"
|
||||
- "the main step is named `sum_numbers`"
|
||||
- the flow reuses the existing workspace script instead of rewriting the addition logic
|
||||
|
||||
- id: flow-test2-call-existing-subflow
|
||||
prompt: |-
|
||||
Create a parent flow that adds two numbers by reusing an existing flow in the workspace if one already exists.
|
||||
The parent flow should take `a` and `b` as inputs and delegate the calculation instead of inlining it.
|
||||
Use a single step named `call_add_numbers`.
|
||||
initial: ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json
|
||||
expected: ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json
|
||||
runtime:
|
||||
backendPreview:
|
||||
args:
|
||||
a: 7
|
||||
b: 8
|
||||
judgeChecklist:
|
||||
- "the parent flow takes `a` and `b` as inputs"
|
||||
- "the main step is named `call_add_numbers`"
|
||||
- the parent flow delegates to an existing workspace subflow instead of inlining the addition logic
|
||||
|
||||
- id: flow-test3-branchone-routing
|
||||
prompt: |-
|
||||
Create a flow that routes incoming support requests based on the customer's tier.
|
||||
The input should contain a string field named `tier`.
|
||||
Free, pro, and enterprise requests should go to different queues, and unknown tiers should fall back to a default queue.
|
||||
Name the main routing step `route_by_tier`.
|
||||
expected: ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json
|
||||
judgeChecklist:
|
||||
- "the input schema includes a string field named `tier`"
|
||||
- "the main routing step is named `route_by_tier`"
|
||||
- free requests go to a free queue
|
||||
- pro requests go to a pro queue
|
||||
- enterprise requests go to an enterprise queue
|
||||
- unknown tiers fall back to a default queue
|
||||
|
||||
- id: flow-test4-order-processing-loop
|
||||
prompt: |-
|
||||
Build an order-processing flow.
|
||||
|
||||
The input should include an order with:
|
||||
- an `items` array containing `name`, `price`, and `quantity`
|
||||
- `customer_email`
|
||||
- `shipping_address`
|
||||
|
||||
The flow should:
|
||||
- validate that every item has a positive price and quantity
|
||||
- calculate the order total with 8% tax
|
||||
- check inventory for each item using placeholder availability data
|
||||
- create a shipment if everything is in stock, otherwise create a backorder
|
||||
- send a confirmation using placeholder email logic
|
||||
- return a final order summary with the status
|
||||
validate:
|
||||
schemaAnyOf:
|
||||
- requiredPaths:
|
||||
- order
|
||||
- order.items
|
||||
- order.customer_email
|
||||
- order.shipping_address
|
||||
- requiredPaths:
|
||||
- items
|
||||
- customer_email
|
||||
- shipping_address
|
||||
resolveResultsRefs: true
|
||||
judgeChecklist:
|
||||
- the flow validates that every item has a positive price and quantity
|
||||
- the flow calculates the order total with 8% tax
|
||||
- the flow checks inventory for each item using placeholder availability data
|
||||
- the flow creates a shipment if everything is in stock, otherwise a backorder
|
||||
- the flow sends a confirmation using placeholder email logic
|
||||
- the flow returns a final order summary with the resulting status
|
||||
|
||||
- id: flow-test5-parallel-data-pipeline
|
||||
prompt: |-
|
||||
Create a data-processing flow for three external data sources.
|
||||
|
||||
It should:
|
||||
- load a small placeholder configuration listing the three sources
|
||||
- fetch placeholder records from each source
|
||||
- clean and validate each source's records
|
||||
- combine everything into one dataset
|
||||
- compute an overall quality score
|
||||
- store the result differently depending on the score:
|
||||
- 90 or above goes to the primary database
|
||||
- 70 to 89 goes to a secondary database with a warning
|
||||
- below 70 goes to quarantine and triggers an alert
|
||||
- return a processing report with total records, quality score, and destination
|
||||
judgeChecklist:
|
||||
- the flow loads a placeholder configuration listing three external sources
|
||||
- the flow fetches placeholder records from each source
|
||||
- the flow cleans and validates each source's records
|
||||
- the flow combines everything into one dataset
|
||||
- the flow computes an overall quality score
|
||||
- scores of 90 or above go to the primary database
|
||||
- scores from 70 to 89 go to a secondary database with a warning
|
||||
- scores below 70 go to quarantine and trigger an alert
|
||||
- the final report includes total records, quality score, and destination
|
||||
|
||||
- id: flow-test6-ai-agent-tools
|
||||
prompt: |-
|
||||
Create a customer support flow.
|
||||
|
||||
The input should include `customer_id` and `query_text`.
|
||||
The flow should load the customer's profile and order history, then use an AI assistant to help with the request.
|
||||
The assistant should be able to:
|
||||
- look up orders
|
||||
- check refund eligibility
|
||||
- search FAQs
|
||||
- open a support ticket when needed
|
||||
|
||||
After that, log the interaction and return the assistant's response.
|
||||
judgeChecklist:
|
||||
- "the input schema includes `customer_id` and `query_text`"
|
||||
- the flow loads the customer's profile and order history
|
||||
- the flow uses an AI assistant step
|
||||
- the assistant can look up orders
|
||||
- the assistant can check refund eligibility
|
||||
- the assistant can search FAQs
|
||||
- the assistant can open a support ticket
|
||||
- the flow logs the interaction
|
||||
- the final output returns the assistant response
|
||||
|
||||
- id: flow-test7-simple-modification
|
||||
prompt: |-
|
||||
Update this flow so it validates processed data before saving it.
|
||||
|
||||
After `process_data`, add a `validate_data` step that checks the data array is not empty.
|
||||
If the array is empty, the flow should surface the message `No data to save` and prevent saving.
|
||||
If validation passes, let the save continue normally.
|
||||
Update `save_results` so it uses the validation outcome instead of bypassing it.
|
||||
initial: ai_evals/fixtures/frontend/flow/initial/test5_initial.json
|
||||
validate:
|
||||
topLevelStepIds:
|
||||
- fetch_data
|
||||
- process_data
|
||||
- validate_data
|
||||
topLevelStepOrder:
|
||||
- fetch_data
|
||||
- process_data
|
||||
- validate_data
|
||||
topLevelStepTypes:
|
||||
- id: fetch_data
|
||||
type: rawscript
|
||||
- id: process_data
|
||||
type: rawscript
|
||||
- id: validate_data
|
||||
type: rawscript
|
||||
judgeChecklist:
|
||||
- the updated flow keeps the original fetch and process steps intact
|
||||
- "a `validate_data` step is added after `process_data`"
|
||||
- "`validate_data` checks that the processed data array is not empty"
|
||||
- "when processed data is empty, the flow surfaces the message `No data to save` and does not save results"
|
||||
- "`save_results` uses the validation outcome instead of reading `results.process_data` directly"
|
||||
- "exact field names or wrapper object shape for the validation result are not important"
|
||||
|
||||
- id: flow-test8-branching-in-loop
|
||||
prompt: |-
|
||||
Update the order-processing logic inside `loop_orders` so different order types are handled differently.
|
||||
|
||||
For `express`, mark the order as priority and use a shipping cost of $15.99.
|
||||
For `standard`, use a shipping cost of $5.99.
|
||||
For `pickup`, mark it as no shipping required with a cost of $0.
|
||||
Keep the existing processing as a fallback for unknown order types.
|
||||
Each path should return the orderId, shipping cost, and shipping type.
|
||||
initial: ai_evals/fixtures/frontend/flow/initial/test6_initial.json
|
||||
judgeChecklist:
|
||||
- "the existing `loop_orders` flow still handles per-order processing"
|
||||
- exact branching topology is not required as long as `loop_orders` handles the order types correctly
|
||||
- express orders are marked as priority and use a shipping cost of 15.99
|
||||
- standard orders use a shipping cost of 5.99
|
||||
- pickup orders use a shipping cost of 0 and are treated as no shipping required
|
||||
- unknown order types still follow a fallback path
|
||||
- "each processed order returns `orderId`, `shippingCost`, and `shippingType`"
|
||||
|
||||
- id: flow-test9-parallel-refactor
|
||||
prompt: |-
|
||||
Refactor this flow so the enrichment work no longer runs one step at a time.
|
||||
|
||||
`enrich_price`, `enrich_inventory`, and `enrich_reviews` should run independently.
|
||||
Each one should return a fallback value if it fails.
|
||||
Update `combine_data` so it merges the enrichment results and sets a `hasFallbacks` flag when any fallback was used.
|
||||
Keep `get_item` as the first step and `return_result` as the last step.
|
||||
initial: ai_evals/fixtures/frontend/flow/initial/test7_initial.json
|
||||
validate:
|
||||
topLevelStepIds:
|
||||
- get_item
|
||||
- combine_data
|
||||
- return_result
|
||||
topLevelStepOrder:
|
||||
- get_item
|
||||
- combine_data
|
||||
- return_result
|
||||
topLevelStepTypeCountsAtLeast:
|
||||
- type: branchall
|
||||
count: 1
|
||||
topLevelStepTypes:
|
||||
- id: get_item
|
||||
type: rawscript
|
||||
- id: combine_data
|
||||
type: rawscript
|
||||
- id: return_result
|
||||
type: rawscript
|
||||
moduleRules:
|
||||
- id: enrich_price
|
||||
- id: enrich_inventory
|
||||
- id: enrich_reviews
|
||||
judgeChecklist:
|
||||
- "the updated flow keeps `get_item` as the first step"
|
||||
- "the updated flow keeps `return_result` as the last step"
|
||||
- "`enrich_price`, `enrich_inventory`, and `enrich_reviews` run independently rather than sequentially"
|
||||
- each enrichment path returns a fallback value if it fails
|
||||
- "`combine_data` merges the enrichment results"
|
||||
- "`combine_data` sets `hasFallbacks` when any fallback was used"
|
||||
|
||||
- id: flow-test10-while-loop-counter
|
||||
prompt: |-
|
||||
Create a flow that keeps incrementing a counter until it reaches a target value.
|
||||
The input should include a number field named `target`.
|
||||
Use a top-level loop step named `count_until_target`.
|
||||
Inside it, use a single step named `increment_counter` that increments the current counter.
|
||||
The loop should stop once the counter reaches `target`.
|
||||
After the loop, add a top-level step named `return_final_counter` that returns the last counter value.
|
||||
validate:
|
||||
exactTopLevelStepIds:
|
||||
- count_until_target
|
||||
- return_final_counter
|
||||
topLevelStepOrder:
|
||||
- count_until_target
|
||||
- return_final_counter
|
||||
topLevelStepTypes:
|
||||
- id: count_until_target
|
||||
type: whileloopflow
|
||||
- id: return_final_counter
|
||||
type: rawscript
|
||||
moduleRules:
|
||||
- id: count_until_target
|
||||
hasStopAfterIf: true
|
||||
hasStopAfterAllItersIf: false
|
||||
exactImmediateChildStepIds:
|
||||
- increment_counter
|
||||
immediateChildStepTypes:
|
||||
- id: increment_counter
|
||||
type: rawscript
|
||||
moduleFieldRules:
|
||||
- id: count_until_target
|
||||
path: stop_after_if.expr
|
||||
equals: result >= flow_input.target
|
||||
judgeChecklist:
|
||||
- "the input schema includes a number field named `target`"
|
||||
- "the top-level while loop step is named `count_until_target`"
|
||||
- "`count_until_target` contains a single increment step named `increment_counter`"
|
||||
- "`count_until_target` uses module-level `stop_after_if` to stop when the counter reaches `target`"
|
||||
- "`increment_counter` uses `flow_input.iter.value` or an equivalent loop-state expression and falls back to `0` on the first iteration"
|
||||
- "`return_final_counter` returns the final counter value"
|
||||
|
||||
- id: flow-test11-preprocessor-and-failure-handler
|
||||
prompt: |-
|
||||
Create an event-processing flow for a string payload.
|
||||
|
||||
Before the main processing runs, trim the payload and reject empty strings.
|
||||
The main step should be named `process_event` and return a simple success object.
|
||||
If anything fails, return a compact error object with the error message and the failing step id.
|
||||
expected: ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json
|
||||
validate:
|
||||
requireSpecialModules:
|
||||
- preprocessor_module
|
||||
- failure_module
|
||||
judgeChecklist:
|
||||
- the flow trims the payload before the main processing runs
|
||||
- the flow rejects empty payload strings
|
||||
- "the main step is named `process_event`"
|
||||
- "`process_event` returns a simple success object"
|
||||
- failures return a compact error object with the error message and failing step id
|
||||
|
||||
- id: flow-test12-approval-step
|
||||
prompt: |-
|
||||
Create a purchase approval flow.
|
||||
|
||||
The input should include `requester_email` and `amount`.
|
||||
Add an approval step named `request_approval` that pauses the flow and asks the approver for a comment.
|
||||
One approval should be enough to continue.
|
||||
After approval, add a final step named `finalize_purchase` that returns an approved status object.
|
||||
validate:
|
||||
topLevelStepIds:
|
||||
- request_approval
|
||||
- finalize_purchase
|
||||
topLevelStepOrder:
|
||||
- request_approval
|
||||
- finalize_purchase
|
||||
topLevelStepTypes:
|
||||
- id: finalize_purchase
|
||||
type: rawscript
|
||||
schemaRequiredPaths:
|
||||
- requester_email
|
||||
- amount
|
||||
requireSuspendSteps:
|
||||
- id: request_approval
|
||||
requiredEvents: 1
|
||||
resumeRequiredStringFieldAnyOf:
|
||||
- comment
|
||||
- approver_comment
|
||||
judgeChecklist:
|
||||
- "the flow includes an approval step named `request_approval`"
|
||||
- "`request_approval` pauses the flow and asks the approver for a comment"
|
||||
- one approval is enough to continue
|
||||
- "the flow includes a final step named `finalize_purchase`"
|
||||
- "`finalize_purchase` returns an approved status object after approval"
|
||||
11
ai_evals/cases/script.yaml
Normal file
11
ai_evals/cases/script.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
- id: script-test1-greet-user
|
||||
prompt: |-
|
||||
Update the current Bun script so it takes the existing `name` input and returns a plain greeting string like `Hello, Alice!`.
|
||||
Do not wrap the result in an object or array.
|
||||
Keep it simple and do not add external dependencies.
|
||||
initial: ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json
|
||||
expected: ai_evals/fixtures/frontend/script/expected/test1_greet_user.json
|
||||
judgeChecklist:
|
||||
- uses the existing `name` input
|
||||
- returns a plain greeting string
|
||||
- does not wrap the result in an object or array
|
||||
314
ai_evals/cli/index.ts
Normal file
314
ai_evals/cli/index.ts
Normal file
@@ -0,0 +1,314 @@
|
||||
#!/usr/bin/env bun
|
||||
|
||||
import { Command, InvalidArgumentError } from "commander";
|
||||
import { loadCases, loadSelectedCases } from "../core/cases";
|
||||
import {
|
||||
BACKEND_VALIDATION_MODES,
|
||||
parseBackendValidationMode,
|
||||
} from "../core/backendValidation";
|
||||
import {
|
||||
EVAL_MODELS,
|
||||
type EvalModelSpec,
|
||||
formatRunModelLabel,
|
||||
getCliEvalModel,
|
||||
getEvalModelHelpText,
|
||||
resolveEvalModel,
|
||||
} from "../core/models";
|
||||
import {
|
||||
appendHistoryRecord,
|
||||
buildRunResult,
|
||||
formatRunSummary,
|
||||
resolveRunOutputPath,
|
||||
writeRunArtifacts,
|
||||
writeRunResult,
|
||||
} from "../core/results";
|
||||
import { runSuite } from "../core/runSuite";
|
||||
import { EVAL_MODES, type EvalMode } from "../core/types";
|
||||
import { DEFAULT_JUDGE_MODEL } from "../core/judge";
|
||||
import { createCliModeRunner } from "../modes/cli";
|
||||
import { runFrontendBenchmarkAdapter } from "../adapters/frontend/runtime";
|
||||
|
||||
/**
 * Builds the `commander` program for the AI evals CLI and parses `process.argv`.
 *
 * Three sub-commands are registered:
 * - `models`: prints the known model aliases.
 * - `cases [mode]`: lists the case ids of one mode, or of every mode.
 * - `run <mode> [caseIds...]`: runs a benchmark and hands the parsed options to `handleRun`.
 */
async function main() {
  type RunCommandOptions = {
    runs: number;
    output?: string;
    model?: string;
    models?: string;
    verbose?: boolean;
    record?: boolean;
    backendValidation?: string;
  };

  const modeArgumentHelp = "cli, flow, script, or app";
  const modelIdList = EVAL_MODELS.map((entry) => entry.id).join(", ");

  // Text appended after the generated help output.
  const helpEpilogue = [
    "",
    "Examples:",
    " bun run cli -- models",
    " bun run cli -- cases",
    " bun run cli -- cases flow",
    " bun run cli -- run flow",
    " bun run cli -- run flow --model 4o",
    " bun run cli -- run flow --models haiku,opus,4o",
    " bun run cli -- run flow flow-test0-sum-two-numbers --verbose",
    " bun run cli -- run flow --record",
    " bun run cli -- run flow --backend-validation preview",
    " bun run cli -- run flow flow-test5-simple-modification --runs 3",
    " bun run cli -- run cli bun-hello-script",
    "",
    "Models:",
    getEvalModelHelpText(),
  ].join("\n");

  const program = new Command();
  program
    .name("bun run cli --")
    .description("Run AI eval cases against the current production prompts and guidance")
    .showHelpAfterError()
    .showSuggestionAfterError()
    .addHelpText("after", helpEpilogue);

  program
    .command("models")
    .description("List available model aliases")
    .action(() => handleModels());

  program
    .command("cases")
    .description("List available cases")
    .argument("[mode]", modeArgumentHelp, parseOptionalMode)
    .action((mode?: EvalMode) => handleCases(mode));

  program
    .command("run")
    .description("Run one benchmark mode")
    .argument("<mode>", modeArgumentHelp, parseMode)
    .argument("[caseIds...]", "specific case ids to run")
    .option("--runs <n>", "number of attempts per case", parsePositiveInteger, 1)
    .option("--output <path>", "write the result JSON to this path")
    .option("--model <name>", `model alias (${modelIdList})`)
    .option("--models <names>", "comma-separated model aliases to run sequentially")
    .option("--verbose", "stream assistant output during frontend runs")
    .option("--record", "append a compact summary line to ai_evals/history/<mode>.jsonl")
    .option(
      "--backend-validation <mode>",
      `backend smoke validation (${BACKEND_VALIDATION_MODES.join(", ")})`
    )
    .action((mode: EvalMode, caseIds: string[], options: RunCommandOptions) =>
      handleRun({
        mode,
        caseIds,
        runs: options.runs,
        outputPath: options.output,
        model: options.model,
        models: options.models,
        // Boolean flags are absent (undefined) unless passed on the command line.
        verbose: options.verbose ?? false,
        record: options.record ?? false,
        backendValidation: options.backendValidation,
      })
    );

  await program.parseAsync(process.argv);
}
|
||||
|
||||
/**
 * Prints the case ids of the requested mode, or of every mode when none is given.
 *
 * For each mode the output is a `<mode> (<count>)` header, one `- <id>` line per
 * case, and a trailing blank line.
 */
async function handleCases(mode?: EvalMode) {
  const modesToList: EvalMode[] = mode ? [mode] : [...EVAL_MODES];

  for (const currentMode of modesToList) {
    const loadedCases = await loadCases(currentMode);
    const lines = [
      `${currentMode} (${loadedCases.length})`,
      ...loadedCases.map((evalCase) => `- ${evalCase.id}`),
      // Empty last element produces the blank separator line between modes.
      "",
    ];
    process.stdout.write(`${lines.join("\n")}\n`);
  }
}
|
||||
|
||||
/**
 * Prints every entry of `EVAL_MODELS` with its label, aliases and supported
 * modes, followed by the default judge model.
 */
function handleModels() {
  const emit = (text: string): void => {
    process.stdout.write(text);
  };

  emit("Available models\n");

  EVAL_MODELS.forEach((spec) => {
    // A frontend config covers flow/script/app; a cli config covers cli.
    const supportedModes: string[] = [];
    if (spec.frontend) {
      supportedModes.push("flow", "script", "app");
    }
    if (spec.cli) {
      supportedModes.push("cli");
    }

    // The id is always listed first and never repeated.
    const otherAliases = spec.aliases.filter((alias) => alias !== spec.id);
    const allNames = [spec.id, ...otherAliases];

    emit(`- ${spec.id}: ${spec.label}\n`);
    emit(` aliases: ${allNames.join(", ")}\n`);
    emit(` modes: ${supportedModes.join(", ")}\n`);
  });

  emit(`\nJudge model: ${DEFAULT_JUDGE_MODEL}\n`);
}
|
||||
|
||||
/**
 * Implements the `run` command.
 *
 * Validates the flag combination, resolves the cases and models, then runs the
 * benchmark once per model in sequence. For every model the result JSON and
 * artifacts are written, an optional history record is appended, and a summary
 * is printed. When more than one model ran, a final comparison table is printed.
 *
 * @throws Error when `--record` is combined with case ids, when both `--model`
 *   and `--models` are given, when `--output` is used with several models, or
 *   when backend validation is requested for a mode other than flow/script.
 */
async function handleRun(input: {
  mode: EvalMode;
  caseIds: string[];
  runs: number;
  outputPath?: string;
  model?: string;
  models?: string;
  verbose: boolean;
  record: boolean;
  backendValidation?: string;
}) {
  const { mode, caseIds, runs, outputPath, verbose, record } = input;

  const recordingPartialSuite = record && caseIds.length > 0;
  if (recordingPartialSuite) {
    throw new Error("--record only supports full-suite runs; omit case ids to record history");
  }
  if (input.model && input.models) {
    throw new Error("Use either --model or --models, not both");
  }

  const selectedCases = await loadSelectedCases(mode, caseIds);
  const models = resolveRequestedModels(mode, input.model, input.models);

  // The command-line flag wins over the environment variable.
  const requestedValidation =
    input.backendValidation ?? process.env.WMILL_AI_EVAL_BACKEND_VALIDATION;
  const backendValidation = parseBackendValidationMode(requestedValidation);

  const isMultiModel = models.length > 1;
  if (outputPath && isMultiModel) {
    throw new Error("--output only supports a single model run");
  }

  const modeSupportsBackendValidation = mode === "flow" || mode === "script";
  if (backendValidation !== "off" && !modeSupportsBackendValidation) {
    throw new Error("--backend-validation currently supports only flow and script modes");
  }

  const summaries: Array<{ label: string; passRate: number; averageDurationMs: number }> = [];

  let position = 0;
  for (const model of models) {
    const runModel = formatRunModelLabel(mode, model);

    if (isMultiModel) {
      // Separate consecutive model sections with a blank line.
      const separator = position > 0 ? "\n" : "";
      process.stdout.write(`${separator}=== ${mode} ${model.id} (${runModel}) ===\n`);
    }
    position += 1;

    process.stderr.write(`Starting ${mode} benchmark...\n`);

    const result =
      mode === "cli"
        ? await runCliBenchmark(selectedCases, runs, getCliEvalModel(model), runModel)
        : await runFrontendBenchmarkAdapter({
            mode,
            caseIds,
            runs,
            model: model.id,
            verbose,
            backendValidation,
          });

    // An explicit output path is only honoured for single-model runs.
    const resolvedOutputPath = isMultiModel
      ? resolveRunOutputPath(mode)
      : resolveRunOutputPath(mode, outputPath);

    const artifactsPath = await writeRunArtifacts(result, resolvedOutputPath);
    const resultPath = await writeRunResult(result, resolvedOutputPath);
    const historyPath = record ? await appendHistoryRecord(result) : null;

    process.stdout.write(`${formatRunSummary(result)}\n`);
    process.stdout.write(`Saved: ${resultPath}\n`);
    if (artifactsPath) {
      process.stdout.write(`Artifacts: ${artifactsPath}\n`);
    }
    if (historyPath) {
      process.stdout.write(`Recorded: ${historyPath}\n`);
    }

    summaries.push({
      label: `${model.id} (${runModel})`,
      passRate: result.passRate,
      averageDurationMs: result.averageDurationMs,
    });
  }

  if (summaries.length <= 1) {
    return;
  }

  process.stdout.write("\nModel summary\n");
  for (const { label, passRate, averageDurationMs } of summaries) {
    process.stdout.write(
      `- ${label}: ${formatPercent(passRate)} | ${Math.round(averageDurationMs)}ms\n`
    );
  }
}
|
||||
|
||||
/**
 * Runs the given cases through the CLI mode runner and packages the per-case
 * results into a run result for mode `"cli"`.
 *
 * @param cases Cases to execute, as returned by `loadSelectedCases`.
 * @param runs Number of attempts per case.
 * @param model CLI model configuration passed to `createCliModeRunner`.
 * @param runModel Model label recorded in the suite and the run result.
 */
async function runCliBenchmark(
  cases: Awaited<ReturnType<typeof loadSelectedCases>>,
  runs: number,
  model: ReturnType<typeof getCliEvalModel>,
  runModel: string
) {
  const judgeModel = DEFAULT_JUDGE_MODEL;
  const modeRunner = createCliModeRunner(model);

  const caseResults = await runSuite({ modeRunner, cases, runs, runModel, judgeModel });

  return buildRunResult({ mode: "cli", runs, runModel, judgeModel, caseResults });
}
|
||||
|
||||
/**
 * Commander argument parser for the required `<mode>` argument.
 *
 * @returns The matching entry of `EVAL_MODES`.
 * @throws InvalidArgumentError when `value` is not one of `EVAL_MODES`.
 */
function parseMode(value: string): EvalMode {
  const matchedMode = EVAL_MODES.find((candidate) => candidate === value);
  if (matchedMode === undefined) {
    throw new InvalidArgumentError(`mode must be one of: ${EVAL_MODES.join(", ")}`);
  }
  return matchedMode;
}
|
||||
|
||||
/**
 * Commander argument parser for the optional `[mode]` argument: an absent or
 * empty value yields `undefined`, anything else is validated by `parseMode`.
 */
function parseOptionalMode(value: string | undefined): EvalMode | undefined {
  if (!value) {
    return undefined;
  }
  return parseMode(value);
}
|
||||
|
||||
/**
 * Commander option parser that converts `value` with `Number()` and accepts it
 * only when the result is an integer greater than zero.
 *
 * @throws InvalidArgumentError for anything else (non-numeric text, fractions,
 *   zero, negative numbers).
 */
function parsePositiveInteger(value: string): number {
  const candidate = Number(value);
  const isPositiveInteger = Number.isInteger(candidate) && candidate > 0;
  if (isPositiveInteger) {
    return candidate;
  }
  throw new InvalidArgumentError("must be a positive integer");
}
|
||||
|
||||
/**
 * Turns the `--model` / `--models` flags into the list of models to run.
 *
 * Without `--models`, a single model is resolved from `singleModel` (which may
 * be undefined, letting `resolveEvalModel` pick its default). With `--models`,
 * the comma-separated aliases are resolved in order, and aliases that resolve
 * to an already-selected model id are dropped.
 *
 * @throws Error when `--models` contains no alias after trimming.
 */
function resolveRequestedModels(
  mode: EvalMode,
  singleModel?: string,
  multipleModels?: string
): EvalModelSpec[] {
  if (!multipleModels) {
    return [resolveEvalModel(mode, singleModel)];
  }

  const requestedAliases: string[] = [];
  for (const piece of multipleModels.split(",")) {
    const alias = piece.trim();
    if (alias) {
      requestedAliases.push(alias);
    }
  }
  if (requestedAliases.length === 0) {
    throw new Error("--models requires at least one model alias");
  }

  // A Map keeps first-insertion order, so the first alias for each id wins.
  const modelsById = new Map<string, EvalModelSpec>();
  for (const alias of requestedAliases) {
    const spec = resolveEvalModel(mode, alias);
    if (!modelsById.has(spec.id)) {
      modelsById.set(spec.id, spec);
    }
  }
  return Array.from(modelsById.values());
}
|
||||
|
||||
/**
 * Formats a ratio as a percentage with one decimal place, e.g. `0.5` becomes `"50.0%"`.
 */
function formatPercent(value: number): string {
  const percentage = value * 100;
  return percentage.toFixed(1) + "%";
}
|
||||
|
||||
// Start the CLI. Any rejection is reported on stderr (message only) and the
// process exits with status 1.
void main().catch((reason: unknown) => {
  const text = reason instanceof Error ? reason.message : String(reason);
  process.stderr.write(`${text}\n`);
  process.exit(1);
});
|
||||
36
ai_evals/core/backendValidation.test.ts
Normal file
36
ai_evals/core/backendValidation.test.ts
Normal file
@@ -0,0 +1,36 @@
|
||||
import { describe, expect, it } from "bun:test";
|
||||
import {
|
||||
parseBackendValidationMode,
|
||||
resolveBackendValidationSettings,
|
||||
} from "./backendValidation";
|
||||
|
||||
describe("parseBackendValidationMode", () => {
|
||||
it("defaults to off", () => {
|
||||
expect(parseBackendValidationMode(undefined)).toBe("off");
|
||||
expect(parseBackendValidationMode("0")).toBe("off");
|
||||
expect(parseBackendValidationMode("false")).toBe("off");
|
||||
});
|
||||
|
||||
it("accepts preview aliases", () => {
|
||||
expect(parseBackendValidationMode("preview")).toBe("preview");
|
||||
expect(parseBackendValidationMode("1")).toBe("preview");
|
||||
expect(parseBackendValidationMode("true")).toBe("preview");
|
||||
});
|
||||
|
||||
it("rejects unknown modes", () => {
|
||||
expect(() => parseBackendValidationMode("maybe")).toThrow(
|
||||
"Unsupported backend validation mode: maybe"
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveBackendValidationSettings", () => {
|
||||
it("rejects unsupported eval modes", () => {
|
||||
expect(() =>
|
||||
resolveBackendValidationSettings({
|
||||
evalMode: "app",
|
||||
requestedMode: "preview",
|
||||
})
|
||||
).toThrow('Backend validation mode "preview" is only supported for flow and script evals');
|
||||
});
|
||||
});
|
||||
104
ai_evals/core/backendValidation.ts
Normal file
104
ai_evals/core/backendValidation.ts
Normal file
@@ -0,0 +1,104 @@
|
||||
import type { EvalMode } from "./types";
|
||||
|
||||
export const BACKEND_VALIDATION_MODES = ["off", "preview"] as const;
|
||||
|
||||
export type BackendValidationMode = (typeof BACKEND_VALIDATION_MODES)[number];
|
||||
|
||||
export interface BackendValidationSettings {
|
||||
mode: BackendValidationMode;
|
||||
baseUrl: string;
|
||||
email: string;
|
||||
password: string;
|
||||
keepWorkspaces: boolean;
|
||||
workspaceOverride?: string;
|
||||
workspacePrefix: string;
|
||||
pollIntervalMs: number;
|
||||
maxWaitMs: number;
|
||||
}
|
||||
|
||||
/**
 * Maps a user-supplied flag or environment value to a backend validation mode.
 *
 * Matching is case-insensitive and ignores surrounding whitespace:
 * - missing/empty, `off`, `false`, `0` give `"off"`
 * - `preview`, `true`, `1` give `"preview"`
 *
 * @throws Error for any other value; the message quotes the original input.
 */
export function parseBackendValidationMode(value?: string | null): BackendValidationMode {
  const token = value?.trim().toLowerCase() ?? "";

  switch (token) {
    case "":
    case "off":
    case "false":
    case "0":
      return "off";
    case "preview":
    case "true":
    case "1":
      return "preview";
    default:
      throw new Error(
        `Unsupported backend validation mode: ${value}. Use one of: ${BACKEND_VALIDATION_MODES.join(", ")}`
      );
  }
}
|
||||
|
||||
/**
 * Resolves the backend validation settings for one eval mode from the
 * requested mode and the `WMILL_AI_EVAL_*` / Windmill environment variables.
 *
 * Environment variables that are set but blank are treated as unset, so the
 * next candidate or the built-in default applies. This matches how the
 * workspace, prefix and numeric settings below already behave.
 *
 * @param input.evalMode Eval mode being run.
 * @param input.requestedMode Explicit validation mode; falls back to
 *   `WMILL_AI_EVAL_BACKEND_VALIDATION` when null or undefined.
 * @throws Error when the validation mode is unknown, or when a mode other than
 *   `"off"` is requested for an eval mode that is neither flow nor script.
 */
export function resolveBackendValidationSettings(input: {
  evalMode: EvalMode;
  requestedMode?: string | null;
}): BackendValidationSettings {
  const env = process.env;
  const mode = parseBackendValidationMode(
    input.requestedMode ?? env.WMILL_AI_EVAL_BACKEND_VALIDATION
  );

  if (mode !== "off" && input.evalMode !== "flow" && input.evalMode !== "script") {
    throw new Error(
      `Backend validation mode "${mode}" is only supported for flow and script evals`
    );
  }

  // Candidates are tried in order of specificity; the local default comes last.
  const baseUrl =
    firstNonBlankEnvValue(
      env.WMILL_AI_EVAL_BACKEND_URL,
      env.WINDMILL_URL,
      env.WINDMILL_BASE_URL,
      env.REMOTE
    ) ?? "http://127.0.0.1:8000";

  return {
    mode,
    baseUrl: normalizeBaseUrl(baseUrl),
    email: firstNonBlankEnvValue(env.WMILL_AI_EVAL_BACKEND_EMAIL) ?? "admin@windmill.dev",
    password: firstNonBlankEnvValue(env.WMILL_AI_EVAL_BACKEND_PASSWORD) ?? "changeme",
    keepWorkspaces: isTruthy(env.WMILL_AI_EVAL_KEEP_WORKSPACES),
    workspaceOverride: sanitizeOptionalWorkspaceId(env.WMILL_AI_EVAL_BACKEND_WORKSPACE),
    workspacePrefix: sanitizeWorkspacePrefix(env.WMILL_AI_EVAL_WORKSPACE_PREFIX ?? "ai-evals"),
    pollIntervalMs: parsePositiveInteger(env.WMILL_AI_EVAL_BACKEND_POLL_INTERVAL_MS, 2000),
    maxWaitMs: parsePositiveInteger(env.WMILL_AI_EVAL_BACKEND_MAX_WAIT_MS, 120000),
  };
}

/** Returns the first candidate that is defined and not blank, unmodified. */
function firstNonBlankEnvValue(...candidates: Array<string | undefined>): string | undefined {
  return candidates.find((candidate) => candidate !== undefined && candidate.trim() !== "");
}
|
||||
|
||||
/** Removes every trailing `/` from a base URL so paths can be appended safely. */
function normalizeBaseUrl(value: string): string {
  let end = value.length;
  while (end > 0 && value[end - 1] === "/") {
    end -= 1;
  }
  return value.slice(0, end);
}
|
||||
|
||||
/**
 * Normalises a workspace prefix: trims and lowercases it, replaces each run of
 * characters outside `[a-z0-9-]` with a single `-`, and strips leading and
 * trailing dashes. Falls back to `"ai-evals"` when nothing is left.
 */
function sanitizeWorkspacePrefix(value: string): string {
  const lowered = value.trim().toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9-]+/g, "-");
  const stripped = dashed.replace(/^-+|-+$/g, "");

  if (stripped.length === 0) {
    return "ai-evals";
  }
  return stripped;
}
|
||||
|
||||
/** Trims an optional workspace id; a missing or blank id becomes `undefined`. */
function sanitizeOptionalWorkspaceId(value: string | undefined): string | undefined {
  if (value === undefined) {
    return undefined;
  }
  const workspaceId = value.trim();
  return workspaceId.length > 0 ? workspaceId : undefined;
}
|
||||
|
||||
/**
 * Interprets an environment-style flag: `1`, `true`, `yes` and `on` count as
 * true (case-insensitive, surrounding whitespace ignored); everything else,
 * including a missing value, is false.
 */
function isTruthy(value: string | undefined): boolean {
  if (!value) {
    return false;
  }
  switch (value.trim().toLowerCase()) {
    case "1":
    case "true":
    case "yes":
    case "on":
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/**
 * Parses an optional numeric setting.
 *
 * @returns `Number(value)` when that is an integer greater than zero; otherwise
 *   `fallback` (also used for a missing or empty value).
 */
function parsePositiveInteger(value: string | undefined, fallback: number): number {
  if (value === undefined || value === "") {
    return fallback;
  }
  const candidate = Number(value);
  if (!Number.isInteger(candidate) || candidate <= 0) {
    return fallback;
  }
  return candidate;
}
|
||||
18
ai_evals/core/cases.test.ts
Normal file
18
ai_evals/core/cases.test.ts
Normal file
@@ -0,0 +1,18 @@
|
||||
import { describe, expect, it } from "bun:test";
|
||||
import { loadCases } from "./cases";
|
||||
|
||||
describe("loadCases", () => {
|
||||
it("loads backend preview runtime config for opt-in flow cases", async () => {
|
||||
const flowCases = await loadCases("flow");
|
||||
const caseEntry = flowCases.find((entry) => entry.id === "flow-test1-reuse-existing-script");
|
||||
|
||||
expect(caseEntry?.runtime).toEqual({
|
||||
backendPreview: {
|
||||
args: {
|
||||
a: 2,
|
||||
b: 3,
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
});
|
||||
73
ai_evals/core/cases.ts
Normal file
73
ai_evals/core/cases.ts
Normal file
@@ -0,0 +1,73 @@
|
||||
import { readFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { parse } from "yaml";
|
||||
import type { EvalCase, EvalCaseRuntimeSpec, EvalMode, FlowValidationSpec } from "./types";
|
||||
|
||||
const REPO_ROOT = fileURLToPath(new URL("../../", import.meta.url));
|
||||
const CASES_DIR = path.join(REPO_ROOT, "ai_evals", "cases");
|
||||
|
||||
interface RawEvalCase {
|
||||
id: string;
|
||||
prompt: string;
|
||||
initial?: string;
|
||||
expected?: string;
|
||||
validate?: FlowValidationSpec;
|
||||
judgeChecklist?: string[];
|
||||
runtime?: EvalCaseRuntimeSpec;
|
||||
}
|
||||
|
||||
/**
 * Returns the repository root path, i.e. the module-level `REPO_ROOT` constant
 * (resolved two directories above this module).
 */
export function getRepoRoot(): string {
  return REPO_ROOT;
}
|
||||
|
||||
/** Returns the path of the `ai_evals` directory inside the repository root. */
export function getAiEvalsRoot(): string {
  const repoRoot = getRepoRoot();
  return path.join(repoRoot, "ai_evals");
}
|
||||
|
||||
/**
 * Loads every eval case declared in `<CASES_DIR>/<mode>.yaml`.
 *
 * The file must contain a YAML list. Each entry must be a mapping with string
 * `id` and `prompt` fields; `initial` and `expected` fixture paths are resolved
 * through `resolveFixturePath`, and the remaining fields are passed through.
 *
 * @throws Error when the file is not a YAML list or an entry lacks a string
 *   `id` or `prompt`. File-system and YAML parse errors propagate unchanged.
 */
export async function loadCases(mode: EvalMode): Promise<EvalCase[]> {
  const filePath = path.join(CASES_DIR, `${mode}.yaml`);
  const raw = await readFile(filePath, "utf8");
  const parsed: unknown = parse(raw);

  if (!Array.isArray(parsed)) {
    throw new Error(`Expected ${filePath} to contain a YAML list of cases`);
  }

  return parsed.map((entry: unknown, index: number): EvalCase => {
    // Fail at load time with the offending position rather than later with an
    // undefined id or prompt.
    if (!isRawEvalCase(entry)) {
      throw new Error(
        `Case #${index + 1} in ${filePath} must be a mapping with string "id" and "prompt" fields`
      );
    }

    return {
      id: entry.id,
      prompt: entry.prompt,
      initialPath: resolveFixturePath(entry.initial),
      expectedPath: resolveFixturePath(entry.expected),
      validate: entry.validate,
      judgeChecklist: entry.judgeChecklist,
      runtime: entry.runtime,
    };
  });
}

/** Checks the two fields every case needs; optional fields are not inspected. */
function isRawEvalCase(entry: unknown): entry is RawEvalCase {
  if (typeof entry !== "object" || entry === null) {
    return false;
  }
  const candidate = entry as Record<string, unknown>;
  return typeof candidate.id === "string" && typeof candidate.prompt === "string";
}
|
||||
|
||||
/**
 * Loads the cases of `mode` and narrows them to `selectedIds`.
 *
 * An empty selection returns every case. Otherwise the cases are returned in
 * the order the ids were given (repeated ids yield repeated cases).
 *
 * @throws Error listing every id that does not exist for `mode`.
 */
export async function loadSelectedCases(
  mode: EvalMode,
  selectedIds: string[]
): Promise<EvalCase[]> {
  const availableCases = await loadCases(mode);
  if (selectedIds.length === 0) {
    return availableCases;
  }

  const casesById = new Map(availableCases.map((entry) => [entry.id, entry]));

  const picked: EvalCase[] = [];
  const unknownIds: string[] = [];
  for (const id of selectedIds) {
    const match = casesById.get(id);
    if (match === undefined) {
      unknownIds.push(id);
    } else {
      picked.push(match);
    }
  }

  if (unknownIds.length > 0) {
    const plural = unknownIds.length === 1 ? "" : "s";
    throw new Error(`Unknown ${mode} case${plural}: ${unknownIds.join(", ")}`);
  }

  return picked;
}
|
||||
|
||||
/**
 * Resolves a fixture path from a case file: absolute paths are returned as-is,
 * relative paths are joined onto `REPO_ROOT`, and a missing or empty value
 * yields `undefined`.
 */
function resolveFixturePath(value: string | undefined): string | undefined {
  if (!value) {
    return undefined;
  }
  if (path.isAbsolute(value)) {
    return value;
  }
  return path.join(REPO_ROOT, value);
}
|
||||
67
ai_evals/core/files.ts
Normal file
67
ai_evals/core/files.ts
Normal file
@@ -0,0 +1,67 @@
|
||||
import { access, copyFile, mkdir, readdir, readFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
|
||||
/**
 * Reports whether `filePath` is accessible. Any error from `access` (not only
 * "not found") is reported as `false`.
 */
export async function exists(filePath: string): Promise<boolean> {
  return access(filePath).then(
    () => true,
    () => false
  );
}
|
||||
|
||||
/**
 * Reads a UTF-8 file and parses it as JSON.
 *
 * The result is cast to `T` without runtime validation; callers are responsible
 * for the shape. Read and parse errors propagate.
 */
export async function readJsonFile<T>(filePath: string): Promise<T> {
  const text = await readFile(filePath, "utf8");
  const value = JSON.parse(text) as T;
  return value;
}
|
||||
|
||||
/**
 * Reads every file below `rootDir` into a map keyed by `/`-separated relative
 * path, with the UTF-8 file contents as values.
 *
 * @param options.ignore Entry names or relative paths to skip (files or whole
 *   directories); defaults to skipping nothing.
 */
export async function readDirectoryFiles(
  rootDir: string,
  options: {
    ignore?: Set<string>;
  } = {}
): Promise<Record<string, string>> {
  const ignored = options.ignore ?? new Set();
  const collected: Record<string, string> = {};

  await walkDirectory(rootDir, "", collected, ignored);

  return collected;
}
|
||||
|
||||
/**
 * Recursively copies the contents of `sourceDir` into `targetDir`, creating
 * `targetDir` (and any missing parents) first.
 *
 * Entries that are not directories are copied with `copyFile`, which overwrites
 * an existing file at the destination.
 */
export async function copyDirectory(sourceDir: string, targetDir: string): Promise<void> {
  const entries = await readdir(sourceDir, { withFileTypes: true });
  // Every direct child lands in targetDir, so creating it once is enough; no
  // per-file mkdir is needed.
  await mkdir(targetDir, { recursive: true });

  for (const entry of entries) {
    const sourcePath = path.join(sourceDir, entry.name);
    const targetPath = path.join(targetDir, entry.name);

    if (entry.isDirectory()) {
      await copyDirectory(sourcePath, targetPath);
    } else {
      await copyFile(sourcePath, targetPath);
    }
  }
}
|
||||
|
||||
/**
 * Depth-first helper that fills `output` with the files below `absoluteDir`.
 *
 * @param absoluteDir Directory to list on disk.
 * @param relativeDir Path of that directory relative to the walk root (`""` at
 *   the root); used to build the `/`-separated keys of `output`.
 * @param output Map receiving `relative path -> UTF-8 contents`.
 * @param ignore Entries are skipped when either their relative path or their
 *   bare name is in this set; skipped directories are not descended into.
 */
async function walkDirectory(
  absoluteDir: string,
  relativeDir: string,
  output: Record<string, string>,
  ignore: Set<string>
): Promise<void> {
  const children = await readdir(absoluteDir, { withFileTypes: true });

  for (const child of children) {
    const childRelative = relativeDir === "" ? child.name : `${relativeDir}/${child.name}`;
    const isIgnored = ignore.has(childRelative) || ignore.has(child.name);
    if (isIgnored) {
      continue;
    }

    const childAbsolute = path.join(absoluteDir, child.name);
    if (child.isDirectory()) {
      await walkDirectory(childAbsolute, childRelative, output, ignore);
    } else {
      output[childRelative] = await readFile(childAbsolute, "utf8");
    }
  }
}
|
||||
151
ai_evals/core/judge.ts
Normal file
151
ai_evals/core/judge.ts
Normal file
@@ -0,0 +1,151 @@
|
||||
import Anthropic from "@anthropic-ai/sdk";
|
||||
import type { EvalMode, JudgeResult } from "./types";
|
||||
|
||||
export const DEFAULT_JUDGE_MODEL = "claude-sonnet-4-6";
|
||||
|
||||
const JUDGE_TOOL_NAME = "submit_judgement";
|
||||
|
||||
/**
 * Asks an Anthropic model to grade a benchmark output.
 *
 * The model is forced to call the `submit_judgement` tool, whose payload holds
 * an integer `score` (0-100) and a `summary`. This function never throws: a
 * missing API key, an API error, a missing tool call, or a malformed tool
 * payload are all reported as `{ success: false, score: 0, ... }` with an
 * `error` message.
 *
 * @param input.mode Eval mode, included in the judge prompt.
 * @param input.prompt The user prompt given to the evaluated model.
 * @param input.checklist Optional explicit acceptance criteria.
 * @param input.initial Optional initial state, rendered as JSON.
 * @param input.expected Optional example of a valid result, rendered as JSON.
 * @param input.actual The result being judged, rendered as JSON.
 * @param input.model Judge model id; defaults to `DEFAULT_JUDGE_MODEL`.
 */
export async function judgeOutput(input: {
  mode: EvalMode;
  prompt: string;
  checklist?: string[];
  initial?: unknown;
  expected?: unknown;
  actual: unknown;
  model?: string;
}): Promise<JudgeResult> {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    return {
      success: false,
      score: 0,
      summary: "Judge unavailable",
      error: "ANTHROPIC_API_KEY is not set",
    };
  }

  const client = new Anthropic({ apiKey });
  const model = input.model ?? DEFAULT_JUDGE_MODEL;

  const system = [
    "You evaluate benchmark outputs for Windmill AI generation.",
    "Deterministic checks already run separately. Focus on whether the final output satisfies the user request.",
    "If expected state is provided, treat it as a valid example and reward semantically equivalent outputs.",
    "If a checklist is provided, treat it as the explicit acceptance criteria for this case.",
    "Be strict about missing requested functionality.",
    "When the prompt wording is ambiguous, prefer the checklist over inferred structural requirements.",
    "Do not invent additional Windmill-specific constraints that are not explicit in the prompt, checklist, or expected state.",
    "Do not lower the score just because the output uses a different but valid Windmill idiom, naming choice, or equivalent field shape.",
    "Do not require exact ids, exact topology, or exact field names unless the prompt, checklist, or expected state clearly requires them.",
    `Always respond by calling the ${JUDGE_TOOL_NAME} tool exactly once.`,
  ].join("\n\n");

  const user = [
    `Mode: ${input.mode}`,
    "",
    "User prompt:",
    input.prompt,
    "",
    "Checklist:",
    formatChecklist(input.checklist),
    "",
    "Initial state:",
    formatJsonBlock(input.initial),
    "",
    "Expected state:",
    formatJsonBlock(input.expected),
    "",
    "Actual result:",
    formatJsonBlock(input.actual),
  ].join("\n");

  try {
    const response = await client.messages.create({
      model,
      max_tokens: 1024,
      temperature: 0,
      system,
      messages: [{ role: "user", content: user }],
      tools: [
        {
          name: JUDGE_TOOL_NAME,
          description: "Submit the benchmark judgement as structured data.",
          input_schema: {
            type: "object",
            properties: {
              score: {
                type: "integer",
                minimum: 0,
                maximum: 100,
              },
              summary: {
                type: "string",
              },
            },
            required: ["score", "summary"],
          },
        },
      ],
      // Force exactly one call of the judgement tool.
      tool_choice: {
        type: "tool",
        name: JUDGE_TOOL_NAME,
        disable_parallel_tool_use: true,
      },
    });

    const toolUseBlock = response.content.find(
      (block): block is Anthropic.ToolUseBlock =>
        block.type === "tool_use" && block.name === JUDGE_TOOL_NAME
    );

    if (!toolUseBlock) {
      return {
        success: false,
        score: 0,
        summary: "Judge returned no tool output",
        error: "Expected structured tool output from judge",
      };
    }

    // The tool input is model-generated; check it instead of trusting a cast.
    const payload: unknown = toolUseBlock.input;
    if (!isJudgementPayload(payload)) {
      return {
        success: false,
        score: 0,
        summary: "Judge returned malformed tool output",
        error: "Judge tool output must include a numeric score and a string summary",
      };
    }

    return {
      success: true,
      score: normalizeScore(payload.score),
      summary: payload.summary,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      success: false,
      score: 0,
      summary: "Judge failed",
      error: message,
    };
  }
}

/** Type guard for the `submit_judgement` tool payload. */
function isJudgementPayload(value: unknown): value is { score: number; summary: string } {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return typeof candidate.score === "number" && typeof candidate.summary === "string";
}
|
||||
|
||||
/**
 * Renders a value as pretty-printed JSON (2-space indent) for the judge prompt.
 *
 * Returns `"(none)"` when there is nothing to render: for `undefined`, and for
 * values `JSON.stringify` cannot represent (functions, symbols), where it
 * returns `undefined` rather than a string.
 */
function formatJsonBlock(value: unknown): string {
  if (value === undefined) {
    return "(none)";
  }
  const rendered: string | undefined = JSON.stringify(value, null, 2);
  return rendered ?? "(none)";
}
|
||||
|
||||
/**
 * Renders the checklist as one `- item` line per entry, or `"(none)"` when the
 * checklist is missing or empty.
 */
function formatChecklist(checklist: string[] | undefined): string {
  const items = checklist ?? [];
  if (items.length === 0) {
    return "(none)";
  }

  const bulletLines: string[] = [];
  for (const item of items) {
    bulletLines.push(`- ${item}`);
  }
  return bulletLines.join("\n");
}
|
||||
|
||||
/**
 * Rounds a score to the nearest integer and clamps it to the 0-100 range.
 * Non-finite input (NaN, ±Infinity) yields 0.
 */
function normalizeScore(value: number): number {
  if (Number.isNaN(value) || value === Infinity || value === -Infinity) {
    return 0;
  }
  const rounded = Math.round(value);
  const atLeastZero = Math.max(rounded, 0);
  return Math.min(atLeastZero, 100);
}
|
||||
29
ai_evals/core/models.test.ts
Normal file
29
ai_evals/core/models.test.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import { describe, expect, it } from "bun:test";
|
||||
import { resolveEvalModel } from "./models";
|
||||
|
||||
describe("resolveEvalModel", () => {
|
||||
it("supports Gemini aliases for frontend evals", () => {
|
||||
expect(resolveEvalModel("flow", "gemini").frontend).toEqual({
|
||||
provider: "googleai",
|
||||
model: "gemini-2.5-flash",
|
||||
});
|
||||
expect(resolveEvalModel("app", "gemini-pro").frontend).toEqual({
|
||||
provider: "googleai",
|
||||
model: "gemini-2.5-pro",
|
||||
});
|
||||
expect(resolveEvalModel("script", "gemini-3-flash-preview").frontend).toEqual({
|
||||
provider: "googleai",
|
||||
model: "gemini-3-flash-preview",
|
||||
});
|
||||
expect(resolveEvalModel("flow", "gemini-3.1-pro-preview").frontend).toEqual({
|
||||
provider: "googleai",
|
||||
model: "gemini-3.1-pro-preview",
|
||||
});
|
||||
});
|
||||
|
||||
it("rejects Gemini aliases for cli evals", () => {
|
||||
expect(() => resolveEvalModel("cli", "gemini")).toThrow(
|
||||
"Model gemini-flash is not supported for cli mode"
|
||||
);
|
||||
});
|
||||
});
|
||||
185
ai_evals/core/models.ts
Normal file
185
ai_evals/core/models.ts
Normal file
@@ -0,0 +1,185 @@
|
||||
import type { EvalMode } from "./types";
|
||||
|
||||
export interface FrontendEvalModelConfig {
|
||||
provider: "anthropic" | "openai" | "googleai";
|
||||
model: string;
|
||||
}
|
||||
|
||||
export interface CliEvalModelConfig {
|
||||
provider: "anthropic";
|
||||
model: string;
|
||||
}
|
||||
|
||||
export interface EvalModelSpec {
|
||||
id: string;
|
||||
label: string;
|
||||
aliases: string[];
|
||||
frontend?: FrontendEvalModelConfig;
|
||||
cli?: CliEvalModelConfig;
|
||||
}
|
||||
|
||||
export const EVAL_MODELS: EvalModelSpec[] = [
|
||||
{
|
||||
id: "haiku",
|
||||
label: "Claude Haiku 4.5",
|
||||
aliases: [
|
||||
"haiku",
|
||||
"haiku-4.5",
|
||||
"claude-haiku",
|
||||
"claude-haiku-4.5",
|
||||
"claude-haiku-4-5",
|
||||
"claude-haiku-4-5-20251001",
|
||||
],
|
||||
frontend: {
|
||||
provider: "anthropic",
|
||||
model: "claude-haiku-4-5-20251001",
|
||||
},
|
||||
cli: {
|
||||
provider: "anthropic",
|
||||
model: "haiku",
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "sonnet",
|
||||
label: "Claude Sonnet 4.5",
|
||||
aliases: [
|
||||
"sonnet",
|
||||
"sonnet-4.5",
|
||||
"claude-sonnet",
|
||||
"claude-sonnet-4.5",
|
||||
"claude-sonnet-4-5",
|
||||
"claude-sonnet-4-5-20250929",
|
||||
],
|
||||
frontend: {
|
||||
provider: "anthropic",
|
||||
model: "claude-sonnet-4-5-20250929",
|
||||
},
|
||||
cli: {
|
||||
provider: "anthropic",
|
||||
model: "sonnet",
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "opus",
|
||||
label: "Claude Opus 4.6",
|
||||
aliases: [
|
||||
"opus",
|
||||
"opus-4.6",
|
||||
"claude-opus",
|
||||
"claude-opus-4.6",
|
||||
"claude-opus-4-6",
|
||||
],
|
||||
frontend: {
|
||||
provider: "anthropic",
|
||||
model: "claude-opus-4-6",
|
||||
},
|
||||
cli: {
|
||||
provider: "anthropic",
|
||||
model: "opus",
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "4o",
|
||||
label: "GPT-4o",
|
||||
aliases: ["4o", "gpt-4o"],
|
||||
frontend: {
|
||||
provider: "openai",
|
||||
model: "gpt-4o",
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "gemini-flash",
|
||||
label: "Gemini 2.5 Flash",
|
||||
aliases: ["gemini", "gemini-flash", "gemini-2.5-flash"],
|
||||
frontend: {
|
||||
provider: "googleai",
|
||||
model: "gemini-2.5-flash",
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "gemini-pro",
|
||||
label: "Gemini 2.5 Pro",
|
||||
aliases: ["gemini-pro", "gemini-2.5-pro"],
|
||||
frontend: {
|
||||
provider: "googleai",
|
||||
model: "gemini-2.5-pro",
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "gemini-3-flash-preview",
|
||||
label: "Gemini 3 Flash Preview",
|
||||
aliases: ["gemini-3-flash-preview", "gemini-3-flash"],
|
||||
frontend: {
|
||||
provider: "googleai",
|
||||
model: "gemini-3-flash-preview",
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "gemini-3.1-pro-preview",
|
||||
label: "Gemini 3.1 Pro Preview",
|
||||
aliases: ["gemini-3.1-pro-preview", "gemini-3.1-pro", "gemini-3-pro-preview"],
|
||||
frontend: {
|
||||
provider: "googleai",
|
||||
model: "gemini-3.1-pro-preview",
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
/**
 * Resolves a model alias to its spec and checks that it supports `mode`.
 *
 * Without an alias (undefined or empty) the default model for the mode is used.
 * CLI mode requires a `cli` config; every other mode requires a `frontend`
 * config.
 *
 * @throws Error when the alias is unknown or the model does not support `mode`.
 */
export function resolveEvalModel(mode: EvalMode, alias?: string): EvalModelSpec {
  let spec: EvalModelSpec | undefined;
  if (alias) {
    spec = findEvalModel(alias);
  } else {
    spec = getDefaultEvalModel(mode);
  }

  if (!spec) {
    throw new Error(`Unknown model: ${alias}`);
  }

  const requiredConfig = mode === "cli" ? spec.cli : spec.frontend;
  if (!requiredConfig) {
    throw new Error(`Model ${spec.id} is not supported for ${mode} mode`);
  }

  return spec;
}
|
||||
|
||||
/**
 * Builds the "Models:" section of the CLI help: one line per model with its id,
 * label and supported modes.
 *
 * The id column is padded to the longest id (at least 8 characters) so the
 * labels line up even for long ids such as the Gemini preview models.
 */
export function getEvalModelHelpText(): string {
  const idColumnWidth = EVAL_MODELS.reduce(
    (width, model) => Math.max(width, model.id.length),
    8
  );

  return EVAL_MODELS.map((model) => {
    // A frontend config covers flow/script/app; a cli config covers cli.
    const modes = [
      ...(model.frontend ? ["flow", "script", "app"] : []),
      ...(model.cli ? ["cli"] : []),
    ];
    return ` ${model.id.padEnd(idColumnWidth)} ${model.label} (${modes.join(", ")})`;
  }).join("\n");
}
|
||||
|
||||
/**
 * Formats the `provider:model` label recorded for a run.
 *
 * Goes through the guarded accessors instead of non-null assertions, so a
 * model lacking the configuration for `mode` fails with a descriptive error
 * rather than a "cannot read properties of undefined" TypeError.
 *
 * @param mode - Eval mode of the run; "cli" selects the cli config, any other
 *   mode selects the frontend config.
 * @param model - Model spec to describe.
 * @returns `<provider>:<model>` for the configuration matching `mode`.
 * @throws Error when `model` has no configuration for `mode`.
 */
export function formatRunModelLabel(mode: EvalMode, model: EvalModelSpec): string {
  const config = mode === "cli" ? getCliEvalModel(model) : getFrontendEvalModel(model);
  return `${config.provider}:${config.model}`;
}
|
||||
|
||||
/**
 * Returns the frontend configuration of a model.
 *
 * @param model - Model spec to read.
 * @returns The model's frontend config.
 * @throws Error when the model has no frontend config.
 */
export function getFrontendEvalModel(model: EvalModelSpec): FrontendEvalModelConfig {
  const frontendConfig = model.frontend;
  if (frontendConfig) {
    return frontendConfig;
  }
  throw new Error(`Model ${model.id} does not support frontend evals`);
}
|
||||
|
||||
/**
 * Returns the cli configuration of a model.
 *
 * @param model - Model spec to read.
 * @returns The model's cli config.
 * @throws Error when the model has no cli config.
 */
export function getCliEvalModel(model: EvalModelSpec): CliEvalModelConfig {
  const cliConfig = model.cli;
  if (cliConfig) {
    return cliConfig;
  }
  throw new Error(`Model ${model.id} does not support cli evals`);
}
|
||||
|
||||
/**
 * Returns the default model, which is the first registry entry for every mode.
 *
 * The previous ternary had two identical branches and relied on a non-null
 * assertion; an empty registry is now reported explicitly.
 *
 * @param mode - Eval mode the default is requested for (used only in the error message).
 * @returns The first entry of `EVAL_MODELS`.
 * @throws Error when `EVAL_MODELS` is empty.
 */
function getDefaultEvalModel(mode: EvalMode): EvalModelSpec {
  const fallback = EVAL_MODELS[0];
  if (!fallback) {
    throw new Error(`No default eval model is configured for ${mode} mode`);
  }
  return fallback;
}
|
||||
|
||||
/**
 * Looks up a model by id or alias, ignoring case and surrounding whitespace.
 *
 * @param alias - User-supplied model name.
 * @returns The first registry entry whose id or alias matches, or `undefined`.
 */
function findEvalModel(alias: string): EvalModelSpec | undefined {
  const wanted = alias.trim().toLowerCase();

  for (const model of EVAL_MODELS) {
    const names = [model.id, ...model.aliases];
    if (names.some((name) => name.toLowerCase() === wanted)) {
      return model;
    }
  }

  return undefined;
}
|
||||
296
ai_evals/core/results.ts
Normal file
296
ai_evals/core/results.ts
Normal file
@@ -0,0 +1,296 @@
|
||||
import { appendFile, mkdir, rm, writeFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { execFileSync } from "node:child_process";
|
||||
import { getAiEvalsRoot, getRepoRoot } from "./cases";
|
||||
import type {
|
||||
BenchmarkArtifactFile,
|
||||
BenchmarkCaseResult,
|
||||
BenchmarkRunResult,
|
||||
BenchmarkTokenUsage,
|
||||
EvalMode,
|
||||
} from "./types";
|
||||
|
||||
/**
 * Writes a run result as pretty-printed JSON (2-space indent, trailing newline).
 *
 * In-memory artifact file contents are stripped before serialization.
 *
 * @param result - Run result to persist.
 * @param outputPath - Optional target file; a default path is derived from the
 *   mode when omitted.
 * @returns The path the result was written to.
 */
export async function writeRunResult(
  result: BenchmarkRunResult,
  outputPath?: string
): Promise<string> {
  const destination = resolveRunOutputPath(result.mode, outputPath);
  const payload = `${JSON.stringify(toSerializableRunResult(result), null, 2)}\n`;

  await mkdir(path.dirname(destination), { recursive: true });
  await writeFile(destination, payload, "utf8");

  return destination;
}
|
||||
|
||||
/**
 * Appends a one-line JSON summary of a run to a history file, creating the
 * parent directory when needed.
 *
 * @param result - Run result to summarise.
 * @param historyPath - History file; defaults to the per-mode history path.
 * @returns The history file path that was appended to.
 */
export async function appendHistoryRecord(
  result: BenchmarkRunResult,
  historyPath = resolveHistoryPath(result.mode)
): Promise<string> {
  const line = `${JSON.stringify(toHistoryRecord(result))}\n`;

  await mkdir(path.dirname(historyPath), { recursive: true });
  await appendFile(historyPath, line, "utf8");

  return historyPath;
}
|
||||
|
||||
/**
 * Writes every attempt's artifact files under
 * `<artifact root>/<case id>/attempt-<n>/` and records the locations on
 * `result` and on each attempt.
 *
 * Any existing artifact root is removed first. The root is derived from the
 * resolved result path.
 * NOTE(review): when `outputPath` is omitted the default path embeds the
 * current time, so it can differ from a path resolved by an earlier call.
 *
 * @param result - Run result; its `artifactsPath` fields are mutated.
 * @param outputPath - Optional result file path the artifact root is derived from.
 * @returns The artifact root, or `null` when no attempt had artifact files.
 */
export async function writeRunArtifacts(
  result: BenchmarkRunResult,
  outputPath?: string
): Promise<string | null> {
  const artifactRoot = defaultArtifactsRoot(resolveRunOutputPath(result.mode, outputPath));

  await rm(artifactRoot, { recursive: true, force: true });

  let anyWritten = false;
  for (const caseResult of result.cases) {
    for (const attempt of caseResult.attempts) {
      const files = attempt.artifactFiles ?? [];

      if (files.length > 0) {
        const attemptDir = path.join(artifactRoot, caseResult.id, `attempt-${attempt.attempt}`);
        await writeArtifactFiles(attemptDir, files);
        attempt.artifactsPath = attemptDir;
        anyWritten = true;
      } else {
        attempt.artifactsPath = null;
      }
    }
  }

  const writtenRoot = anyWritten ? artifactRoot : null;
  result.artifactsPath = writtenRoot;
  return writtenRoot;
}
|
||||
|
||||
/**
 * Aggregates per-case results into a run result with pass rate, average
 * duration and token usage totals.
 *
 * Averages are taken over all attempts; token totals only include attempts
 * that reported usage and stay `null` when none did.
 *
 * @param input - Run metadata plus the case results to aggregate.
 * @returns The assembled run result, stamped with the current time and git SHA.
 */
export function buildRunResult(input: {
  mode: EvalMode;
  runs: number;
  runModel: string | null;
  judgeModel: string | null;
  caseResults: BenchmarkCaseResult[];
}): BenchmarkRunResult {
  const { caseResults } = input;

  let attemptCount = 0;
  let passedAttempts = 0;
  let durationTotal = 0;
  let tokenUsageTotal: BenchmarkTokenUsage | null = null;

  for (const entry of caseResults) {
    attemptCount += entry.attempts.length;

    let caseDuration = 0;
    for (const attempt of entry.attempts) {
      if (attempt.passed) {
        passedAttempts += 1;
      }
      caseDuration += attempt.durationMs;

      const usage = attempt.tokenUsage;
      if (usage) {
        tokenUsageTotal ??= { prompt: 0, completion: 0, total: 0 };
        tokenUsageTotal.prompt += usage.prompt;
        tokenUsageTotal.completion += usage.completion;
        tokenUsageTotal.total += usage.total;
      }
    }
    durationTotal += caseDuration;
  }

  const hasAttempts = attemptCount !== 0;
  const averageTokenUsagePerAttempt: BenchmarkTokenUsage | null =
    hasAttempts && tokenUsageTotal
      ? {
          prompt: tokenUsageTotal.prompt / attemptCount,
          completion: tokenUsageTotal.completion / attemptCount,
          total: tokenUsageTotal.total / attemptCount,
        }
      : null;

  return {
    version: 1,
    mode: input.mode,
    createdAt: new Date().toISOString(),
    gitSha: getGitSha(),
    runs: input.runs,
    runModel: input.runModel,
    judgeModel: input.judgeModel,
    caseCount: caseResults.length,
    attemptCount,
    passedAttempts,
    passRate: hasAttempts ? passedAttempts / attemptCount : 0,
    averageDurationMs: hasAttempts ? durationTotal / attemptCount : 0,
    totalTokenUsage: tokenUsageTotal,
    averageTokenUsagePerAttempt,
    cases: caseResults,
  };
}
|
||||
|
||||
/**
 * Builds the human-readable summary of a run: headline, pass rate, average
 * duration and up to 10 failing attempts.
 *
 * When more than 10 attempts failed, a final "... and N more" line reports
 * how many were left out instead of dropping them silently.
 *
 * @param result - Run result to summarise.
 * @returns Newline-joined summary text.
 */
export function formatRunSummary(result: BenchmarkRunResult): string {
  const maxListedFailures = 10;

  const lines = [
    `${result.mode} benchmark complete`,
    `Pass rate: ${formatPercent(result.passRate)} (${result.passedAttempts}/${result.attemptCount})`,
    `Average duration: ${Math.round(result.averageDurationMs)}ms`,
  ];

  const failures = collectFailures(result);
  if (failures.length > 0) {
    lines.push("Failures:");
    for (const entry of failures.slice(0, maxListedFailures)) {
      lines.push(`- ${entry}`);
    }

    const omitted = failures.length - maxListedFailures;
    if (omitted > 0) {
      lines.push(`- ... and ${omitted} more`);
    }
  }

  return lines.join("\n");
}
|
||||
|
||||
/**
 * Lists one description per failed attempt, in case order then attempt order.
 *
 * Each entry names the failed checks; when there are none it falls back to
 * the attempt's error, then to the literal "failed".
 *
 * @param result - Run result to scan.
 * @returns Failure descriptions; empty when every attempt passed.
 */
function collectFailures(result: BenchmarkRunResult): string[] {
  return result.cases.flatMap((caseResult) =>
    caseResult.attempts
      .filter((attempt) => !attempt.passed)
      .map((attempt) => {
        const failedCheckNames = attempt.checks
          .filter((check) => !check.passed)
          .map((check) => check.name)
          .join(", ");
        const reason = failedCheckNames || attempt.error || "failed";
        return `${caseResult.id} attempt ${attempt.attempt}: ${reason}`;
      })
  );
}
|
||||
|
||||
/**
 * Builds the default result file name: the current ISO timestamp with ":"
 * replaced by "-", followed by `__<mode>.json`.
 *
 * @param mode - Eval mode embedded in the name.
 * @returns The file name (no directory).
 */
function defaultFileName(mode: EvalMode): string {
  const stamp = new Date().toISOString().replace(/:/g, "-");
  return `${stamp}__${mode}.json`;
}
|
||||
|
||||
/**
 * Resolves where a run result is written.
 *
 * @param mode - Eval mode, used for the default file name.
 * @param outputPath - Explicit target; returned as-is when not null/undefined.
 * @returns `outputPath`, or `<ai evals root>/results/<default file name>`.
 */
export function resolveRunOutputPath(mode: EvalMode, outputPath?: string): string {
  if (outputPath != null) {
    return outputPath;
  }
  return path.join(getAiEvalsRoot(), "results", defaultFileName(mode));
}
|
||||
|
||||
/**
 * Returns the history file for a mode: `<ai evals root>/history/<mode>.jsonl`.
 *
 * @param mode - Eval mode whose history file is requested.
 */
export function resolveHistoryPath(mode: EvalMode): string {
  const historyDir = path.join(getAiEvalsRoot(), "history");
  return path.join(historyDir, `${mode}.jsonl`);
}
|
||||
|
||||
/**
 * Derives the artifact directory that sits next to a result file.
 *
 * @param resultPath - Path of the result file.
 * @returns `resultPath` without its ".json" suffix, or `resultPath` with
 *   ".artifacts" appended when it has no ".json" suffix.
 */
function defaultArtifactsRoot(resultPath: string): string {
  const jsonSuffix = ".json";
  if (!resultPath.endsWith(jsonSuffix)) {
    return `${resultPath}.artifacts`;
  }
  return resultPath.slice(0, resultPath.length - jsonSuffix.length);
}
|
||||
|
||||
/**
 * Writes artifact files below `rootDir`, one after another, creating parent
 * directories as needed.
 *
 * @param rootDir - Directory all files are written under.
 * @param files - Files to write; each path is normalised first.
 * @throws Error when a file path is rejected by `normalizeArtifactPath`.
 */
async function writeArtifactFiles(
  rootDir: string,
  files: BenchmarkArtifactFile[]
): Promise<void> {
  for (const { path: artifactPath, content } of files) {
    const destination = path.join(rootDir, normalizeArtifactPath(artifactPath));
    await mkdir(path.dirname(destination), { recursive: true });
    await writeFile(destination, content, "utf8");
  }
}
|
||||
|
||||
/**
 * Normalises an artifact path to a relative, "/"-separated path.
 *
 * Backslashes count as separators, and leading or repeated separators are
 * dropped.
 *
 * @param filePath - Path as reported by the artifact producer.
 * @returns The cleaned relative path.
 * @throws Error when the path has no segments or contains a "." or ".." segment.
 */
function normalizeArtifactPath(filePath: string): string {
  const segments = filePath.split(/[\\/]+/).filter((segment) => segment.length > 0);

  const unsafe = segments.length === 0 || segments.includes(".") || segments.includes("..");
  if (unsafe) {
    throw new Error(`Invalid artifact path: ${filePath}`);
  }

  return segments.join("/");
}
|
||||
|
||||
/**
 * Returns a copy of the run result with `artifactFiles` removed from every
 * attempt, so in-memory file contents are not written into the result JSON.
 *
 * The input and its attempts are not mutated.
 *
 * @param result - Run result to copy.
 */
function toSerializableRunResult(result: BenchmarkRunResult): BenchmarkRunResult {
  const cases = result.cases.map((caseResult) => {
    const attempts = caseResult.attempts.map((attempt) => {
      const withoutFiles = { ...attempt };
      delete withoutFiles.artifactFiles;
      return withoutFiles;
    });
    return { ...caseResult, attempts };
  });

  return { ...result, cases };
}
|
||||
|
||||
/**
 * Condenses a run result into the compact record appended to the history file.
 *
 * The record carries the run-level metrics, the ids of cases with at least
 * one failed attempt, and per-case aggregates. Judge score averages only
 * count attempts with a numeric score; token averages divide by all attempts
 * of the case and are `null` when no attempt reported usage.
 *
 * @param result - Run result to condense.
 * @returns A plain, JSON-serializable summary object.
 */
function toHistoryRecord(result: BenchmarkRunResult) {
  const mean = (values: number[]): number | null =>
    values.length === 0 ? null : values.reduce((sum, value) => sum + value, 0) / values.length;

  const numericJudgeScores = (attempts: BenchmarkCaseResult["attempts"]): number[] => {
    const scores: number[] = [];
    for (const attempt of attempts) {
      if (typeof attempt.judgeScore === "number") {
        scores.push(attempt.judgeScore);
      }
    }
    return scores;
  };

  const failedCaseIds = new Set<string>();
  for (const caseResult of result.cases) {
    if (caseResult.attempts.some((attempt) => !attempt.passed)) {
      failedCaseIds.add(caseResult.id);
    }
  }

  const cases = result.cases.map((caseResult) => {
    const { attempts } = caseResult;
    const attemptCount = attempts.length;

    let passedAttempts = 0;
    let totalDurationMs = 0;
    let totalTokenUsage: BenchmarkTokenUsage | null = null;
    for (const attempt of attempts) {
      if (attempt.passed) {
        passedAttempts += 1;
      }
      totalDurationMs += attempt.durationMs;

      const usage = attempt.tokenUsage;
      if (usage) {
        totalTokenUsage ??= { prompt: 0, completion: 0, total: 0 };
        totalTokenUsage.prompt += usage.prompt;
        totalTokenUsage.completion += usage.completion;
        totalTokenUsage.total += usage.total;
      }
    }

    const hasAttempts = attemptCount !== 0;
    const averageTokenUsagePerAttempt: BenchmarkTokenUsage | null =
      hasAttempts && totalTokenUsage
        ? {
            prompt: totalTokenUsage.prompt / attemptCount,
            completion: totalTokenUsage.completion / attemptCount,
            total: totalTokenUsage.total / attemptCount,
          }
        : null;

    return {
      id: caseResult.id,
      attemptCount,
      passedAttempts,
      passRate: hasAttempts ? passedAttempts / attemptCount : 0,
      averageDurationMs: hasAttempts ? totalDurationMs / attemptCount : 0,
      averageJudgeScore: mean(numericJudgeScores(attempts)),
      averageTokenUsagePerAttempt,
    };
  });

  return {
    createdAt: result.createdAt,
    gitSha: result.gitSha,
    mode: result.mode,
    runs: result.runs,
    runModel: result.runModel,
    judgeModel: result.judgeModel,
    caseCount: result.caseCount,
    attemptCount: result.attemptCount,
    passedAttempts: result.passedAttempts,
    passRate: result.passRate,
    averageDurationMs: result.averageDurationMs,
    averageJudgeScore: mean(
      result.cases.flatMap((caseResult) => numericJudgeScores(caseResult.attempts))
    ),
    averageTokenUsagePerAttempt: result.averageTokenUsagePerAttempt ?? null,
    failedCaseIds: Array.from(failedCaseIds),
    cases,
  };
}
|
||||
|
||||
/**
 * Reads the current commit SHA by running `git rev-parse HEAD` in the repo root.
 *
 * Best effort: stderr is discarded and any failure (git missing, not a
 * repository, ...) yields `null`.
 *
 * @returns The trimmed command output, or `null` on failure.
 */
function getGitSha(): string | null {
  try {
    const stdout = execFileSync("git", ["rev-parse", "HEAD"], {
      cwd: getRepoRoot(),
      encoding: "utf8",
      stdio: ["ignore", "pipe", "ignore"],
    });
    return stdout.trim();
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Formats a ratio as a percentage with one decimal place, e.g. 0.5 -> "50.0%".
 *
 * @param value - Ratio to format (1 means 100%).
 */
function formatPercent(value: number): string {
  const percent = value * 100;
  return percent.toFixed(1) + "%";
}
|
||||
301
ai_evals/core/runSuite.ts
Normal file
301
ai_evals/core/runSuite.ts
Normal file
@@ -0,0 +1,301 @@
|
||||
import { judgeOutput, DEFAULT_JUDGE_MODEL } from "./judge";
|
||||
import type {
|
||||
BenchmarkAttemptResult,
|
||||
BenchmarkCaseResult,
|
||||
BenchmarkCheck,
|
||||
EvalCase,
|
||||
FrontendBenchmarkProgressEvent,
|
||||
ModeRunner,
|
||||
} from "./types";
|
||||
|
||||
/**
 * Runs every eval case through a mode runner with a bounded pool of workers.
 *
 * Workers pull case indices from a shared cursor, so results come back in the
 * same order as `input.cases` regardless of completion order. For non-cli
 * modes a "run-start" progress event is emitted before any case starts.
 *
 * The worker count is sanitised: a NaN or missing concurrency used to produce
 * `Math.max(1, NaN) = NaN`, which started zero workers and returned an array
 * of holes. Such values now fall back to 1, and fractional values are floored.
 *
 * @param input.modeRunner - Runner for the mode under test.
 * @param input.cases - Cases to execute.
 * @param input.runs - Attempts per case.
 * @param input.runModel - Label of the model under test; not read by this function.
 * @param input.judgeModel - Judge model; defaults to `DEFAULT_JUDGE_MODEL`.
 * @param input.concurrency - Worker count; defaults to the runner's concurrency.
 * @param input.verbose - Enables assistant streaming progress events.
 * @param input.onProgress - Receives progress events (non-cli modes only).
 * @returns One result per case, in input order.
 */
export async function runSuite<TInitial, TExpected, TActual>(input: {
  modeRunner: ModeRunner<TInitial, TExpected, TActual>;
  cases: EvalCase[];
  runs: number;
  runModel: string | null;
  judgeModel?: string | null;
  concurrency?: number;
  verbose?: boolean;
  onProgress?: (event: FrontendBenchmarkProgressEvent) => void;
}): Promise<BenchmarkCaseResult[]> {
  const judgeModel = input.judgeModel ?? DEFAULT_JUDGE_MODEL;

  // Guard against NaN/undefined (e.g. a failed numeric parse upstream) and fractions.
  const requestedConcurrency = Math.floor(
    Number(input.concurrency ?? input.modeRunner.concurrency)
  );
  const concurrency = Number.isNaN(requestedConcurrency)
    ? 1
    : Math.max(1, requestedConcurrency);

  const results = new Array<BenchmarkCaseResult>(input.cases.length);
  let cursor = 0;

  if (input.modeRunner.mode !== "cli") {
    input.onProgress?.({
      type: "run-start",
      surface: input.modeRunner.mode,
      totalCases: input.cases.length,
      runs: input.runs,
      concurrency,
    });
  }

  async function worker(): Promise<void> {
    while (true) {
      // Claiming the index is synchronous, so no two workers take the same case.
      const caseIndex = cursor++;
      if (caseIndex >= input.cases.length) {
        return;
      }
      const evalCase = input.cases[caseIndex];
      results[caseIndex] = {
        id: evalCase.id,
        prompt: evalCase.prompt,
        initialPath: evalCase.initialPath,
        expectedPath: evalCase.expectedPath,
        attempts: await runCaseAttempts({
          caseIndex,
          evalCase,
          runs: input.runs,
          judgeModel,
          judgeThreshold: input.modeRunner.judgeThreshold ?? 80,
          modeRunner: input.modeRunner,
          totalCases: input.cases.length,
          verbose: input.verbose ?? false,
          onProgress: input.onProgress,
        }),
      };
    }
  }

  await Promise.all(
    Array.from({ length: Math.min(concurrency, input.cases.length) }, () => worker())
  );

  return results;
}
|
||||
|
||||
/**
 * Executes one eval case `runs` times, sequentially, and scores each attempt.
 *
 * Per attempt: load the initial and expected fixtures, run the mode runner,
 * collect the "run succeeded" check plus the runner's own checks, gather
 * artifacts, then (only when the run succeeded) run the optional backend
 * validation and the judge. An attempt passes when every check passed.
 * Anything thrown along the way is recorded as a failed attempt with a
 * single "run crashed" check.
 *
 * Progress events are emitted for non-cli modes only; assistant streaming
 * events additionally require `verbose`.
 *
 * @param input - Case, runner and reporting options for this case.
 * @returns One result per attempt, in attempt order.
 */
async function runCaseAttempts<TInitial, TExpected, TActual>(input: {
  caseIndex: number;
  evalCase: EvalCase;
  runs: number;
  judgeModel: string;
  judgeThreshold: number;
  modeRunner: ModeRunner<TInitial, TExpected, TActual>;
  totalCases: number;
  verbose: boolean;
  onProgress?: (event: FrontendBenchmarkProgressEvent) => void;
}): Promise<BenchmarkAttemptResult[]> {
  const { evalCase, modeRunner } = input;
  const attempts: BenchmarkAttemptResult[] = [];
  const surface = modeRunner.mode === "cli" ? null : modeRunner.mode;

  for (let attempt = 1; attempt <= input.runs; attempt += 1) {
    // Fields shared by every progress event and by the runner context of this attempt.
    const position = {
      caseId: evalCase.id,
      caseNumber: input.caseIndex + 1,
      totalCases: input.totalCases,
      attempt,
      runs: input.runs,
    };

    if (surface) {
      input.onProgress?.({ type: "attempt-start", surface, ...position });
    }

    const startedAt = Date.now();

    try {
      const initial = await modeRunner.loadInitial(evalCase.initialPath);
      const expected = await modeRunner.loadExpected(evalCase.expectedPath);

      const run = await modeRunner.run(evalCase.prompt, initial, {
        ...position,
        verbose: input.verbose,
        onAssistantMessageStart:
          input.verbose && surface
            ? () => input.onProgress?.({ type: "assistant-message-start", surface, ...position })
            : undefined,
        onAssistantChunk:
          input.verbose && surface
            ? (chunk: string) =>
                input.onProgress?.({ type: "assistant-chunk", surface, ...position, chunk })
            : undefined,
        onAssistantMessageEnd:
          input.verbose && surface
            ? () => input.onProgress?.({ type: "assistant-message-end", surface, ...position })
            : undefined,
      });

      const validationInput = {
        evalCase,
        prompt: evalCase.prompt,
        initial,
        expected,
        actual: run.actual,
        run,
      };

      const checks: BenchmarkCheck[] = [
        buildCheck("run succeeded", run.success, run.error),
        ...modeRunner.validate(validationInput),
      ];
      const artifactFiles = modeRunner.buildArtifacts?.(run.actual) ?? [];

      if (run.success && modeRunner.backendValidate) {
        try {
          const backendValidation = await modeRunner.backendValidate({
            ...validationInput,
            // Backend validation gets the same position but no streaming callbacks.
            context: {
              ...position,
              verbose: input.verbose,
              onAssistantMessageStart: undefined,
              onAssistantChunk: undefined,
              onAssistantMessageEnd: undefined,
            },
          });

          if (backendValidation) {
            checks.push(...backendValidation.checks);
            artifactFiles.push(...(backendValidation.artifactFiles ?? []));
          }
        } catch (error) {
          // A throwing backend validator fails the attempt without discarding the other checks.
          const details = error instanceof Error ? error.message : String(error);
          checks.push(buildCheck("backend validation succeeded", false, details));
        }
      }

      let judgeScore: number | null = null;
      let judgeSummary: string | null = null;

      if (run.success) {
        const judge = await judgeOutput({
          mode: modeRunner.mode,
          prompt: evalCase.prompt,
          checklist: evalCase.judgeChecklist,
          initial,
          // The expected fixture is withheld from the judge in cli mode.
          expected: modeRunner.mode === "cli" ? undefined : expected,
          actual: run.actual,
          model: input.judgeModel,
        });

        judgeScore = judge.success ? judge.score : null;
        judgeSummary = judge.summary;

        // A failed judge counts as score 0, so it also fails the threshold check.
        checks.push(
          buildCheck("judge succeeded", judge.success, judge.error),
          buildCheck(
            `judge score >= ${input.judgeThreshold}`,
            (judgeScore ?? 0) >= input.judgeThreshold,
            judge.success ? `score=${judgeScore}` : judge.error
          )
        );
      }

      const attemptResult: BenchmarkAttemptResult = {
        attempt,
        passed: checks.every((check) => check.passed),
        durationMs: Date.now() - startedAt,
        assistantMessageCount: run.assistantMessageCount,
        toolCallCount: run.toolCallCount,
        toolsUsed: uniqueStrings(run.toolsUsed),
        skillsInvoked: uniqueStrings(run.skillsInvoked),
        checks,
        judgeScore,
        judgeSummary,
        error: run.error ?? null,
        tokenUsage: run.tokenUsage ?? null,
        artifactsPath: null,
        artifactFiles,
      };

      if (surface) {
        input.onProgress?.({
          type: "attempt-finish",
          surface,
          ...position,
          passed: attemptResult.passed,
          durationMs: attemptResult.durationMs,
          judgeScore: attemptResult.judgeScore,
          error: attemptResult.error,
        });
      }

      attempts.push(attemptResult);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      const failedAttempt: BenchmarkAttemptResult = {
        attempt,
        passed: false,
        durationMs: Date.now() - startedAt,
        assistantMessageCount: 0,
        toolCallCount: 0,
        toolsUsed: [],
        skillsInvoked: [],
        checks: [buildCheck("run crashed", false, message)],
        judgeScore: null,
        judgeSummary: null,
        error: message,
        tokenUsage: null,
      };

      if (surface) {
        input.onProgress?.({
          type: "attempt-finish",
          surface,
          ...position,
          passed: false,
          durationMs: failedAttempt.durationMs,
          judgeScore: null,
          error: message,
        });
      }

      attempts.push(failedAttempt);
    }
  }

  return attempts;
}
|
||||
|
||||
/**
 * Creates a check record; `details` is only attached when it is a non-empty string.
 *
 * @param name - Check name.
 * @param passed - Whether the check passed.
 * @param details - Optional explanation.
 */
function buildCheck(name: string, passed: boolean, details?: string): BenchmarkCheck {
  const check: BenchmarkCheck = { name, passed };
  if (details) {
    check.details = details;
  }
  return check;
}
|
||||
|
||||
/**
 * Removes duplicate strings, keeping the first occurrence of each in order.
 *
 * @param values - Strings to de-duplicate.
 * @returns A new array; the input is left untouched.
 */
function uniqueStrings(values: string[]): string[] {
  return Array.from(new Set(values));
}
|
||||
255
ai_evals/core/types.ts
Normal file
255
ai_evals/core/types.ts
Normal file
@@ -0,0 +1,255 @@
|
||||
/** Every eval mode, as a readonly tuple that can be iterated at runtime. */
export const EVAL_MODES = ["cli", "flow", "script", "app"] as const;

/** Union of the mode names listed in `EVAL_MODES`. */
export type EvalMode = (typeof EVAL_MODES)[number];
|
||||
|
||||
/**
 * Backend preview settings of an eval case.
 * NOTE(review): the consumer is not visible in this file; field meanings are
 * taken from their names only.
 */
export interface EvalCaseRuntimeBackendPreview {
  /** Arguments keyed by name. */
  args?: Record<string, unknown>;
  /** Timeout, in seconds. */
  timeoutSeconds?: number;
}
|
||||
|
||||
/** Optional runtime settings attached to an eval case through `EvalCase.runtime`. */
export interface EvalCaseRuntimeSpec {
  /** Backend preview settings, when the case uses them. */
  backendPreview?: EvalCaseRuntimeBackendPreview;
}
|
||||
|
||||
/**
 * Declarative expectations attached to an eval case through `EvalCase.validate`.
 *
 * Every field is optional.
 * NOTE(review): the validator that interprets these rules is not visible
 * here; confirm exact semantics against it before relying on any field.
 */
export interface FlowValidationSpec {
  schemaRequiredPaths?: string[];
  schemaAnyOf?: {
    requiredPaths: string[];
  }[];
  exactTopLevelStepIds?: string[];
  topLevelStepIds?: string[];
  topLevelStepOrder?: string[];
  topLevelStepTypeCountsAtLeast?: {
    type: string;
    count: number;
  }[];
  topLevelStepTypes?: {
    id: string;
    type: string;
  }[];
  /** Per-module rules, keyed by module `id`. */
  moduleRules?: {
    id: string;
    hasStopAfterIf?: boolean;
    hasStopAfterAllItersIf?: boolean;
    immediateChildStepIds?: string[];
    exactImmediateChildStepIds?: string[];
    immediateChildStepTypes?: {
      id: string;
      type: string;
    }[];
    requiredInputTransforms?: {
      type?: string;
      expr?: string;
      exprAnyOf?: string[];
      value?: string | number | boolean | null;
    }[];
  }[];
  /** Expected primitive value at `path` inside the module with the given `id`. */
  moduleFieldRules?: {
    id: string;
    path: string;
    equals: string | number | boolean | null;
  }[];
  resolveResultsRefs?: boolean;
  requireSpecialModules?: ("preprocessor_module" | "failure_module")[];
  requireSuspendSteps?: {
    id: string;
    requiredEvents?: number;
    resumeRequiredStringFieldAnyOf?: string[];
  }[];
}
|
||||
|
||||
/** A single eval case: a prompt plus optional fixtures and expectations. */
export interface EvalCase {
  /** Case identifier; used in results, progress events and artifact directories. */
  id: string;
  /** Prompt handed to the mode runner. */
  prompt: string;
  /** Path passed to the runner's `loadInitial`. */
  initialPath?: string;
  /** Path passed to the runner's `loadExpected`. */
  expectedPath?: string;
  /** Declarative validation rules for the case. */
  validate?: FlowValidationSpec;
  /** Checklist forwarded to the judge. */
  judgeChecklist?: string[];
  /** Optional runtime settings. */
  runtime?: EvalCaseRuntimeSpec;
}
|
||||
|
||||
/** Outcome of one named check on an attempt. */
export interface BenchmarkCheck {
  /** Check name, shown in failure summaries. */
  name: string;
  /** Whether the check passed; an attempt passes only when all of its checks do. */
  passed: boolean;
  /** Optional explanation, e.g. an error message or the judge score. */
  details?: string;
}
|
||||
|
||||
/** Result returned by the judge for one attempt. */
export interface JudgeResult {
  /** Whether judging itself succeeded; the score is ignored by the suite when false. */
  success: boolean;
  /** Score, compared against the runner's judge threshold. */
  score: number;
  /** Judge's summary text. */
  summary: string;
  /** Error description when judging failed. */
  error?: string;
}
|
||||
|
||||
/** An in-memory file to be written into an attempt's artifact directory. */
export interface BenchmarkArtifactFile {
  /** Path relative to the attempt's artifact directory. */
  path: string;
  /** File content, written as UTF-8 text. */
  content: string;
}
|
||||
|
||||
/** What a runner's optional backend validation contributes to an attempt. */
export interface BackendValidationResult {
  /** Checks appended to the attempt's checks. */
  checks: BenchmarkCheck[];
  /** Extra artifact files appended to the attempt's artifacts. */
  artifactFiles?: BenchmarkArtifactFile[];
}
|
||||
|
||||
/** Token counts for an attempt, or summed/averaged over several attempts. */
export interface BenchmarkTokenUsage {
  /** Prompt tokens. */
  prompt: number;
  /** Completion tokens. */
  completion: number;
  /** Total as reported by the runner; aggregated independently of the other two fields. */
  total: number;
}
|
||||
|
||||
/** What a mode runner reports after executing one attempt. */
export interface ModeRunOutput<TActual> {
  /** Whether the run completed successfully; backend validation and judging are skipped when false. */
  success: boolean;
  /** State produced by the run, handed to validation, artifacts and the judge. */
  actual: TActual;
  /** Error description for a failed run. */
  error?: string;
  assistantMessageCount: number;
  toolCallCount: number;
  /** Names of tools used; de-duplicated by the suite before being stored. */
  toolsUsed: string[];
  /** Names of skills invoked; de-duplicated by the suite before being stored. */
  skillsInvoked: string[];
  /** Token usage, when the runner can report it. */
  tokenUsage?: BenchmarkTokenUsage | null;
}
|
||||
|
||||
/** Position and reporting hooks handed to a runner for a single attempt. */
export interface ModeRunContext {
  caseId: string;
  /** 1-based position of the case within the suite. */
  caseNumber: number;
  totalCases: number;
  /** 1-based attempt number. */
  attempt: number;
  /** Number of attempts per case. */
  runs: number;
  verbose: boolean;
  /** Streaming hooks; the suite only provides them for verbose, non-cli runs. */
  onAssistantMessageStart?: () => void;
  onAssistantChunk?: (chunk: string) => void;
  onAssistantMessageEnd?: () => void;
}
|
||||
|
||||
/**
 * Adapter that lets the suite execute and score cases for one eval mode.
 *
 * @typeParam TInitial - Shape of the loaded initial fixture.
 * @typeParam TExpected - Shape of the loaded expected fixture.
 * @typeParam TActual - Shape of the state a run produces.
 */
export interface ModeRunner<TInitial, TExpected, TActual> {
  mode: EvalMode;
  /** Default number of parallel workers when the caller gives none. */
  concurrency: number;
  /** Minimum judge score for an attempt to pass; the suite falls back to 80. */
  judgeThreshold?: number;

  /** Loads the initial fixture of a case; called once per attempt. */
  loadInitial(path?: string): Promise<TInitial | undefined>;

  /** Loads the expected fixture of a case; called once per attempt. */
  loadExpected(path?: string): Promise<TExpected | undefined>;

  /** Executes the prompt against the initial state. */
  run(
    prompt: string,
    initial: TInitial | undefined,
    context: ModeRunContext
  ): Promise<ModeRunOutput<TActual>>;

  /** Synchronous checks on the run output; called for failed runs too. */
  validate(input: {
    evalCase: EvalCase;
    prompt: string;
    initial: TInitial | undefined;
    expected: TExpected | undefined;
    actual: TActual;
    run: ModeRunOutput<TActual>;
  }): BenchmarkCheck[];

  /** Optional asynchronous validation; only called when the run succeeded. */
  backendValidate?(input: {
    evalCase: EvalCase;
    prompt: string;
    initial: TInitial | undefined;
    expected: TExpected | undefined;
    actual: TActual;
    run: ModeRunOutput<TActual>;
    context: ModeRunContext;
  }): Promise<BackendValidationResult | null>;

  /** Optional conversion of the run output into artifact files. */
  buildArtifacts?(actual: TActual): BenchmarkArtifactFile[];
}
|
||||
|
||||
/** Scored outcome of one attempt at a case. */
export interface BenchmarkAttemptResult {
  /** 1-based attempt number. */
  attempt: number;
  /** True when every entry in `checks` passed. */
  passed: boolean;
  /** Wall-clock duration of the attempt in milliseconds. */
  durationMs: number;
  assistantMessageCount: number;
  toolCallCount: number;
  toolsUsed: string[];
  skillsInvoked: string[];
  checks: BenchmarkCheck[];
  /** Judge score; `null` when the judge did not run or did not succeed. */
  judgeScore: number | null;
  judgeSummary: string | null;
  /** Run error or crash message, if any. */
  error: string | null;
  tokenUsage?: BenchmarkTokenUsage | null;
  /** Directory the attempt's artifacts were written to; set when artifacts are written. */
  artifactsPath?: string | null;
  /** In-memory artifact contents; stripped before the result is serialized. */
  artifactFiles?: BenchmarkArtifactFile[];
}
|
||||
|
||||
/** All attempts for one case, together with the case fields copied from its definition. */
export interface BenchmarkCaseResult {
  id: string;
  prompt: string;
  initialPath?: string;
  expectedPath?: string;
  /** Attempt results in attempt order. */
  attempts: BenchmarkAttemptResult[];
}
|
||||
|
||||
/** Aggregated outcome of a whole benchmark run, as written to the result file. */
export interface BenchmarkRunResult {
  /** Result format version. */
  version: 1;
  mode: EvalMode;
  /** ISO-8601 creation timestamp. */
  createdAt: string;
  /** Commit SHA of the repository, or `null` when it could not be read. */
  gitSha: string | null;
  /** Attempts per case. */
  runs: number;
  runModel: string | null;
  judgeModel: string | null;
  caseCount: number;
  attemptCount: number;
  passedAttempts: number;
  /** `passedAttempts / attemptCount`, or 0 when there were no attempts. */
  passRate: number;
  /** Mean attempt duration in milliseconds, or 0 when there were no attempts. */
  averageDurationMs: number;
  /** Sum over attempts that reported usage; `null` when none did. */
  totalTokenUsage?: BenchmarkTokenUsage | null;
  /** `totalTokenUsage` divided by `attemptCount`. */
  averageTokenUsagePerAttempt?: BenchmarkTokenUsage | null;
  /** Root directory of written artifacts; `null` when none were written. */
  artifactsPath?: string | null;
  cases: BenchmarkCaseResult[];
}
|
||||
|
||||
/** Modes that emit progress events: every eval mode except "cli". */
type ProgressSurface = Exclude<EvalMode, "cli">;

/** Fields shared by every per-attempt progress event. */
interface AttemptProgressFields {
  surface: ProgressSurface;
  caseId: string;
  /** 1-based position of the case within the suite. */
  caseNumber: number;
  totalCases: number;
  /** 1-based attempt number. */
  attempt: number;
  runs: number;
}

/**
 * Progress events emitted while a non-cli benchmark runs, discriminated by `type`.
 *
 * "run-start" is sent once before any case starts; the remaining events
 * describe a single attempt. The "assistant-*" events are only produced for
 * verbose runs.
 */
export type FrontendBenchmarkProgressEvent =
  | {
      type: "run-start";
      surface: ProgressSurface;
      totalCases: number;
      runs: number;
      concurrency: number;
    }
  | (AttemptProgressFields & { type: "attempt-start" })
  | (AttemptProgressFields & {
      type: "attempt-finish";
      passed: boolean;
      durationMs: number;
      judgeScore: number | null;
      error: string | null;
    })
  | (AttemptProgressFields & { type: "assistant-message-start" })
  | (AttemptProgressFields & { type: "assistant-chunk"; chunk: string })
  | (AttemptProgressFields & { type: "assistant-message-end" });
|
||||
36
ai_evals/core/validators.test.ts
Normal file
36
ai_evals/core/validators.test.ts
Normal file
@@ -0,0 +1,36 @@
|
||||
import { describe, expect, it } from "bun:test";
|
||||
import { validateScriptState } from "./validators";
|
||||
|
||||
// Covers validateScriptState: implementations that differ only in formatting
// or annotations must pass, while a non-exported `main` must be flagged.
describe("validateScriptState", () => {
  const SCRIPT_PATH = "f/evals/greet_user.ts";

  it("accepts semantically equivalent script implementations", () => {
    // Same behavior as the expected script, but with a return type annotation,
    // semicolons and space indentation.
    const actualCode =
      "export async function main(name: string): Promise<string> {\n return `Hello, ${name}!`;\n}\n";
    const expectedCode = "export async function main(name: string) {\n\treturn `Hello, ${name}!`\n}\n";

    const checks = validateScriptState({
      actual: { path: SCRIPT_PATH, lang: "bun", code: actualCode },
      expected: { path: SCRIPT_PATH, lang: "bun", code: expectedCode },
    });

    expect(checks.every((check) => check.passed)).toBe(true);
  });

  it("still requires an exported main entrypoint", () => {
    // `main` exists here but is not exported.
    const unexportedCode = "async function main(name: string) {\n return `Hello, ${name}!`;\n}\n";

    const checks = validateScriptState({
      actual: { path: SCRIPT_PATH, lang: "bun", code: unexportedCode },
    });

    expect(checks).toContainEqual({
      name: "script exports entrypoint",
      passed: false,
    });
  });
});
|
||||
1281
ai_evals/core/validators.ts
Normal file
1281
ai_evals/core/validators.ts
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,2 @@
|
||||
main(name: string)
|
||||
greeting: `Hello, ${name}!`
|
||||
@@ -0,0 +1,3 @@
|
||||
export async function main(name: string) {
|
||||
return { greeting: `Hello, ${name}!` };
|
||||
}
|
||||
@@ -0,0 +1,2 @@
|
||||
type: script
|
||||
path: f/lib/format_greeting
|
||||
@@ -0,0 +1,3 @@
|
||||
export async function main(name: string) {
|
||||
return { greeting: `Hello, ${name}!` };
|
||||
}
|
||||
@@ -0,0 +1,2 @@
|
||||
def main(
|
||||
return {"total": a + b}
|
||||
@@ -0,0 +1,20 @@
|
||||
summary: Simple greeting flow
|
||||
schema:
|
||||
type: object
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
description: Name to greet
|
||||
required:
|
||||
- name
|
||||
value:
|
||||
modules:
|
||||
- id: hello_step
|
||||
value:
|
||||
type: rawscript
|
||||
language: bun
|
||||
content: !inline hello.ts
|
||||
input_transforms:
|
||||
name:
|
||||
type: javascript
|
||||
expr: flow_input.name
|
||||
@@ -0,0 +1,3 @@
|
||||
export async function main(name: string) {
|
||||
return { greeting: `Hello, ${name}!` };
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
export async function main(name: string) {
|
||||
return { greeting: `Hello, ${name}!` };
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
export async function main(name: string) {
|
||||
return { greeting: `Hello, ${name}!` };
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user