fix: improve flow chat and benchmark coverage (#8825)

* fix: support special flow modules in evals Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * refactor: extract shared flow helper logic Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix: make special flow tools openai-compatible Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix: improve flow eval prompts and validation Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * test: relax flow benchmark overfits Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * test: record updated flow benchmark history Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix: address flow review findings Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * refactor: source flow chat special module prompt Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix: narrow rawscript helper return type Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * refactor: dedupe flow chat prompt guidance Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix: relax flow test10 validation Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
sqlx
2026-04-15 16:22:39 +00:00 · 2026-04-15 15:14:44 +00:00 · 2026-04-15 15:11:25 +00:00 · 2026-04-15 12:05:11 +00:00 · 2026-04-15 11:49:57 +00:00 · 2026-04-15 11:01:07 +00:00
2755 changed files with 216634 additions and 51156 deletions
--- a/.agents/skills/commit/SKILL.md
+++ b/.agents/skills/commit/SKILL.md
@@ -0,0 +1,59 @@
+---
+name: commit
+description: Create a git commit with conventional commit format. MUST use anytime you want to commit changes.
+---
+
+# Git Commit Skill
+
+Create a focused, single-line commit following conventional commit conventions.
+
+## Instructions
+
+1. **Analyze changes**: Run `git status` and `git diff` to understand what was modified
+2. **Stage only modified files**: Add files individually by name. NEVER use `git add -A` or `git add .`
+3. **Write commit message**: Follow the conventional commit format as a single line
+
+## Conventional Commit Format
+
+```
+<type>: <description>
+```
+
+### Types
+- `feat`: New feature or capability
+- `fix`: Bug fix
+- `refactor`: Code change that neither fixes a bug nor adds a feature
+- `docs`: Documentation only changes
+- `style`: Formatting, missing semicolons, etc (no code change)
+- `test`: Adding or correcting tests
+- `chore`: Maintenance tasks, dependency updates, etc
+- `perf`: Performance improvement
+
+### Rules
+- Subject MUST be a single line (no multi-line body; the `Co-Authored-By` trailer added in Execution Steps is the only extra line)
+- Description should be lowercase, imperative mood ("add" not "added")
+- No period at the end
+- Keep under 72 characters total
+
+### Examples
+```
+feat: add token usage tracking for AI providers
+fix: resolve null pointer in job executor
+refactor: extract common validation logic
+docs: update API endpoint documentation
+chore: upgrade sqlx to 0.7
+```
+
+## Execution Steps
+
+1. Run `git status` to see all changes
+2. Run `git diff` to understand the changes in detail
+3. Run `git log --oneline -5` to see recent commit style
+4. Stage ONLY the modified/relevant files: `git add <file1> <file2> ...`
+5. Create the commit with conventional format:
+   ```bash
+   git commit -m "<type>: <description>
+
+   Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>"
+   ```
+6. Run `git status` to verify the commit succeeded
--- a/.agents/skills/local-review/SKILL.md
+++ b/.agents/skills/local-review/SKILL.md
@@ -0,0 +1,97 @@
+---
+name: local-review
+description: Code review a pull request for bugs and CLAUDE.md compliance. MUST use when asked to review code.
+---
+
+# Local Code Review Skill
+
+Review a pull request for real bugs and CLAUDE.md compliance violations. This review targets HIGH SIGNAL issues only.
+
+## Review Philosophy
+
+- **Only flag issues you are certain about.** If you are not sure an issue is real, do not flag it. False positives erode trust and waste reviewer time.
+- Think like a senior engineer doing a final review — flag things that would cause incidents, not things that are merely imperfect.
+
+## What to Flag
+
+- Code that won't compile or parse (syntax errors, type errors, missing imports)
+- Code that will definitely produce wrong results regardless of inputs
+- Clear, unambiguous CLAUDE.md violations (quote the exact rule being violated)
+- Security issues in introduced code (injection, auth bypass, data exposure)
+- Incorrect logic that will fail in production
+
+## What NOT to Flag
+
+- Code style or quality concerns
+- Potential issues that depend on specific inputs or runtime state
+- Subjective suggestions or improvements
+- Pre-existing issues not introduced by this PR
+- Pedantic nitpicks a senior engineer wouldn't flag
+- Issues a linter or type checker will catch
+- General quality concerns unless explicitly prohibited in CLAUDE.md
+- Issues silenced via lint ignore comments
+
+## Execution Steps
+
+1. **Determine the PR scope**:
+   - If an argument is provided, use it as the PR number or branch
+   - Otherwise, detect from the current branch vs main
+   - Run `gh pr view` if a PR exists, or use `git diff main...HEAD`
+
+2. **Find relevant CLAUDE.md files**:
+   - Read the root `CLAUDE.md`
+   - Check for CLAUDE.md files in directories containing changed files
+
+3. **Get the diff and metadata**:
+   - `gh pr diff` or `git diff main...HEAD` for the full diff
+   - `gh pr view` or `git log main..HEAD --oneline` for context
+
+4. **Read changed files** where the diff alone is insufficient to understand context
+
+5. **Review for**:
+   - CLAUDE.md compliance — check each rule against the changed code
+   - Bugs and logic errors — will this code work correctly?
+   - Security issues — injection, auth, data exposure in new code
+
+6. **Self-validate each finding**: Before reporting, ask yourself:
+   - "Is this definitely a real issue, not a false positive?"
+   - "Would a senior engineer flag this in review?"
+   - If the answer to either is no, discard the finding
+
+7. **Output findings** to the terminal (default) or post as PR comments (with `--comment` flag)
+
+## Output Format
+
+```
+## Code review
+
+Found N issues:
+
+1. <description> (<reason: CLAUDE.md adherence | bug | security>)
+   <file_path:line_number>
+
+2. <description> (<reason>)
+   <file_path:line_number>
+```
+
+If no issues are found:
+
+```
+## Code review
+
+No issues found. Checked for bugs and CLAUDE.md compliance.
+```
+
+## Posting Comments (--comment flag)
+
+If the user passes `--comment`, post the findings as a single PR review comment using:
+
+```bash
+gh pr review --comment --body "<summary>"
+```
+
+Or for inline comments on specific lines:
+
+```bash
+gh api repos/{owner}/{repo}/pulls/{pr}/reviews -f body="<summary>" -f event="COMMENT" -f comments="[...]"
+```
--- a/.agents/skills/native-trigger/SKILL.md
+++ b/.agents/skills/native-trigger/SKILL.md
@@ -0,0 +1,782 @@
+---
+name: native-trigger
+description: Guidance for adding native trigger services to Windmill. Use when implementing or modifying native trigger integrations across the backend and frontend.
+---
+
+# Skill: Adding Native Trigger Services
+
+This skill provides comprehensive guidance for adding new native trigger services to Windmill. Native triggers allow external services (like Nextcloud, Google Drive, etc.) to trigger Windmill scripts/flows via webhooks or push notifications.
+
+## Architecture Overview
+
+The native trigger system consists of:
+
+1. **Database Layer** - PostgreSQL tables and enum types
+2. **Backend Rust Implementation** - Core trait, handlers, and service modules in the `windmill-native-triggers` crate
+3. **Frontend Svelte Components** - Configuration forms and UI components
+
+### Key Files
+
+| Component | Path |
+|-----------|------|
+| Core module with `External` trait | `backend/windmill-native-triggers/src/lib.rs` |
+| Generic CRUD handlers | `backend/windmill-native-triggers/src/handler.rs` |
+| Background sync logic | `backend/windmill-native-triggers/src/sync.rs` |
+| OAuth/workspace integration | `backend/windmill-native-triggers/src/workspace_integrations.rs` |
+| Re-export shim (windmill-api) | `backend/windmill-api/src/native_triggers/mod.rs` |
+| TriggerKind enum | `backend/windmill-common/src/triggers.rs` |
+| JobTriggerKind enum | `backend/windmill-common/src/jobs.rs` |
+| Frontend service registry | `frontend/src/lib/components/triggers/native/utils.ts` |
+| Frontend trigger utilities | `frontend/src/lib/components/triggers/utils.ts` |
+| Trigger badges (icons + counts) | `frontend/src/lib/components/graph/renderers/triggers/TriggersBadge.svelte` |
+| Workspace integrations UI | `frontend/src/lib/components/workspaceSettings/WorkspaceIntegrations.svelte` |
+| OAuth config form component | `frontend/src/lib/components/workspaceSettings/OAuthClientConfig.svelte` |
+| OpenAPI spec | `backend/windmill-api/openapi.yaml` |
+| Reference: Nextcloud module | `backend/windmill-native-triggers/src/nextcloud/` |
+| Reference: Google module | `backend/windmill-native-triggers/src/google/` |
+
+### Crate Structure
+
+The native trigger code lives in the `windmill-native-triggers` crate (`backend/windmill-native-triggers/`). The `windmill-api` crate re-exports everything via a shim:
+
+```rust
+// backend/windmill-api/src/native_triggers/mod.rs
+pub use windmill_native_triggers::*;
+```
+
+All new service modules go in `backend/windmill-native-triggers/src/`.
+
+---
+
+## Core Concepts
+
+### The `External` Trait
+
+Every native trigger service implements the `External` trait defined in `lib.rs`:
+
+```rust
+#[async_trait]
+pub trait External: Send + Sync + 'static {
+    // Associated types:
+    type ServiceConfig: Debug + DeserializeOwned + Serialize + Send + Sync;
+    type TriggerData: Debug + Serialize + Send + Sync;
+    type OAuthData: DeserializeOwned + Serialize + Clone + Send + Sync;
+    type CreateResponse: DeserializeOwned + Send + Sync;
+
+    // Constants:
+    const SUPPORT_WEBHOOK: bool;
+    const SERVICE_NAME: ServiceName;
+    const DISPLAY_NAME: &'static str;
+    const TOKEN_ENDPOINT: &'static str;
+    const REFRESH_ENDPOINT: &'static str;
+    const AUTH_ENDPOINT: &'static str;
+
+    // Required methods:
+    async fn create(&self, w_id, oauth_data, webhook_token, data, db, tx) -> Result<Self::CreateResponse>;
+    async fn update(&self, w_id, oauth_data, external_id, webhook_token, data, db, tx) -> Result<serde_json::Value>;
+    async fn get(&self, w_id, oauth_data, external_id, db, tx) -> Result<Self::TriggerData>;
+    async fn delete(&self, w_id, oauth_data, external_id, db, tx) -> Result<()>;
+    async fn exists(&self, w_id, oauth_data, external_id, db, tx) -> Result<bool>;
+    async fn maintain_triggers(&self, db, workspace_id, triggers, oauth_data, synced, errors);
+    fn external_id_and_metadata_from_response(&self, resp) -> (String, Option<serde_json::Value>);
+
+    // Methods with defaults:
+    async fn prepare_webhook(&self, db, w_id, headers, body, script_path, is_flow) -> Result<PushArgsOwned>;
+    fn service_config_from_create_response(&self, data, resp) -> Option<serde_json::Value>;
+    fn additional_routes(&self) -> axum::Router;
+    async fn http_client_request<T, B>(&self, url, method, workspace_id, tx, db, headers, body) -> Result<T>;
+}
+```
+
+Key design points:
+- **`update()` returns `serde_json::Value`** - the resolved service_config to store. Each service is responsible for building the final config.
+- **`maintain_triggers()`** - periodic background maintenance. Each service implements its own strategy (Nextcloud: reconcile with external state; Google: renew expiring channels).
+- **No `list_all()` in the trait** - services that need it (Nextcloud) implement it privately; services that don't (Google) use different maintenance strategies.
+- **No `get_external_id_from_trigger_data()` or `extract_service_config_from_trigger_data()`** - removed in favor of the `maintain_triggers` pattern.
+
+### Create Lifecycle: Two Paths
+
+The `create_native_trigger` handler in `handler.rs` supports two creation flows, controlled by `service_config_from_create_response()`:
+
+**Path A: Short (Google pattern)** - `service_config_from_create_response()` returns `Some(config)`:
+1. `create()` registers on external service
+2. `external_id_and_metadata_from_response()` extracts the ID
+3. `service_config_from_create_response()` builds the config directly from input data + response metadata
+4. Stores trigger in DB -- done, no extra round-trip
+
+Use this when the external_id is known before the create call (e.g., Google generates the channel_id as a UUID upfront and includes it in the webhook URL).
+
+**Path B: Long (Nextcloud pattern)** - `service_config_from_create_response()` returns `None` (default):
+1. `create()` registers on external service (webhook URL has no external_id yet)
+2. `external_id_and_metadata_from_response()` extracts the ID
+3. `update()` is called to fix the webhook URL with the now-known external_id
+4. `update()` returns the resolved service_config
+5. Stores trigger in DB
+
+Use this when the external_id is assigned by the remote service and the webhook URL needs to be corrected after creation.
+
+### OAuth Token Storage (Three-Table Pattern)
+
+OAuth tokens are stored across three tables, NOT in `workspace_integrations.oauth_data` directly:
+
+| Table | What's Stored |
+|-------|---------------|
+| `workspace_integrations` | `oauth_data` JSON with `base_url`, `client_id`, `client_secret`, `instance_shared` flag; `resource_path` pointing to the variable |
+| `variable` | Encrypted `access_token` (at the path stored in `resource_path`), linked to `account` via `account` column |
+| `account` | `refresh_token`, keyed by `workspace_id` + `client` (service name) + `is_workspace_integration = true` |
+
+The `decrypt_oauth_data()` function in `lib.rs` assembles these into a unified struct:
+```rust
+pub struct OAuthConfig {
+    pub base_url: String,
+    pub access_token: String,      // decrypted from variable
+    pub refresh_token: Option<String>, // from account table
+    pub client_id: String,         // from oauth_data or instance settings
+    pub client_secret: String,     // from oauth_data or instance settings
+}
+```
+
+Instance-level sharing: when `oauth_data.instance_shared == true`, `client_id` and `client_secret` are read from global settings instead of workspace_integrations.
+
+### URL Resolution
+
+The `resolve_endpoint()` helper handles both absolute and relative OAuth URLs:
+
+```rust
+pub fn resolve_endpoint(base_url: &str, endpoint: &str) -> String {
+    if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
+        endpoint.to_string()  // Google: absolute URLs
+    } else {
+        format!("{}{}", base_url, endpoint)  // Nextcloud: relative paths
+    }
+}
+```
+
+### ServiceName Methods
+
+`ServiceName` is the central registry enum. Every variant needs a match arm in each of these methods and trait implementations:
+
+| Method | Purpose |
+|--------|---------|
+| `as_str()` | Lowercase identifier (e.g., `"google"`) |
+| `as_trigger_kind()` | Maps to `TriggerKind` enum |
+| `as_job_trigger_kind()` | Maps to `JobTriggerKind` enum |
+| `token_endpoint()` | OAuth token endpoint (relative or absolute) |
+| `auth_endpoint()` | OAuth authorization endpoint |
+| `oauth_scopes()` | Space-separated OAuth scopes |
+| `resource_type()` | Resource type for token storage (e.g., `"gworkspace"`) |
+| `extra_auth_params()` | Extra OAuth params (e.g., Google needs `access_type=offline`, `prompt=consent`) |
+| `integration_service()` | Maps to the workspace integration service (usually `*self`) |
+| `TryFrom<String>` | Parse from string |
+| `Display` | Delegates to `as_str()` |
+
+---
+
+## Step-by-Step Implementation Guide
+
+### Step 1: Database Migration
+
+Create a new migration file: `backend/migrations/YYYYMMDDHHMMSS_newservice_trigger.up.sql`
+
+```sql
+-- Add the service to the native_trigger_service enum
+ALTER TYPE native_trigger_service ADD VALUE IF NOT EXISTS 'newservice';
+
+-- Add to TRIGGER_KIND enum (used for trigger tracking)
+ALTER TYPE TRIGGER_KIND ADD VALUE IF NOT EXISTS 'newservice';
+
+-- Add to job_trigger_kind enum (used for job tracking)
+ALTER TYPE job_trigger_kind ADD VALUE IF NOT EXISTS 'newservice';
+```
+
+Also create the corresponding down migration.
+
+### Step 2: Update windmill-common Enums
+
+#### `backend/windmill-common/src/triggers.rs`
+
+Add variant to `TriggerKind` enum, and update `to_key()` and `fmt()` implementations.
+
+#### `backend/windmill-common/src/jobs.rs`
+
+Add variant to `JobTriggerKind` enum and update the `Display` implementation.
+
+### Step 3: Backend Service Module
+
+Create a new directory: `backend/windmill-native-triggers/src/newservice/`
+
+#### `mod.rs` - Type Definitions
+
+```rust
+use serde::{Deserialize, Serialize};
+
+pub mod external;
+// pub mod routes; // Only if you need additional service-specific routes
+
+/// OAuth data deserialized from the three-table pattern.
+/// The actual structure is built by decrypt_oauth_data() from variable + account + workspace_integrations.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct NewServiceOAuthData {
+    pub base_url: String,              // from workspace_integrations.oauth_data
+    pub access_token: String,          // decrypted from variable table
+    pub refresh_token: Option<String>, // from account table
+    // Note: client_id and client_secret are in OAuthConfig, not here
+    // unless the service needs them at runtime for API calls
+}
+
+/// Configuration provided by user when creating/updating a trigger.
+/// Stored as JSON in native_trigger.service_config.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct NewServiceConfig {
+    // Service-specific configuration fields
+    pub folder_path: String,
+    pub file_filter: Option<String>,
+}
+
+/// Data retrieved from the external service about a trigger.
+/// Returned by the get() method and shown in the UI.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct NewServiceTriggerData {
+    pub folder_path: String,
+    pub file_filter: Option<String>,
+    // Fields that shouldn't affect service_config comparison should use #[serde(skip_serializing)]
+}
+
+/// Response from external service when creating a trigger/webhook.
+#[derive(Debug, Deserialize)]
+pub struct CreateTriggerResponse {
+    pub id: String,
+}
+
+/// Handler struct (stateless, used for routing)
+#[derive(Copy, Clone)]
+pub struct NewService;
+```
+
+#### `external.rs` - External Trait Implementation
+
+```rust
+use async_trait::async_trait;
+use reqwest::Method;
+use sqlx::PgConnection;
+use std::collections::HashMap;
+use windmill_common::{
+    error::{Error, Result},
+    BASE_URL, DB,
+};
+
+use crate::{
+    generate_webhook_service_url, External, NativeTrigger, NativeTriggerData, ServiceName,
+    sync::{SyncError, TriggerSyncInfo},
+};
+use super::{NewService, NewServiceConfig, NewServiceOAuthData, NewServiceTriggerData, CreateTriggerResponse};
+
+#[async_trait]
+impl External for NewService {
+    type ServiceConfig = NewServiceConfig;
+    type TriggerData = NewServiceTriggerData;
+    type OAuthData = NewServiceOAuthData;
+    type CreateResponse = CreateTriggerResponse;
+
+    const SERVICE_NAME: ServiceName = ServiceName::NewService;
+    const DISPLAY_NAME: &'static str = "New Service";
+    const SUPPORT_WEBHOOK: bool = true;
+    const TOKEN_ENDPOINT: &'static str = "/oauth/token";
+    const REFRESH_ENDPOINT: &'static str = "/oauth/token";
+    const AUTH_ENDPOINT: &'static str = "/oauth/authorize";
+
+    async fn create(
+        &self,
+        w_id: &str,
+        oauth_data: &Self::OAuthData,
+        webhook_token: &str,
+        data: &NativeTriggerData<Self::ServiceConfig>,
+        db: &DB,
+        tx: &mut PgConnection,
+    ) -> Result<Self::CreateResponse> {
+        let base_url = &*BASE_URL.read().await;
+
+        // external_id is None during create (we get it from the response)
+        let webhook_url = generate_webhook_service_url(
+            base_url, w_id, &data.script_path, data.is_flow,
+            None, Self::SERVICE_NAME, webhook_token,
+        );
+
+        let url = format!("{}/api/webhooks/create", oauth_data.base_url);
+        let payload = serde_json::json!({
+            "callback_url": webhook_url,
+            "folder_path": data.service_config.folder_path,
+        });
+
+        let response: CreateTriggerResponse = self
+            .http_client_request(&url, Method::POST, w_id, tx, db, None, Some(&payload))
+            .await?;
+
+        Ok(response)
+    }
+
+    /// Update returns the resolved service_config as JSON.
+    /// For services using the update+get pattern, call self.get() and serialize.
+    async fn update(
+        &self,
+        w_id: &str,
+        oauth_data: &Self::OAuthData,
+        external_id: &str,
+        webhook_token: &str,
+        data: &NativeTriggerData<Self::ServiceConfig>,
+        db: &DB,
+        tx: &mut PgConnection,
+    ) -> Result<serde_json::Value> {
+        let base_url = &*BASE_URL.read().await;
+
+        let webhook_url = generate_webhook_service_url(
+            base_url, w_id, &data.script_path, data.is_flow,
+            Some(external_id), Self::SERVICE_NAME, webhook_token,
+        );
+
+        let url = format!("{}/api/webhooks/{}", oauth_data.base_url, external_id);
+        let payload = serde_json::json!({
+            "callback_url": webhook_url,
+            "folder_path": data.service_config.folder_path,
+        });
+
+        let _: serde_json::Value = self
+            .http_client_request(&url, Method::PUT, w_id, tx, db, None, Some(&payload))
+            .await?;
+
+        // Fetch back the updated state to get the resolved config
+        let trigger_data = self.get(w_id, oauth_data, external_id, db, tx).await?;
+        serde_json::to_value(&trigger_data)
+            .map_err(|e| Error::InternalErr(format!("Failed to serialize trigger data: {}", e)))
+    }
+
+    async fn get(
+        &self,
+        w_id: &str,
+        oauth_data: &Self::OAuthData,
+        external_id: &str,
+        db: &DB,
+        tx: &mut PgConnection,
+    ) -> Result<Self::TriggerData> {
+        let url = format!("{}/api/webhooks/{}", oauth_data.base_url, external_id);
+        self.http_client_request::<_, ()>(&url, Method::GET, w_id, tx, db, None, None).await
+    }
+
+    async fn delete(
+        &self,
+        w_id: &str,
+        oauth_data: &Self::OAuthData,
+        external_id: &str,
+        db: &DB,
+        tx: &mut PgConnection,
+    ) -> Result<()> {
+        let url = format!("{}/api/webhooks/{}", oauth_data.base_url, external_id);
+        let _: serde_json::Value = self
+            .http_client_request::<_, ()>(&url, Method::DELETE, w_id, tx, db, None, None)
+            .await
+            .or_else(|e| match &e {
+                Error::InternalErr(msg) if msg.contains("404") => Ok(serde_json::Value::Null),
+                _ => Err(e),
+            })?;
+        Ok(())
+    }
+
+    async fn exists(
+        &self,
+        w_id: &str,
+        oauth_data: &Self::OAuthData,
+        external_id: &str,
+        db: &DB,
+        tx: &mut PgConnection,
+    ) -> Result<bool> {
+        match self.get(w_id, oauth_data, external_id, db, tx).await {
+            Ok(_) => Ok(true),
+            Err(Error::NotFound(_)) => Ok(false),
+            Err(e) => Err(e),
+        }
+    }
+
+    /// Background maintenance. Choose the right pattern for your service:
+    /// - For services with queryable external state: use reconcile_with_external_state()
+    /// - For channel-based services with expiration: implement renewal logic
+    async fn maintain_triggers(
+        &self,
+        db: &DB,
+        workspace_id: &str,
+        triggers: &[NativeTrigger],
+        oauth_data: &Self::OAuthData,
+        synced: &mut Vec<TriggerSyncInfo>,
+        errors: &mut Vec<SyncError>,
+    ) {
+        // Option A: Reconcile with external state (Nextcloud pattern)
+        // Fetch all triggers from external service and compare with DB
+        let external_triggers = match self.list_all(workspace_id, oauth_data, db).await {
+            Ok(triggers) => triggers,
+            Err(e) => {
+                errors.push(SyncError {
+                    resource_path: format!("workspace:{}", workspace_id),
+                    error_message: format!("Failed to list triggers: {}", e),
+                    error_type: "api_error".to_string(),
+                });
+                return;
+            }
+        };
+
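+        // Convert to (external_id, config_json) pairs.
+        // NOTE: this reads `t.id`, so `TriggerData` must expose the external ID as an `id` field
+        // (the `NewServiceTriggerData` sketch in mod.rs does not declare one; add it for your service).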
+        let external_pairs: Vec<(String, serde_json::Value)> = external_triggers
+            .into_iter()
+            .map(|t| (t.id.clone(), serde_json::to_value(&t).unwrap_or_default()))
+            .collect();
+
+        crate::sync::reconcile_with_external_state(
+            db, workspace_id, Self::SERVICE_NAME, triggers, &external_pairs, synced, errors,
+        ).await;
+    }
+
+    fn external_id_and_metadata_from_response(
+        &self,
+        resp: &Self::CreateResponse,
+    ) -> (String, Option<serde_json::Value>) {
+        (resp.id.clone(), None)
+    }
+
+    // service_config_from_create_response: NOT overridden (returns None).
+    // This means the handler uses the update+get pattern after create.
+    // Override and return Some(...) to skip the update+get cycle (Google pattern).
+}
+
+impl NewService {
+    /// Private helper to list all triggers from the external service.
+    async fn list_all(
+        &self,
+        w_id: &str,
+        oauth_data: &<Self as External>::OAuthData,
+        db: &DB,
+    ) -> Result<Vec<<Self as External>::TriggerData>> {
+        // Implementation depends on the external service's API
+        todo!()
+    }
+}
+```
+
+### Step 4: Update lib.rs Registry
+
+In `backend/windmill-native-triggers/src/lib.rs`:
+
+```rust
+// Service modules - add new services here:
+#[cfg(feature = "native_trigger")]
+pub mod newservice;  // <-- Add this
+
+// ServiceName enum - add variant:
+pub enum ServiceName {
+    Nextcloud,
+    Google,
+    NewService,  // <-- Add this
+}
+
+// Then add match arms in ALL ServiceName methods:
+// as_str(), as_trigger_kind(), as_job_trigger_kind(), token_endpoint(),
+// auth_endpoint(), oauth_scopes(), resource_type(), extra_auth_params(),
+// integration_service(), TryFrom<String>, Display
+```
+
+### Step 5: Update handler.rs Routes
+
+In `backend/windmill-native-triggers/src/handler.rs`:
+
+```rust
+pub fn generate_native_trigger_routers() -> Router {
+    // ...
+    #[cfg(feature = "native_trigger")]
+    {
+        use crate::newservice::NewService;
+        return router
+            .nest("/nextcloud", service_routes(NextCloud))
+            .nest("/google", service_routes(Google))
+            .nest("/newservice", service_routes(NewService));  // <-- Add this
+    }
+    // ...
+}
+```
+
+### Step 6: Update sync.rs
+
+In `backend/windmill-native-triggers/src/sync.rs`:
+
+```rust
+pub async fn sync_all_triggers(db: &DB) -> Result<BackgroundSyncResult> {
+    // ...
+    #[cfg(feature = "native_trigger")]
+    {
+        use crate::newservice::NewService;
+
+        // ... existing service syncs ...
+
+        // New service sync
+        let (service_name, result) = sync_service_triggers(db, NewService).await;
+        total_synced += result.synced_triggers.len();
+        total_errors += result.errors.len();
+        service_results.insert(service_name, result);
+    }
+    // ...
+}
+```
+
+### Step 7: Frontend Service Registry
+
+In `frontend/src/lib/components/triggers/native/utils.ts`:
+
+Add to `NATIVE_TRIGGER_SERVICES`, `getTriggerIconName()`, and `getServiceIcon()`.
+
+### Step 8: Frontend Trigger Form Component
+
+Create: `frontend/src/lib/components/triggers/native/services/newservice/NewServiceTriggerForm.svelte`
+
+### Step 9: Frontend Icon Component
+
+Create: `frontend/src/lib/components/icons/NewServiceIcon.svelte`
+
+### Step 10: Update NativeTriggerEditor
+
+Check `frontend/src/lib/components/triggers/native/NativeTriggerEditor.svelte` to ensure it dynamically loads form components based on service name.
+
+### Step 11: Workspace Integration UI
+
+Add your service to the `supportedServices` map in `frontend/src/lib/components/workspaceSettings/WorkspaceIntegrations.svelte`:
+
+```typescript
+const supportedServices: Record<string, ServiceConfig> = {
+    // ... existing services ...
+    newservice: {
+        name: 'newservice',
+        displayName: 'New Service',
+        description: 'Connect to New Service for triggers',
+        icon: NewServiceIcon,
+        docsUrl: 'https://www.windmill.dev/docs/integrations/newservice',
+        requiresBaseUrl: false,  // false for cloud services, true for self-hosted
+        setupInstructions: [
+            'Step 1: Create an OAuth app on the service',
+            'Step 2: Configure the redirect URI shown below',
+            'Step 3: Enter the client credentials below'
+        ]
+    }
+}
+```
+
+### Step 12: Update `frontend/src/lib/components/triggers/utils.ts`
+
+Update ALL of these maps/functions:
+1. `triggerIconMap` - import and add icon
+2. `triggerDisplayNamesMap` - add display name
+3. `triggerTypeOrder` in `sortTriggers()` - add type
+4. `getLightConfig()` - add case for your service
+5. `getTriggerLabel()` - add case for your service
+6. `jobTriggerKinds` - add to array
+7. `countPropertyMap` - add count property
+8. `triggerSaveFunctions` - add save function
+
+### Step 13: Update TriggersBadge Component
+
+In `frontend/src/lib/components/graph/renderers/triggers/TriggersBadge.svelte`:
+
+1. Import the icon
+2. Add to `baseConfig` with `countKey` (the dynamic `availableNativeServices` loop does NOT set `countKey`)
+3. Add to the `allTypes` array
+
+### Step 14: Update TriggersWrapper.svelte
+
+In `frontend/src/lib/components/triggers/TriggersWrapper.svelte`:
+
+Add a `{:else if selectedTrigger.type === 'newservice'}` case that renders `<NativeTriggersPanel service="newservice" ...>` with the same props pattern as the existing native trigger cases (e.g., `nextcloud`).
+
+### Step 15: Update AddTriggersButton.svelte
+
+In `frontend/src/lib/components/triggers/AddTriggersButton.svelte`:
+
+1. Add `newserviceAvailable` state variable
+2. Add `setNewserviceState()` async function using `isServiceAvailable('newservice', $workspaceStore!)`
+3. Call it at module level
+4. Add a dropdown entry to `addTriggerItems` with `hidden: !newserviceAvailable`
+
+### Step 16: Update TriggersEditor.svelte Delete Handling
+
+In `frontend/src/lib/components/triggers/TriggersEditor.svelte`:
+
+Add your service to the `nativeTriggerServices` map in `deleteDeployedTrigger()`. Native triggers use `NativeTriggerService.deleteNativeTrigger({ workspace, serviceName, externalId })` instead of the standard `path`-based delete.
+
+### Step 17: Update OpenAPI Spec and Regenerate Types
+
+Add the new service to the `JobTriggerKind` enum in `backend/windmill-api/openapi.yaml`, then regenerate the frontend client:
+
+```bash
+cd frontend && npm run generate-backend-client
+```
+
+---
+
+## Special Patterns
+
+### Unified Service with `trigger_type` (Google Pattern)
+
+When a single service handles multiple trigger types (e.g., Google Drive + Calendar share OAuth and API patterns), use a single `ServiceName` variant with a discriminator field:
+
+```rust
+pub enum GoogleTriggerType { Drive, Calendar }
+
+pub struct GoogleServiceConfig {
+    pub trigger_type: GoogleTriggerType,
+    // Drive-specific fields (only used when trigger_type = Drive)
+    pub resource_id: Option<String>,
+    pub resource_name: Option<String>,
+    // Calendar-specific fields (only used when trigger_type = Calendar)
+    pub calendar_id: Option<String>,
+    pub calendar_name: Option<String>,
+    // Metadata set after creation
+    pub google_resource_id: Option<String>,
+    pub expiration: Option<String>,
+}
+```
+
+Branch in trait methods based on `trigger_type`. Frontend uses a `ToggleButtonGroup` to switch between types. This keeps the codebase simpler (one service, one OAuth flow, one set of routes).
+
+See `backend/windmill-native-triggers/src/google/` for the reference implementation.
+
+### Skipping update+get After Create (Google Pattern)
+
+Override `service_config_from_create_response()` to return `Some(config)` when the external_id is known before the create call:
+
+```rust
+fn service_config_from_create_response(
+    &self,
+    data: &NativeTriggerData<Self::ServiceConfig>,
+    resp: &Self::CreateResponse,
+) -> Option<serde_json::Value> {
+    // Clone input config, add metadata from response
+    let mut config = data.service_config.clone();
+    config.google_resource_id = Some(resp.resource_id.clone());
+    config.expiration = Some(resp.expiration.clone());
+    Some(serde_json::to_value(&config).unwrap())
+}
+```
+
+### Services with Absolute OAuth Endpoints (Google)
+
+Unlike self-hosted services where OAuth endpoints are relative paths appended to `base_url`, services like Google have absolute URLs:
+
+```rust
+// Nextcloud: relative paths
+ServiceName::Nextcloud => "/apps/oauth2/api/v1/token",
+// Google: absolute URLs
+ServiceName::Google => "https://oauth2.googleapis.com/token",
+```
+
+The `resolve_endpoint()` function handles both. For services with absolute endpoints:
+- `base_url` can be empty
+- `requiresBaseUrl: false` in the frontend workspace integration config
+- Add `extra_auth_params()` if needed (Google requires `access_type=offline` and `prompt=consent`)
+
+### Channel-Based Push Notifications with Renewal (Google Pattern)
+
+For services using expiring watch channels instead of persistent webhooks:
+
+1. Store expiration in `service_config` (as part of `ServiceConfig`)
+2. In `maintain_triggers()`, implement renewal logic instead of using `reconcile_with_external_state()`:
+   ```rust
+   async fn maintain_triggers(&self, db, workspace_id, triggers, oauth_data, synced, errors) {
+       for trigger in triggers {
+           if should_renew_channel(trigger) {
+               self.renew_channel(db, trigger, oauth_data).await;
+           }
+       }
+   }
+   ```
+3. Renewal: best-effort stop old channel, create new one with same external_id, update service_config with new expiration
+4. Google example: Drive channels expire in 24h (renew when <1h left), Calendar channels expire in 7 days (renew when <1 day left)
+
+### reconcile_with_external_state (Nextcloud Pattern)
+
+The reusable function in `sync.rs` compares external triggers with DB state:
+- Triggers missing externally: sets error "Trigger no longer exists on external service"
+- Triggers present externally: clears errors, updates service_config if it differs
+
+Usage in `maintain_triggers()`:
+```rust
+let external_pairs: Vec<(String, serde_json::Value)> = /* fetch from external */;
+crate::sync::reconcile_with_external_state(
+    db, workspace_id, Self::SERVICE_NAME, triggers, &external_pairs, synced, errors,
+).await;
+```
+
+### Webhook Payload Processing
+
+Override `prepare_webhook()` to parse service-specific payloads into script/flow args:
+
+```rust
+async fn prepare_webhook(&self, db, w_id, headers, body, script_path, is_flow) -> Result<PushArgsOwned> {
+    let mut args = HashMap::new();
+    args.insert("event_type".to_string(), Box::new(headers.get("x-event-type").cloned()) as _);
+    args.insert("payload".to_string(), Box::new(serde_json::from_str::<serde_json::Value>(&body)?) as _);
+    Ok(PushArgsOwned { extra: None, args })
+}
+```
+
+Then register in `prepare_native_trigger_args()` in `lib.rs`:
+```rust
+pub async fn prepare_native_trigger_args(service_name, db, w_id, headers, body) -> Result<Option<PushArgsOwned>> {
+    match service_name {
+        ServiceName::Google => { /* ... */ Ok(Some(args)) }
+        ServiceName::NewService => { /* ... */ Ok(Some(args)) }
+        ServiceName::Nextcloud => Ok(None), // Uses default body parsing
+    }
+}
+```
+
+### Instance-Level OAuth Credentials
+
+When `workspace_integrations.oauth_data.instance_shared == true`, `decrypt_oauth_data()` reads `client_id` and `client_secret` from instance-level global settings instead of workspace-level. This allows admins to share OAuth app credentials across workspaces.
+
+The frontend handles this via the `generate_instance_connect_url` endpoint in `workspace_integrations.rs`.
+
+---
+
+## Testing Checklist
+
+- [ ] Database migration runs successfully
+- [ ] `cargo check -p windmill-native-triggers --features native_trigger` passes
+- [ ] `npx svelte-check --threshold error` passes (in frontend/)
+- [ ] Service appears in workspace integrations list
+- [ ] OAuth flow completes successfully
+- [ ] Can create a new trigger
+- [ ] Can view trigger details
+- [ ] Can update trigger configuration
+- [ ] Can delete trigger
+- [ ] Webhook receives and processes payloads
+- [ ] Background sync works correctly (reconciliation or channel renewal)
+- [ ] Error handling works (expired tokens, service unavailable)
+
+---
+
+## Reference Implementations
+
+### Nextcloud (Self-Hosted, Update+Get Pattern)
+
+| File | Purpose |
+|------|---------|
+| `nextcloud/mod.rs` | Types: NextCloudOAuthData, NextcloudServiceConfig, NextCloudTriggerData |
+| `nextcloud/external.rs` | External trait: uses update+get pattern, reconcile_with_external_state for sync |
+| `nextcloud/routes.rs` | Additional route: `GET /events` |
+
+Key patterns: relative OAuth endpoints, base_url required, list_all + reconcile for sync, update returns JSON from get().
+
+### Google (Cloud, Unified Service, Short Create)
+
+| File | Purpose |
+|------|---------|
+| `google/mod.rs` | Types: GoogleServiceConfig with trigger_type discriminator, GoogleTriggerType enum |
+| `google/external.rs` | External trait: overrides service_config_from_create_response, channel renewal for sync |
+| `google/routes.rs` | Additional routes: `GET /calendars`, `GET /drive/files`, `GET /drive/shared_drives` |
+
+Key patterns: absolute OAuth endpoints, empty base_url, trigger_type for Drive/Calendar, expiring watch channels with renewal, service_config_from_create_response skips update+get, get() reconstructs data from stored service_config (no external "get channel" API).
--- a/.agents/skills/pr/SKILL.md
+++ b/.agents/skills/pr/SKILL.md
@@ -0,0 +1,109 @@
+---
+name: pr
+description: Open a draft pull request on GitHub. MUST use when you want to create/open a PR.
+---
+
+# Pull Request Skill
+
+Create a draft pull request with a clear title and explicit description of changes.
+
+## Instructions
+
+1. **Analyze branch changes**: Understand all commits since diverging from main
+2. **Push to remote**: Ensure all commits are pushed
+3. **Create draft PR**: Always open as draft for review before merging
+
+## PR Title Format
+
+Follow conventional commit format for the PR title:
+```
+<type>: <description>
+```
+
+### Types
+- `feat`: New feature or capability
+- `fix`: Bug fix
+- `refactor`: Code restructuring
+- `docs`: Documentation changes
+- `chore`: Maintenance tasks
+- `perf`: Performance improvements
+
+### Title Rules
+- Keep under 70 characters
+- Use lowercase, imperative mood
+- No period at the end
+- If `*_ee.rs` files were modified, prefix with `[ee]`: `[ee] <type>: <description>`
+
+## PR Body Format
+
+The body MUST be explicit about what changed. Structure:
+
+```markdown
+## Summary
+<Clear description of what this PR does and why>
+
+## Changes
+- <Specific change 1>
+- <Specific change 2>
+- <Specific change 3>
+
+## Test plan
+- [ ] <How to verify change 1>
+- [ ] <How to verify change 2>
+
+---
+Generated with [Claude Code](https://claude.com/claude-code)
+```
+
+## Execution Steps
+
+1. Run `git status` to check for uncommitted changes
+2. Run `git log main..HEAD --oneline` to see all commits in this branch
+3. Run `git diff main...HEAD` to see the full diff against main
+4. Check if remote branch exists and is up to date:
+   ```bash
+   git rev-parse --abbrev-ref --symbolic-full-name @{u} 2>/dev/null || echo "no upstream"
+   ```
+5. Push to remote if needed: `git push -u origin HEAD`
+6. Create draft PR using gh CLI:
+   ```bash
+   gh pr create --draft --title "<type>: <description>" --body "$(cat <<'EOF'
+   ## Summary
+   <description>
+
+   ## Changes
+   - <change 1>
+   - <change 2>
+
+   ## Test plan
+   - [ ] <test 1>
+   - [ ] <test 2>
+
+   ---
+   Generated with [Claude Code](https://claude.com/claude-code)
+   EOF
+   )"
+   ```
+7. Return the PR URL to the user
+
+## EE Companion PR (when `*_ee.rs` files were modified)
+
+The `*_ee.rs` files in the windmill repo are **symlinks** to `windmill-ee-private` — changes won't appear in `git diff` of the windmill repo. Instead, check the EE repo for uncommitted or unpushed changes.
+
+Follow the full EE PR workflow in `docs/enterprise.md`. The key PR-specific details:
+
+1. Find the EE repo/worktree: see "Finding the EE Repo" in `docs/enterprise.md`
+2. Check for changes: `git -C <ee-path> status --short`
+   - If there are no changes in the EE repo, skip this entire section
+3. Follow steps 1–5 from the "EE PR Workflow" in `docs/enterprise.md`
+4. Create the companion PR (title does NOT get the `[ee]` prefix):
+   ```bash
+   gh pr create --draft --repo windmill-labs/windmill-ee-private --title "<type>: <description>" --body "$(cat <<'EOF'
+   Companion PR for windmill-labs/windmill#<PR_NUMBER>
+
+   ---
+   Generated with [Claude Code](https://claude.com/claude-code)
+   EOF
+   )"
+   ```
+5. Commit `ee-repo-ref.txt` and push the updated windmill branch
--- a/.agents/skills/refine/SKILL.md
+++ b/.agents/skills/refine/SKILL.md
@@ -0,0 +1,38 @@
+---
+name: refine
+description: End-of-session reflection. Reviews friction encountered during the session and proposes updates to docs/ to capture lessons learned.
+---
+
+# Refine Skill
+
+Reflect on the current session and update documentation with lessons learned.
+
+## Instructions
+
+1. **Identify friction**: Review what happened in this session:
+   - Run `git diff main...HEAD --stat` to see what files were touched
+   - Think about: what was slow, what failed, what required multiple attempts, what information was missing or hard to find
+
+2. **Read current docs**: Read the docs that were relevant to this session:
+   - `docs/validation.md`
+   - `docs/enterprise.md`
+   - `docs/autonomous-mode.md`
+   - Any skills that were invoked
+
+3. **Propose updates**: For each piece of friction, decide if it warrants a doc update:
+   - **Missing knowledge**: Information you had to discover that should be documented
+   - **Wrong guidance**: Instructions that led you astray
+   - **Missing validation rule**: A check that should be in the validation matrix
+   - **New pattern**: A codebase pattern worth capturing for next time
+
+4. **Apply updates**: Edit the relevant `docs/` files. Keep changes minimal and specific — add only what would have saved time this session.
+
+5. **Report**: Summarize what was added/changed and why.
+
+## Rules
+
+- Only add knowledge confirmed by this session — no speculative additions
+- Keep docs concise — add a line or two, not a paragraph
+- If a whole new doc is needed, create it in `docs/` and add a pointer in `CLAUDE.md`
+- Don't update skills unless a coding pattern was genuinely wrong
+- Don't add things Claude already knows — only Windmill-specific knowledge
--- a/.agents/skills/rust-backend/SKILL.md
+++ b/.agents/skills/rust-backend/SKILL.md
@@ -0,0 +1,107 @@
+---
+name: rust-backend
+description: Rust coding guidelines for the Windmill backend. MUST use when writing or modifying Rust code in the backend directory.
+---
+
+# Windmill Rust Patterns
+
+Apply these Windmill-specific patterns when writing Rust code in `backend/`.
+
+## Error Handling
+
+Use `Error` from `windmill_common::error`. Return `Result<T, Error>` or `JsonResult<T>`:
+
+```rust
+use windmill_common::error::{Error, Result};
+
+pub async fn get_job(db: &DB, id: Uuid) -> Result<Job> {
+    sqlx::query_as!(Job, "SELECT id, workspace_id FROM v2_job WHERE id = $1", id)
+        .fetch_optional(db)
+        .await?
+        .ok_or_else(|| Error::NotFound("job not found".to_string()))
+}
+```
+
+Never panic in library code. Reserve `.unwrap()` for compile-time guarantees.
+
+## SQLx Patterns
+
+**Never use `SELECT *`** — always list columns explicitly. Critical for backwards compatibility when workers lag behind API version:
+
+```rust
+// Correct
+sqlx::query_as!(Job, "SELECT id, workspace_id, path FROM v2_job WHERE id = $1", id)
+
+// Wrong — breaks when columns are added
+sqlx::query_as!(Job, "SELECT * FROM v2_job WHERE id = $1", id)
+```
+
+Use batch operations to avoid N+1:
+
+```rust
+// Preferred — single query using `= ANY($1)` with an array parameter (Postgres equivalent of an IN clause)
+sqlx::query!("SELECT ... WHERE id = ANY($1)", &ids[..]).fetch_all(db).await?
+```
+
+Use transactions for multi-step operations. Parameterize all queries.
+
+## JSON Handling
+
+Prefer `Box<serde_json::value::RawValue>` over `serde_json::Value` when storing/passing JSON without inspection:
+
+```rust
+pub struct Job {
+    pub args: Option<Box<serde_json::value::RawValue>>,
+}
+```
+
+Only use `serde_json::Value` when you need to inspect or modify the JSON.
+
+## Serde Optimizations
+
+```rust
+#[derive(Serialize, Deserialize)]
+pub struct Job {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub parent_job: Option<Uuid>,
+    #[serde(skip_serializing_if = "Vec::is_empty")]
+    pub tags: Vec<String>,
+    #[serde(default)]
+    pub priority: i32,
+}
+```
+
+## Async & Concurrency
+
+Never block the async runtime. Use `spawn_blocking` for CPU-intensive work:
+
+```rust
+let result = tokio::task::spawn_blocking(move || expensive_computation(&data)).await?;
+```
+
+**Mutex selection**: Prefer `std::sync::Mutex` (or `parking_lot::Mutex`) for data protection. Only use `tokio::sync::Mutex` when holding locks across `.await` points.
+
+Use `tokio::sync::mpsc` (bounded) for channels. Avoid `std::thread::sleep` in async contexts.
+
+## Module Structure & Visibility
+
+- Use `pub(crate)` instead of `pub` when possible
+- Place new code in the appropriate crate based on functionality
+- API endpoints go in `windmill-api/src/` organized by domain
+- Shared functionality goes in `windmill-common/src/`
+
+## Code Navigation
+
+Always use rust-analyzer LSP for go-to-definition, find-references, and type info. Do not guess at module paths.
+
+## Axum Handlers
+
+Destructure extractors directly in function signatures:
+
+```rust
+async fn process_job(
+    Extension(db): Extension<DB>,
+    Path((workspace, job_id)): Path<(String, Uuid)>,
+    Query(pagination): Query<Pagination>,
+) -> Result<Json<Job>> { ... }
+```
--- a/.agents/skills/svelte-frontend/SKILL.md
+++ b/.agents/skills/svelte-frontend/SKILL.md
@@ -0,0 +1,80 @@
+---
+name: svelte-frontend
+description: Svelte coding guidelines for the Windmill frontend. MUST use when writing or modifying code in the frontend directory.
+---
+
+# Windmill Svelte Patterns
+
+Apply these Windmill-specific patterns when writing Svelte code in `frontend/`. For general Svelte 5 syntax (runes, snippets, event handling), use the Svelte MCP server.
+
+## Windmill UI Components (MUST use)
+
+Always use Windmill's design-system components. Never use a raw HTML element (`<button>`, `<input>`, `<select>`, inline `<svg>`) where one of the components below exists.
+
+### Buttons — `<Button>`
+
+```svelte
+<script>
+  import { Button } from '$lib/components/common'
+  import { ChevronLeft } from 'lucide-svelte'
+</script>
+
+<Button variant="default" onclick={handleClick}>Label</Button>
+<Button startIcon={{ icon: ChevronLeft }} iconOnly onclick={prev} />
+```
+
+Props: `variant?: 'accent' | 'accent-secondary' | 'default' | 'subtle'`, `unifiedSize?: 'sm' | 'md' | 'lg'`, `startIcon?: { icon: SvelteComponent }`, `iconOnly?: boolean`, `disabled?: boolean`
+
+### Text inputs — `<TextInput>`
+
+```svelte
+<script>
+  import { TextInput } from '$lib/components/common'
+</script>
+
+<TextInput bind:value={val} placeholder="Enter value" />
+```
+
+Props: `value?: string | number` (bindable), `placeholder?: string`, `disabled?: boolean`, `error?: string | boolean`, `size?: 'sm' | 'md' | 'lg'`
+
+### Selects — `<Select>`
+
+```svelte
+<script>
+  import Select from '$lib/components/select/Select.svelte'
+</script>
+
+<Select items={[{ label: 'Jan', value: 1 }]} bind:value={selected} />
+```
+
+Props: `items?: Array<{ label?: string; value: any }>`, `value` (bindable), `placeholder?: string`, `clearable?: boolean`, `size?: 'sm' | 'md' | 'lg'`
+
+### Icons — `lucide-svelte`
+
+Never write inline SVGs. Import from `lucide-svelte`:
+
+```svelte
+<script>
+  import { ChevronLeft, X } from 'lucide-svelte'
+</script>
+<ChevronLeft size={16} />
+```
+
+## Form Components
+
+Form components (TextInput, Toggle, Select, etc.) should use the unified size system when placed together.
+
+## Styling
+
+- Use Tailwind CSS for all styling — no custom CSS
+- Use Windmill's theming classes for colors/surfaces (see `frontend/brand-guidelines.md`)
+- Read component props JSDoc before using them
+
+## Svelte MCP Server
+
+Use the Svelte MCP tools when working on Svelte code:
+
+1. **list-sections**: Call first to discover available docs
+2. **get-documentation**: Fetch relevant sections based on use_cases
+3. **svelte-autofixer**: MUST use on all Svelte code before finalizing — keep calling until no issues
+4. **playground-link**: Only after user confirms and code was NOT written to project files
--- a/.claude/hooks/format-backend.sh
+++ b/.claude/hooks/format-backend.sh
@@ -13,8 +13,10 @@ fi
 # Check if the file is in the backend directory and is a Rust file
 if [[ "$FILE_PATH" == *"/backend/"* ]] && [[ "$FILE_PATH" =~ \.rs$ ]]; then
    cd "$CLAUDE_PROJECT_DIR/backend" || exit 0
-    # Run rustfmt with config from rustfmt.toml (edition=2021)
-    rustfmt --config-path rustfmt.toml "$FILE_PATH" 2>/dev/null || true
+    # Run rustfmt, surface errors as context but don't block Claude
+    if rustfmt --config-path rustfmt.toml "$FILE_PATH" 2>&1; then
+        echo "Formatted $(basename "$FILE_PATH")"
+    fi
 fi

 exit 0
--- a/.claude/hooks/format-frontend.sh
+++ b/.claude/hooks/format-frontend.sh
@@ -15,8 +15,10 @@ if [[ "$FILE_PATH" == *"/frontend/"* ]]; then
    # Check if it's a formattable file type
    if [[ "$FILE_PATH" =~ \.(ts|js|svelte|json|css|html|md)$ ]]; then
        cd "$CLAUDE_PROJECT_DIR/frontend" || exit 0
-        # Run prettier silently, don't fail the hook if prettier fails
-        npx prettier --write "$FILE_PATH" 2>/dev/null || true
+        # Run prettier, surface errors as context but don't block Claude
+        if ./node_modules/.bin/prettier --plugin prettier-plugin-svelte --write "$FILE_PATH" 2>&1; then
+            echo "Formatted $(basename "$FILE_PATH")"
+        fi
    fi
 fi

--- a/.claude/hooks/guard-main-branch.sh
+++ b/.claude/hooks/guard-main-branch.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# PreToolUse hook: block destructive git operations when on the main branch.
+# Non-git tool calls and read-only git commands pass through silently (exit 0).
+
+set -euo pipefail
+
+input="$(cat)"
+tool_name="$(echo "$input" | jq -r '.tool_name // empty')"
+
+# Only care about Bash tool calls
+[[ "$tool_name" == "Bash" ]] || exit 0
+
+command="$(echo "$input" | jq -r '.tool_input.command // empty')"
+
+# Only care about git write commands (`git switch` is not matched, so it can still leave main)
+[[ "$command" =~ ^git\ (push|reset|revert|checkout|merge|rebase|commit|add) ]] || exit 0
+branch="$(git rev-parse --abbrev-ref HEAD 2>/dev/null || true)"
+[[ "$branch" == "main" ]] || exit 0
+
+# Exit code 2 is what actually blocks the tool call; the reason must go to stderr.
+echo "BLOCK: You are on the main branch. Create or switch to a feature branch first." >&2
+exit 2
--- a/.claude/hooks/resolve-symlinks.sh
+++ b/.claude/hooks/resolve-symlinks.sh
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-# Resolve _ee.rs symlinks to actual files so Claude can read them
-# This script runs before each user prompt is processed
-
-set -e
-
-PROJECT_DIR="${CLAUDE_PROJECT_DIR:-/home/farhad/windmill}"
-MANIFEST_FILE="$PROJECT_DIR/.claude/hooks/.symlink-manifest"
-
-# Find all _ee.rs symlinks and store their targets
-find "$PROJECT_DIR" -name "*_ee.rs" -type l 2>/dev/null | while read -r symlink; do
-    target=$(readlink -f "$symlink" 2>/dev/null) || continue
-
-    # Only process if target file exists
-    if [[ -f "$target" ]]; then
-        # Store symlink path and target in manifest
-        echo "$symlink|$target" >> "$MANIFEST_FILE.tmp"
-
-        # Replace symlink with actual file content
-        rm "$symlink"
-        cp "$target" "$symlink"
-    fi
-done
-
-# Atomically replace manifest
-if [[ -f "$MANIFEST_FILE.tmp" ]]; then
-    mv "$MANIFEST_FILE.tmp" "$MANIFEST_FILE"
-fi
-
-exit 0
--- a/.claude/hooks/restore-symlinks.sh
+++ b/.claude/hooks/restore-symlinks.sh
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-# Restore _ee.rs symlinks after Claude finishes processing
-# This script runs when Claude stops
-# IMPORTANT: Copies any modifications back to the target before restoring symlinks
-
-set -e
-
-PROJECT_DIR="${CLAUDE_PROJECT_DIR:-/home/farhad/windmill}"
-MANIFEST_FILE="$PROJECT_DIR/.claude/hooks/.symlink-manifest"
-
-# Check if manifest exists
-if [[ ! -f "$MANIFEST_FILE" ]]; then
-    exit 0
-fi
-
-# Read manifest and restore symlinks
-while IFS='|' read -r symlink target; do
-    if [[ -n "$symlink" && -n "$target" ]]; then
-        # If the file exists (not a symlink) and target exists, copy changes back
-        if [[ -f "$symlink" && ! -L "$symlink" && -e "$target" ]]; then
-            # Copy the potentially modified file back to the target
-            cp "$symlink" "$target"
-        fi
-
-        # Remove the regular file (which was a copy)
-        rm -f "$symlink" 2>/dev/null || true
-
-        # Recreate the symlink
-        ln -s "$target" "$symlink" 2>/dev/null || true
-    fi
-done < "$MANIFEST_FILE"
-
-# Clean up manifest
-rm -f "$MANIFEST_FILE"
-
-exit 0
--- a/.claude/review-prompt.md
+++ b/.claude/review-prompt.md
@@ -0,0 +1,25 @@
+# Code Review Instructions
+
+Review this pull request and provide comprehensive feedback.
+
+## Focus Areas
+
+- **Code quality and best practices** — does the code follow established patterns?
+- **Potential bugs or issues** — will this code work correctly in all cases?
+- **Performance considerations** — are there unnecessary allocations, N+1 queries, or bottlenecks?
+- **Security implications** — injection, auth bypass, data exposure?
+
+## CLAUDE.md Compliance
+
+Read all relevant CLAUDE.md files (root and in directories containing changed files). Check each rule against the changed code. Quote the exact rule when flagging a violation.
+
+## Review Guidelines
+
+- Provide detailed feedback using inline comments for specific issues
+- Use top-level comments for general observations or praise
+- Only flag issues introduced by this PR, not pre-existing problems
+- Self-validate each finding: "Is this definitely a real issue?" If uncertain, discard it
+
+## Testing Instructions
+
+At the end of your review, add complete instructions to reproduce the added changes through the app interface. These instructions will be given to a tester so they can verify the changes. Write them as a short descriptive text (not a step-by-step or a list) explaining how to navigate the app (what page, what action, what input, etc.) to see the changes.
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -1,5 +1,8 @@
 {
  "permissions": {
+    "additionalDirectories": [
+      "../windmill-ee-private"
+    ],
    "allow": [
      "Bash(ls:*)",
      "Bash(grep:*)",
@@ -25,9 +28,23 @@
      "Bash(git show:*)",
      "Bash(git blame:*)",
      "Bash(cargo check:*)",
+      "Bash(cargo build --release:*)",
+      "Bash(sh wm-ts-nav/nav:*)",
+      "Bash(wm-ts-nav/nav:*)",
+      "Bash(./wm-ts-nav/nav:*)",
+      "Bash(wm-ts-nav/target/release/wm-ts-nav:*)",
+      "Bash(./wm-ts-nav/target/release/wm-ts-nav:*)",
      "mcp__ide__getDiagnostics",
      "Bash(npm run generate-backend-client:*)",
-      "Bash(npm run check:*)"
+      "Bash(npm run check:*)",
+      "Bash(git push:*)",
+      "Bash(git reset:*)",
+      "Bash(git revert:*)",
+      "Bash(git checkout:*)",
+      "Bash(git merge:*)",
+      "Bash(git rebase:*)",
+      "Bash(git add:*)",
+      "Bash(git commit:*)"
    ],
    "deny": [
      "Read(.env)",
@@ -52,46 +69,19 @@
      "Bash(chown:*)",
      "Bash(truncate:*)",
      "Bash(shred:*)",
-      "Bash(unlink:*)",
-      "Bash(git push:*)",
-      "Bash(git reset:*)",
-      "Bash(git revert:*)",
-      "Bash(git checkout:*)",
-      "Bash(git merge:*)",
-      "Bash(git rebase:*)"
+      "Bash(unlink:*)"
    ]
  },
  "enableAllProjectMcpServers": true,
  "hooks": {
-    "UserPromptSubmit": [
+    "PreToolUse": [
      {
+        "matcher": "Bash",
        "hooks": [
          {
            "type": "command",
-            "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/resolve-symlinks.sh",
-            "timeout": 30
-          }
-        ]
-      }
-    ],
-    "Stop": [
-      {
-        "hooks": [
-          {
-            "type": "command",
-            "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/restore-symlinks.sh",
-            "timeout": 30
-          }
-        ]
-      }
-    ],
-    "SessionEnd": [
-      {
-        "hooks": [
-          {
-            "type": "command",
-            "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/restore-symlinks.sh",
-            "timeout": 30
+            "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/guard-main-branch.sh",
+            "timeout": 5
          }
        ]
      }
@@ -126,8 +116,7 @@
    ]
  },
  "enabledPlugins": {
-    "rust-analyzer-lsp@claude-plugins-official": true,
    "typescript-lsp@claude-plugins-official": true,
    "code-review@claude-plugins-official": true
  }
-}
+}
--- a/.claude/skills/local-review/SKILL.md
+++ b/.claude/skills/local-review/SKILL.md
@@ -0,0 +1,69 @@
+---
+name: local-review
+user_invocable: true
+description: Code review a pull request for bugs and CLAUDE.md compliance. MUST use when asked to review code.
+---
+
+# Local Code Review Skill
+
+Run the same review locally that the GitHub Claude Auto Review action runs on PRs. The shared review instructions live in `.claude/review-prompt.md` — read that file first and follow its instructions.
+
+## Execution Steps
+
+1. **Read `.claude/review-prompt.md`** for the review criteria and focus areas
+
+2. **Determine the PR scope**:
+   - If an argument is provided, use it as the PR number or branch
+   - Otherwise, detect from the current branch vs main
+   - Run `gh pr view` if a PR exists, or use `git diff main...HEAD`
+
+3. **Get the diff and metadata**:
+   - `gh pr diff` or `git diff main...HEAD` for the full diff
+   - `gh pr view` or `git log main..HEAD --oneline` for context
+
+4. **Read changed files** where the diff alone is insufficient to understand context
+
+5. **Apply the review instructions from `.claude/review-prompt.md`**
+
+6. **Self-validate each finding**: Before reporting, ask yourself:
+   - "Is this definitely a real issue, not a false positive?"
+   - "Would a senior engineer flag this in review?"
+   - If the answer to either is no, discard the finding
+
+7. **Output findings** to the terminal (default) or post as PR comments (with `--comment` flag)
+
+## Output Format
+
+```
+## Code review
+
+Found N issues:
+
+1. <description> (<reason: CLAUDE.md adherence | bug | security>)
+   <file_path:line_number>
+
+2. <description> (<reason>)
+   <file_path:line_number>
+```
+
+If no issues are found:
+
+```
+## Code review
+
+No issues found. Checked for bugs and CLAUDE.md compliance.
+```
+
+## Posting Comments (--comment flag)
+
+If the user passes `--comment`, post the findings as a single top-level PR review comment using:
+
+```bash
+gh pr review --comment --body "<summary>"
+```
+
+Or for inline comments on specific lines:
+
+```bash
+gh api repos/{owner}/{repo}/pulls/<pr_number>/reviews -f event="COMMENT" -f body="<summary>" -f 'comments[][path]=<file_path>' -F 'comments[][line]=<line_number>' -f 'comments[][body]=<finding>'
+```
--- a/.claude/skills/native-trigger/SKILL.md
+++ b/.claude/skills/native-trigger/SKILL.md
@@ -1,3 +1,8 @@
+---
+name: native-trigger
+description: Guidance for adding native trigger services to Windmill. Use when implementing or modifying native trigger integrations across the backend and frontend.
+---
+
 # Skill: Adding Native Trigger Services

 This skill provides comprehensive guidance for adding new native trigger services to Windmill. Native triggers allow external services (like Nextcloud, Google Drive, etc.) to trigger Windmill scripts/flows via webhooks or push notifications.
@@ -581,7 +586,28 @@ In `frontend/src/lib/components/graph/renderers/triggers/TriggersBadge.svelte`:
 2. Add to `baseConfig` with `countKey` (the dynamic `availableNativeServices` loop does NOT set `countKey`)
 3. Add to the `allTypes` array

-### Step 14: Update OpenAPI Spec and Regenerate Types
+### Step 14: Update TriggersWrapper.svelte
+
+In `frontend/src/lib/components/triggers/TriggersWrapper.svelte`:
+
+Add a `{:else if selectedTrigger.type === 'yourservice'}` case that renders `<NativeTriggersPanel service="yourservice" ...>` with the same props pattern as the existing native trigger cases (e.g., `nextcloud`).
+
+### Step 15: Update AddTriggersButton.svelte
+
+In `frontend/src/lib/components/triggers/AddTriggersButton.svelte`:
+
+1. Add `yourserviceAvailable` state variable
+2. Add `setYourserviceState()` async function using `isServiceAvailable('yourservice', $workspaceStore!)`
+3. Call it at module level
+4. Add a dropdown entry to `addTriggerItems` with `hidden: !yourserviceAvailable`
+
+### Step 16: Update TriggersEditor.svelte Delete Handling
+
+In `frontend/src/lib/components/triggers/TriggersEditor.svelte`:
+
+Add your service to the `nativeTriggerServices` map in `deleteDeployedTrigger()`. Native triggers use `NativeTriggerService.deleteNativeTrigger({ workspace, serviceName, externalId })` instead of the standard `path`-based delete.
+
+### Step 17: Update OpenAPI Spec and Regenerate Types

 Add to `JobTriggerKind` enum in `backend/windmill-api/openapi.yaml`, then:

--- a/.claude/skills/pr/SKILL.md
+++ b/.claude/skills/pr/SKILL.md
@@ -33,6 +33,7 @@ Follow conventional commit format for the PR title:
 - Keep under 70 characters
 - Use lowercase, imperative mood
 - No period at the end
+- If `*_ee.rs` files were modified, prefix with `[ee]`: `[ee] <type>: <description>`

 ## PR Body Format

@@ -60,12 +61,13 @@ Generated with [Claude Code](https://claude.com/claude-code)
 1. Run `git status` to check for uncommitted changes
 2. Run `git log main..HEAD --oneline` to see all commits in this branch
 3. Run `git diff main...HEAD` to see the full diff against main
-4. Check if remote branch exists and is up to date:
+4. **Run `/local-review`** before creating the PR. If issues are found, fix them and commit before proceeding. Do not skip this step.
+5. Check if remote branch exists and is up to date:
   ```bash
   git rev-parse --abbrev-ref --symbolic-full-name @{u} 2>/dev/null || echo "no upstream"
   ```
-5. Push to remote if needed: `git push -u origin HEAD`
-6. Create draft PR using gh CLI:
+6. Push to remote if needed: `git push -u origin HEAD`
+7. Create draft PR using gh CLI:
   ```bash
   gh pr create --draft --title "<type>: <description>" --body "$(cat <<'EOF'
   ## Summary
@@ -84,4 +86,26 @@ Generated with [Claude Code](https://claude.com/claude-code)
   EOF
   )"
   ```
-7. Return the PR URL to the user
+8. Return the PR URL to the user
+
+## EE Companion PR (when `*_ee.rs` files were modified)
+
+The `*_ee.rs` files in the windmill repo are **symlinks** to `windmill-ee-private` — changes won't appear in `git diff` of the windmill repo. Instead, check the EE repo for uncommitted or unpushed changes.
+
+Follow the full EE PR workflow in `docs/enterprise.md`. The key PR-specific details:
+
+1. Find the EE repo/worktree: see "Finding the EE Repo" in `docs/enterprise.md`
+2. Check for changes: `git -C <ee-path> status --short`
+   - If there are no changes in the EE repo, skip this entire section
+3. Follow steps 1–5 from the "EE PR Workflow" in `docs/enterprise.md`
+4. Create the companion PR (title does NOT get the `[ee]` prefix):
+   ```bash
+   gh pr create --draft --repo windmill-labs/windmill-ee-private --title "<type>: <description>" --body "$(cat <<'EOF'
+   Companion PR for windmill-labs/windmill#<PR_NUMBER>
+
+   ---
+   Generated with [Claude Code](https://claude.com/claude-code)
+   EOF
+   )"
+   ```
+5. Commit `ee-repo-ref.txt` and push the updated windmill branch
--- a/.claude/skills/refine/SKILL.md
+++ b/.claude/skills/refine/SKILL.md
@@ -0,0 +1,39 @@
+---
+name: refine
+user_invocable: true
+description: End-of-session reflection. Reviews friction encountered during the session and proposes updates to docs/ to capture lessons learned.
+---
+
+# Refine Skill
+
+Reflect on the current session and update documentation with lessons learned.
+
+## Instructions
+
+1. **Identify friction**: Review what happened in this session:
+   - Run `git diff main...HEAD --stat` to see what files were touched
+   - Think about: what was slow, what failed, what required multiple attempts, what information was missing or hard to find
+
+2. **Read current docs**: Read the docs that were relevant to this session:
+   - `docs/validation.md`
+   - `docs/enterprise.md`
+   - `docs/autonomous-mode.md`
+   - Any skills that were invoked
+
+3. **Propose updates**: For each piece of friction, decide if it warrants a doc update:
+   - **Missing knowledge**: Information you had to discover that should be documented
+   - **Wrong guidance**: Instructions that led you astray
+   - **Missing validation rule**: A check that should be in the validation matrix
+   - **New pattern**: A codebase pattern worth capturing for next time
+
+4. **Apply updates**: Edit the relevant `docs/` files. Keep changes minimal and specific — add only what would have saved time this session.
+
+5. **Report**: Summarize what was added/changed and why.
+
+## Rules
+
+- Only add knowledge confirmed by this session — no speculative additions
+- Keep docs concise — add a line or two, not a paragraph
+- If a whole new doc is needed, create it in `docs/` and add a pointer in `CLAUDE.md`
+- Don't update skills unless a coding pattern was genuinely wrong
+- Don't add things Claude already knows — only Windmill-specific knowledge
--- a/.claude/skills/rust-backend/SKILL.md
+++ b/.claude/skills/rust-backend/SKILL.md
@@ -3,493 +3,105 @@ name: rust-backend
 description: Rust coding guidelines for the Windmill backend. MUST use when writing or modifying Rust code in the backend directory.
 ---

-# Rust Backend Coding Guidelines
+# Windmill Rust Patterns

-Apply these patterns when writing or modifying Rust code in the `backend/` directory.
-
-## Data Structure Design
-
-Choose between `struct`, `enum`, or `newtype` based on domain needs:
-
- Use `enum` for state machines instead of boolean flags or loosely related fields
- Model invariants explicitly using types (e.g., `NonZeroU32`, `Duration`, custom enums)
- Consider ownership of each field:
-  - Use `&str` vs `String`, slices vs vectors
-  - Use `Arc<T>` when sharing across threads
-  - Use `Cow<'a, T>` for flexible ownership
-
-```rust
-// State machine with enum
-enum JobState {
-    Pending { scheduled_for: DateTime<Utc> },
-    Running { started_at: DateTime<Utc>, worker: String },
-    Completed { result: JobResult, duration_ms: i64 },
-    Failed { error: String, retries: u32 },
-}
-
-// Avoid multiple booleans
-struct Job {
-    is_pending: bool,   // Don't do this
-    is_running: bool,
-    is_completed: bool,
-}
-```
-
-## Impl Block Organization
-
-Place `impl` blocks immediately below the struct/enum they modify. Group methods logically:
-
-```rust
-struct JobQueue {
-    jobs: Vec<Job>,
-    capacity: usize,
-}
-
-impl JobQueue {
-    // Constructors first
-    pub fn new(capacity: usize) -> Self { ... }
-    pub fn with_jobs(jobs: Vec<Job>) -> Self { ... }
-
-    // Getters
-    pub fn len(&self) -> usize { ... }
-    pub fn is_empty(&self) -> bool { ... }
-
-    // Mutation methods
-    pub fn push(&mut self, job: Job) -> Result<()> { ... }
-    pub fn pop(&mut self) -> Option<Job> { ... }
-
-    // Domain logic
-    pub fn next_scheduled(&self) -> Option<&Job> { ... }
-}
-```
-
-## Iterator Chains Over For-Loops
-
-Prefer functional iterator chains (`.filter().map().collect()`) over imperative for-loops:
-
-```rust
-// Preferred
-let results: Vec<_> = items
-    .iter()
-    .filter(|item| item.is_valid())
-    .map(|item| item.transform())
-    .collect();
-
-// Avoid
-let mut results = Vec::new();
-for item in items.iter() {
-    if item.is_valid() {
-        results.push(item.transform());
-    }
-}
-```
+Apply these Windmill-specific patterns when writing Rust code in `backend/`.

 ## Error Handling

-Use the `Error` type from `windmill_common::error`. Return `Result<T, Error>` or `JsonResult<T>` for fallible functions:
+Use `Error` from `windmill_common::error`. Return `Result<T, Error>` or `JsonResult<T>`:

 ```rust
 use windmill_common::error::{Error, Result};

-// Use ? operator for propagation
 pub async fn get_job(db: &DB, id: Uuid) -> Result<Job> {
-    let job = sqlx::query_as!(Job, "SELECT ... WHERE id = $1", id)
+    sqlx::query_as!(Job, "SELECT id, workspace_id FROM v2_job WHERE id = $1", id)
        .fetch_optional(db)
        .await?
-        .ok_or_else(|| Error::NotFound("job not found".to_string()))?;
-    Ok(job)
+        .ok_or_else(|| Error::NotFound("job not found".to_string()))
 }
 ```

-Prefer `if let` for optional handling. Use `let...else` when early return makes code clearer:
+Never panic in library code. Reserve `.unwrap()` for compile-time guarantees.
+
+## SQLx Patterns
+
+**Never use `SELECT *`** — always list columns explicitly. This is critical for backwards compatibility when workers lag behind the API server version:

 ```rust
-let Some(config) = get_config() else {
-    return Err(Error::MissingConfig);
-};
+// Correct
+sqlx::query_as!(Job, "SELECT id, workspace_id, path FROM v2_job WHERE id = $1", id)
+
+// Wrong — breaks when columns are added
+sqlx::query_as!(Job, "SELECT * FROM v2_job WHERE id = $1", id)
 ```

-Never panic in library code. Reserve `.unwrap()` for cases with compile-time guarantees. Keep functions short to help lifetime inference and clarity.
-
-## Early Returns
-
-Return early to avoid deep nesting. Handle error cases and edge conditions first:
+Use batch operations to avoid N+1:

 ```rust
-// Preferred - early returns
-fn process_job(job: Option<Job>) -> Result<Output> {
-    let Some(job) = job else {
-        return Ok(Output::default());
-    };
-
-    if !job.is_valid() {
-        return Err(Error::InvalidJob);
-    }
-
-    if job.is_cached() {
-        return Ok(job.cached_result());
-    }
-
-    // Main logic at the end, not nested
-    execute_job(job)
-}
-
-// Avoid - deep nesting
-fn process_job(job: Option<Job>) -> Result<Output> {
-    if let Some(job) = job {
-        if job.is_valid() {
-            if !job.is_cached() {
-                execute_job(job)
-            } else {
-                Ok(job.cached_result())
-            }
-        } else {
-            Err(Error::InvalidJob)
-        }
-    } else {
-        Ok(Output::default())
-    }
-}
+// Preferred — single query using = ANY($1) with an array parameter
+sqlx::query!("SELECT ... WHERE id = ANY($1)", &ids[..]).fetch_all(db).await?
 ```

-## Variable Shadowing
-
-Shadow variables instead of creating new names with prefixes:
-
-```rust
-// Preferred
-let data = fetch_raw_data();
-let data = parse(data);
-let data = validate(data)?;
-
-// Avoid
-let raw_data = fetch_raw_data();
-let parsed_data = parse(raw_data);
-let validated_data = validate(parsed_data)?;
-```
-
-## Minimal Comments
-
- No inline comments explaining obvious code
- No TODO/FIXME comments in committed code
- Doc comments (`///`) only on public items
- Let code be self-documenting through clear naming
-
-## Type Safety
-
-Use enums over boolean flags for clarity:
-
-```rust
-// Preferred
-enum JobStatus {
-    Pending,
-    Running,
-    Completed,
-}
-
-// Avoid
-struct Job {
-    is_running: bool,
-    is_completed: bool,
-}
-```
-
-## Pattern Matching
-
-Prefer explicit matching. Use wildcards strategically for fallback cases or ignored fields:
-
-```rust
-// Explicit matching preferred
-match status {
-    JobStatus::Pending => handle_pending(),
-    JobStatus::Running => handle_running(),
-    JobStatus::Completed => handle_completed(),
-}
-
-// Wildcards OK for fallback
-match result {
-    Ok(value) => process(value),
-    Err(_) => return default_value(),
-}
-
-// Wildcards OK for ignoring fields in destructuring
-let Point { x, y, .. } = point;
-```
-
-## Destructuring in Function Signatures
-
-Destructure structs directly in function parameters:
-
-```rust
-// Preferred
-async fn process_job(
-    Extension(db): Extension<DB>,
-    Path((workspace, job_id)): Path<(String, Uuid)>,
-    Query(pagination): Query<Pagination>,
-) -> Result<Json<Job>> {
-    // ...
-}
-
-// Avoid
-async fn process_job(
-    db_ext: Extension<DB>,
-    path: Path<(String, Uuid)>,
-    query: Query<Pagination>,
-) -> Result<Json<Job>> {
-    let Extension(db) = db_ext;
-    let Path((workspace, job_id)) = path;
-    // ...
-}
-```
-
-## Trait Implementations
-
-Use standard trait implementations to simplify conversions and reduce boilerplate:
-
-```rust
-// Implement From/Into for type conversions
-impl From<DbJob> for ApiJob {
-    fn from(db: DbJob) -> Self {
-        ApiJob {
-            id: db.id,
-            status: db.status.into(),
-        }
-    }
-}
-
-// Use TryFrom for fallible conversions
-impl TryFrom<String> for JobKind {
-    type Error = Error;
-    fn try_from(s: String) -> Result<Self, Self::Error> { ... }
-}
-```
-
-Apply `derive` macros to reduce boilerplate:
-
-```rust
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Job { ... }
-```
-
-## Module Structure
-
- Use `pub(crate)` instead of `pub` when possible; expose only what needs exposing
- Keep APIs small and expressive; avoid leaking internal types
- Organize code into modules reflecting ownership and domain boundaries
-
-```rust
-// Prefer restricted visibility
-pub(crate) fn internal_helper() { ... }
-
-// Only pub for external API
-pub fn create_job(...) -> Result<Job> { ... }
-```
-
-## Code Navigation
-
-Always use rust-analyzer LSP for:
- Go to definition
- Find references
- Type information
- Import resolution
-
-Do not guess at module paths or type definitions.
+Use transactions for multi-step operations. Parameterize all queries.

 ## JSON Handling

-Prefer `Box<serde_json::value::RawValue>` over `serde_json::Value` when:
- Storing JSON in the database (JSONB columns)
- Passing JSON through without modification
- The JSON structure doesn't need inspection
+Prefer `Box<serde_json::value::RawValue>` over `serde_json::Value` when storing/passing JSON without inspection:

 ```rust
-// Preferred - avoids parsing/serialization overhead
 pub struct Job {
-    pub id: Uuid,
    pub args: Option<Box<serde_json::value::RawValue>>,
 }
-
-// Only use Value when you need to inspect/modify JSON
-let value: serde_json::Value = serde_json::from_str(&json)?;
-if let Some(field) = value.get("field") {
-    // modify or inspect
-}
 ```

-## Serde Optimizations
+Only use `serde_json::Value` when you need to inspect or modify the JSON.

-Use serde attributes to optimize serialization:
+## Serde Optimizations

 ```rust
 #[derive(Serialize, Deserialize)]
 pub struct Job {
-    #[serde(rename = "jobId")]
-    pub id: Uuid,
-
-    #[serde(default)]
-    pub priority: i32,
-
    #[serde(skip_serializing_if = "Option::is_none")]
    pub parent_job: Option<Uuid>,
-
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
+    #[serde(default)]
+    pub priority: i32,
 }
 ```

-Prefer borrowing for zero-copy deserialization when lifetimes allow:
+## Async & Concurrency
+
+Never block the async runtime. Use `spawn_blocking` for CPU-intensive work or blocking I/O:

 ```rust
-#[derive(Deserialize)]
-pub struct JobInput<'a> {
-    #[serde(borrow)]
-    pub workspace_id: Cow<'a, str>,
-
-    #[serde(borrow)]
-    pub script_path: &'a str,
-}
+let result = tokio::task::spawn_blocking(move || expensive_computation(&data)).await?;
 ```

-## SQLx Patterns
+**Mutex selection**: Prefer `std::sync::Mutex` (or `parking_lot::Mutex`) for data protection. Only use `tokio::sync::Mutex` when holding locks across `.await` points.

-**Never use `SELECT *`** - always list columns explicitly. This is critical for backwards compatibility when workers run behind the API server version:
+Use `tokio::sync::mpsc` (bounded) for channels. Avoid `std::thread::sleep` in async contexts; use `tokio::time::sleep` instead.
+
+## Module Structure & Visibility
+
+- Use `pub(crate)` instead of `pub` when possible
+- Place new code in the appropriate crate based on functionality
+- API endpoints go in `windmill-api/src/` organized by domain
+- Shared functionality goes in `windmill-common/src/`
+
+## Code Navigation
+
+Always use rust-analyzer LSP for go-to-definition, find-references, and type info. Do not guess at module paths.
+
+## Axum Handlers
+
+Destructure extractors directly in function signatures:

 ```rust
-// Preferred - explicit columns
-sqlx::query_as!(
-    Job,
-    "SELECT id, workspace_id, path, created_at FROM v2_job WHERE id = $1",
-    job_id
-)
-
-// Avoid - breaks when columns are added
-sqlx::query_as!(Job, "SELECT * FROM v2_job WHERE id = $1", job_id)
+async fn process_job(
+    Extension(db): Extension<DB>,
+    Path((workspace, job_id)): Path<(String, Uuid)>,
+    Query(pagination): Query<Pagination>,
+) -> Result<Json<Job>> { ... }
 ```
-
-Use batch operations to minimize round trips:
-
-```rust
-// Preferred - single query with multiple values
-sqlx::query!(
-    "INSERT INTO job_logs (job_id, logs) VALUES ($1, $2), ($3, $4)",
-    id1, log1, id2, log2
-)
-
-// Avoid N+1 queries
-for id in ids {
-    sqlx::query!("SELECT ... WHERE id = $1", id).fetch_one(db).await?;
-}
-
-// Preferred - single query with IN clause
-sqlx::query!("SELECT ... WHERE id = ANY($1)", &ids[..]).fetch_all(db).await?
-```
-
-Use transactions for multi-step operations and parameterize all queries.
-
-## Async & Tokio Patterns
-
-Never block the async runtime. Use `spawn_blocking` for CPU-intensive or blocking I/O:
-
-```rust
-// Preferred - offload blocking work
-let result = tokio::task::spawn_blocking(move || {
-    expensive_computation(&data)
-}).await?;
-
-// Avoid - blocks the runtime
-let result = expensive_computation(&data);  // Don't do this in async
-```
-
-Use tokio primitives for sleep and channels:
-
-```rust
-use tokio::sync::mpsc;
-use tokio::time::sleep;
-
-// Avoid in async contexts
-use std::thread::sleep; // Blocks the runtime
-```
-
-Use bounded channels for backpressure:
-
-```rust
-// Preferred - bounded channel prevents overwhelming
-let (tx, rx) = tokio::sync::mpsc::channel(100);
-
-// Be careful with unbounded
-let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
-```
-
-## Mutex Selection in Async Code
-
-**Prefer `std::sync::Mutex` (or `parking_lot::Mutex`) over `tokio::sync::Mutex`** for protecting data in async code. The async mutex is more expensive and only needed when holding locks across `.await` points.
-
-```rust
-// Preferred for data protection - std mutex is faster
-use std::sync::Mutex;
-
-struct Cache {
-    data: Mutex<HashMap<String, Value>>,
-}
-
-impl Cache {
-    fn get(&self, key: &str) -> Option<Value> {
-        self.data.lock().unwrap().get(key).cloned()
-    }
-
-    fn insert(&self, key: String, value: Value) {
-        self.data.lock().unwrap().insert(key, value);
-    }
-}
-```
-
-**Use `tokio::sync::Mutex` only when you must hold the lock across `.await` points**, typically for IO resources like database connections:
-
-```rust
-use tokio::sync::Mutex;
-use std::sync::Arc;
-
-// Async mutex for IO resources held across await points
-let conn = Arc::new(Mutex::new(db_connection));
-
-async fn execute_query(conn: Arc<Mutex<DbConn>>, query: &str) {
-    let mut lock = conn.lock().await;
-    lock.execute(query).await;  // Lock held across .await
-}
-```
-
-**Common pattern**: Wrap `Arc<Mutex<...>>` in a struct with non-async methods that lock internally, keeping lock scope minimal:
-
-```rust
-struct SharedState {
-    inner: std::sync::Mutex<StateInner>,
-}
-
-impl SharedState {
-    fn update(&self, value: i32) {
-        self.inner.lock().unwrap().value = value;
-    }
-
-    fn get(&self) -> i32 {
-        self.inner.lock().unwrap().value
-    }
-}
-```
-
-**Alternative for IO resources**: Spawn a dedicated task to manage the resource and communicate via message passing:
-
-```rust
-let (tx, mut rx) = tokio::sync::mpsc::channel(32);
-
-tokio::spawn(async move {
-    while let Some(cmd) = rx.recv().await {
-        handle_io_command(&mut resource, cmd).await;
-    }
-});
-```
-
-## Build & Tooling
-
-Build speed tips:
- Use `cargo check` during rapid iteration over `cargo build`
- Minimize unnecessary dependencies and feature flags
--- a/.claude/skills/svelte-frontend/SKILL.md
+++ b/.claude/skills/svelte-frontend/SKILL.md
@@ -3,227 +3,78 @@ name: svelte-frontend
 description: Svelte coding guidelines for the Windmill frontend. MUST use when writing or modifying code in the frontend directory.
 ---

-# Svelte 5 Best Practices
+# Windmill Svelte Patterns

-This guide outlines best practices for developing with Svelte 5, incorporating the new Runes API and other modern Svelte features. These rules MUST NOT be applied on svelte 4 files unless explicitly asked to do so.
+Apply these Windmill-specific patterns when writing Svelte code in `frontend/`. For general Svelte 5 syntax (runes, snippets, event handling), use the Svelte MCP server.

-## Reactivity with Runes
+## Windmill UI Components (MUST use)

-Svelte 5 introduces Runes for more explicit and flexible reactivity.
+Always use Windmill's design-system components. Never use raw HTML elements (`<button>`, `<input>`, `<select>`) when an equivalent Windmill component exists.

-1.  **Embrace Runes for State Management**:
-    *   Use `$state` for reactive local component state.
-        ```svelte
-        <script>
-          let count = $state(0);
+### Buttons — `<Button>`

-          function increment() {
-            count += 1;
-          }
-        </script>
+```svelte
+<script>
+  import { Button } from '$lib/components/common'
+  import { ChevronLeft } from 'lucide-svelte'
+</script>

-        <button onclick={increment}>
-          Clicked {count} {count === 1 ? 'time' : 'times'}
-        </button>
-        ```
-    *   Use `$derived` for computed values based on other reactive state.
-        ```svelte
-        <script>
-          let count = $state(0);
-          const doubled = $derived(count * 2);
-        </script>
+<Button variant="default" onclick={handleClick}>Label</Button>
+<Button startIcon={{ icon: ChevronLeft }} iconOnly onclick={prev} />
+```

-        <p>{count} * 2 = {doubled}</p>
-        ```
-    *   Use `$effect` for side effects that need to run when reactive values change (e.g., logging, manual DOM manipulation, data fetching). Remember `$effect` does not run on the server.
-        ```svelte
-        <script>
-          let count = $state(0);
+Props: `variant?: 'accent' | 'accent-secondary' | 'default' | 'subtle'`, `unifiedSize?: 'sm' | 'md' | 'lg'`, `startIcon?: { icon: SvelteComponent }`, `iconOnly?: boolean`, `disabled?: boolean`

-          $effect(() => {
-            console.log('The count is now', count);
-            if (count > 5) {
-              alert('Count is too high!');
-            }
-          });
-        </script>
-        ```
+### Text inputs — `<TextInput>`

-2.  **Props with `$props`**:
-    *   Declare component props using `$props()`. This offers better clarity and flexibility compared to `export let`.
-        ```svelte
-        <script>
-          // ChildComponent.svelte
-          let { name, age = $state(30) } = $props();
-        </script>
+```svelte
+<script>
+  import { TextInput } from '$lib/components/common'
+</script>

-        <p>Name: {name}</p>
-        <p>Age: {age}</p>
-        ```
-    *   For bindable props, use `$bindable`.
-        ```svelte
-        <script>
-          // MyInput.svelte
-          let { value = $bindable() } = $props();
-        </script>
+<TextInput bind:value={val} placeholder="Enter value" />
+```

-        <input bind:value />
-        ```
+Props: `value?: string | number` (bindable), `placeholder?: string`, `disabled?: boolean`, `error?: string | boolean`, `size?: 'sm' | 'md' | 'lg'`

-## Event Handling
+### Selects — `<Select>`

-*   **Use direct event attributes**: Svelte 5 moves away from `on:` directives for DOM events.
-    *   **Do**: `<button onclick={handleClick}>...</button>`
-    *   **Don't**: `<button on:click={handleClick}>...</button>`
-*   **For component events, prefer callback props**: Instead of `createEventDispatcher`, pass functions as props.
-    ```svelte
-    <!-- Parent.svelte -->
-    <script>
-      import Child from './Child.svelte';
-      let message = $state('');
-      function handleChildEvent(detail) {
-        message = detail;
-      }
-    </script>
-    <Child onCustomEvent={handleChildEvent} />
-    <p>Message from child: {message}</p>
+```svelte
+<script>
+  import Select from '$lib/components/select/Select.svelte'
+</script>

-    <!-- Child.svelte -->
-    <script>
-      let { onCustomEvent } = $props();
-      function emitEvent() {
-        onCustomEvent('Hello from child!');
-      }
-    </script>
-    <button onclick={emitEvent}>Send Event</button>
-    ```
+<Select items={[{ label: 'Jan', value: 1 }]} bind:value={selected} />
+```

-## Snippets for Content Projection
+Props: `items?: Array<{ label?: string; value: any }>`, `value` (bindable), `placeholder?: string`, `clearable?: boolean`, `size?: 'sm' | 'md' | 'lg'`

-*   **Use `{#snippet ...}` and `{@render ...}` instead of slots**: Snippets are more powerful and flexible.
-    ```svelte
-    <!-- Parent.svelte -->
-    <script>
-      import Card from './Card.svelte';
-    </script>
+### Icons — `lucide-svelte`

-    <Card>
-      {#snippet title()}
-        My Awesome Title
-      {/snippet}
-      {#snippet content()}
-        <p>Some interesting content here.</p>
-      {/snippet}
-    </Card>
+Never write inline SVGs. Import from `lucide-svelte`:

-    <!-- Card.svelte -->
-    <script>
-      let { title, content } = $props();
-    </script>
+```svelte
+<script>
+  import { ChevronLeft, X } from 'lucide-svelte'
+</script>
+<ChevronLeft size={16} />
+```

-    <article>
-      <header>{@render title()}</header>
-      <div>{@render content()}</div>
-    </article>
-    ```
-*   Default content is passed via the `children` prop (which is a snippet).
-    ```svelte
-    <!-- Wrapper.svelte -->
-    <script>
-      let { children } = $props();
-    </script>
-    <div>
-      {@render children?.()}
-    </div>
-    ```
+## Form Components

-## Component Design
+Form components (TextInput, Toggle, Select, etc.) should use the unified size system when placed together.

-1.  **Create Small, Reusable Components**: Break down complex UIs into smaller, focused components. Each component should have a single responsibility. This also aids performance by limiting the scope of reactivity updates.
-2.  **Descriptive Naming**: Use clear and descriptive names for variables, functions, and components.
-3.  **Minimize Logic in Components**: Move complex business logic to utility functions or services. Keep components focused on presentation and interaction.
+## Styling

-## State Management (Stores)
+- Use Tailwind CSS for all styling — no custom CSS
+- Use Windmill's theming classes for colors/surfaces (see `frontend/brand-guidelines.md`)
+- Read component props JSDoc before using them

-1.  **Segment Stores**: Avoid a single global store. Create multiple stores, each responsible for a specific piece of global state (e.g., `userStore.js`, `themeStore.js`). This can help limit reactivity updates to only the parts of the UI that depend on specific state segments.
-2.  **Use Custom Stores for Complex Logic**: For stores with related methods, create custom stores.
-    ```javascript
-    // counterStore.js
-    import { writable } from 'svelte/store';
+## Svelte MCP Server

-    function createCounter() {
-      const { subscribe, set, update } = writable(0);
+Use the Svelte MCP tools when working on Svelte code:

-      return {
-        subscribe,
-        increment: () => update(n => n + 1),
-        decrement: () => update(n => n - 1),
-        reset: () => set(0)
-      };
-    }
-    export const counter = createCounter();
-    ```
-3.  **Use Context API for Localized State**: For state shared within a component subtree, consider Svelte's context API (`setContext`, `getContext`) instead of global stores when the state doesn't need to be truly global.
-
-## Performance Optimizations (Svelte 5)
-
-When generating Svelte 5 code, prioritize frontend performance by applying the following principles:
-
-### General Svelte 5 Principles
-
-   **Leverage the Compiler:** Trust Svelte's compiler to generate optimized JavaScript. Avoid manual DOM manipulation (`document.querySelector`, etc.) unless absolutely necessary for integrating third-party libraries that lack Svelte adapters.
-   **Keep Components Small and Focused:** Reinforcing from Component Design, smaller components lead to less complex reactivity graphs and more targeted, efficient updates.
-
-### Reactivity & State Management
-
-   **Optimize Computations with `$derived`:** Always use `$derived` for computed values that depend on other state. This ensures the computation only runs when its specific dependencies change, avoiding unnecessary work compared to recomputing derived values in `$effect` or less efficient methods.
-   **Minimize `$effect` Usage:** Use `$effect` sparingly and only for true side effects that interact with the outside world or non-Svelte state. Avoid putting complex logic or state updates *within* an `$effect` unless those updates are explicitly intended as a reaction to external changes or non-Svelte state. Excessive or complex effects can impact rendering performance.
-   **Structure State for Fine-Grained Updates:** Design your `$state` objects or variables such that updates affect only the necessary parts of the UI. Avoid putting too much unrelated state into a single large object that gets frequently updated, as this can potentially trigger broader updates than necessary. Consider normalizing complex, nested state.
-
-### List Rendering (`{#each}`)
-
-   **Mandate `key` Attribute:** Always use a `key` attribute (`{#each items as item (item.id)}`) that refers to a unique, stable identifier for each item in a list. This is critical for allowing Svelte to efficiently update, reorder, add, or remove list items without destroying and re-creating unnecessary DOM elements and component instances.
-
-### Component Loading & Bundling
-
-   **Implement Lazy Loading/Code Splitting:** For routes, components, or modules that are not immediately needed on page load, use dynamic imports (`import(...)`) to split the code bundle. SvelteKit handles this automatically for routes, but it can be applied manually to components using helper patterns if needed.
-   **Be Mindful of Third-Party Libraries:** When incorporating external libraries, import only the necessary functions or components to minimize the final bundle size. Prefer libraries designed to be tree-shakeable.
-
-### Rendering & DOM
-
-   **Use CSS for Animations/Transitions:** Prefer CSS animations or transitions where possible for performance. Svelte's built-in `transition:` directive is also highly optimized and should be used for complex state-driven transitions, but simple cases can often use plain CSS.
-   **Optimize Image Loading:** Implement best practices for images: use optimized formats (WebP, AVIF), lazy loading (`loading="lazy"`), and responsive images (`<picture>`, `srcset`) to avoid loading unnecessarily large images.
-
-### Server-Side Rendering (SSR) & Hydration
-
-   **Ensure SSR Compatibility:** Write components that can be rendered on the server for faster initial page loads. Avoid relying on browser-specific APIs (like `window` or `document`) in the main `<script>` context. If necessary, use `$effect` or check `if (browser)` inside effects to run browser-specific code only on the client.
-   **Minimize Work During Hydration:** Structure components and data fetching such that minimal complex setup or computation is required when the client-side Svelte code takes over from the server-rendered HTML. Heavy synchronous work during hydration can block the main thread.
-
-## General Clean Code Practices
-
-1.  **Organized File Structure**: Group related files together. A common structure:
-    ```
-    /src
-    |-- /routes      // Page components (if using a router like SvelteKit)
-    |-- /lib         // Utility functions, services, constants (SvelteKit often uses this)
-    |   |-- /stores
-    |   |-- /utils
-    |   |-- /services
-    |   |-- /components  // Reusable UI components
-    |-- App.svelte
-    |-- main.js (or main.ts)
-    ```
-2.  **Scoped Styles**: Keep CSS scoped to components to avoid unintended side effects and improve maintainability. Avoid `:global` where possible.
-3.  **Immutability**: With Svelte 5 and `$state`, direct assignments to properties of `$state` objects (`obj.prop = value;`) are generally fine as Svelte's reactivity system handles updates. However, for non-rune state or when interacting with other systems, understanding and sometimes preferring immutable updates (creating new objects/arrays) can still be relevant.
-4.  **Use `class:` and `style:` directives**: For dynamic classes and styles, use Svelte's built-in directives for cleaner templates and potentially optimized updates.
-    ```svelte
-    <script>
-      let isActive = $state(true);
-      let color = $state('blue');
-    </script>
-
-    <div class:active={isActive} style:color={color}>
-      Hello
-    </div>
-    ```
-5.  **Stay Updated**: Keep Svelte and its related packages up to date to benefit from the latest features, performance improvements, and security fixes.
+1. **list-sections**: Call first to discover available docs
+2. **get-documentation**: Fetch relevant sections based on use_cases
+3. **svelte-autofixer**: MUST use on all Svelte code before finalizing — keep calling until no issues
+4. **playground-link**: Only after the user confirms, and only if the code was NOT written to project files
--- a/.envrc
+++ b/.envrc
@@ -1 +1,7 @@
 use flake
+
+# Per-worktree overrides (ports, DATABASE_URL, etc.) written by webmux/workmux
+# post-create hooks. Must come after `use flake` so they take precedence over
+# the flake's defaults.
+# shellcheck source=/dev/null
+[ -f .env.local ] && source .env.local
--- a/.github/DockerfileBackendTests
+++ b/.github/DockerfileBackendTests
@@ -42,7 +42,11 @@ RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VER
 RUN /usr/local/bin/python3 -m pip install pip-tools

 # Bun
-COPY --from=oven/bun:1.3.8 /usr/local/bin/bun /usr/bin/bun
+COPY --from=oven/bun:1.3.10 /usr/local/bin/bun /usr/bin/bun
+
+# Install windmill CLI
+RUN bun install -g windmill-cli \
+    && ln -s $(bun pm bin -g)/wmill /usr/bin/wmill

 ARG TARGETPLATFORM

--- a/.github/change-versions-mac.sh
+++ b/.github/change-versions-mac.sh
@@ -15,11 +15,8 @@ sed -i '' -e "/\"version\": /s/: .*,/: \"$VERSION\",/" ${root_dirpath}/typescrip
 sed -i '' -e "/\"version\": /s/: .*,/: \"$VERSION\",/" ${root_dirpath}/frontend/package.json
 sed -i '' -e "/^version =/s/= .*/= \"$VERSION\"/" ${root_dirpath}/python-client/wmill/pyproject.toml
 sed -i '' -e "/^windmill-api =/s/= .*/= \"\\^$VERSION\"/" ${root_dirpath}/python-client/wmill/pyproject.toml
-sed -i '' -e "/^version =/s/= .*/= \"$VERSION\"/" ${root_dirpath}/python-client/wmill_pg/pyproject.toml
 sed -i '' -e "/^[[:space:]]*ModuleVersion[[:space:]]*=/s/= .*/= '$VERSION'/" ${root_dirpath}/powershell-client/WindmillClient/WindmillClient.psd1
-# sed -i '' -e "/^wmill =/s/= .*/= \"\\^$VERSION\"/" python-client/wmill_pg/pyproject.toml
 sed -i '' -e "/^wmill =/s/= .*/= \">=$VERSION\"/" ${root_dirpath}/lsp/Pipfile
-sed -i '' -e "/^wmill_pg =/s/= .*/= \">=$VERSION\"/" ${root_dirpath}/lsp/Pipfile

 sed -i '' -E "s/name = \"windmill\"\nversion = \"[^\"]*\"\\n(.*)/name = \"windmill\"\nversion = \"$VERSION\"\\n\\1/" ${root_dirpath}/backend/Cargo.lock

--- a/.github/change-versions.sh
+++ b/.github/change-versions.sh
@@ -16,11 +16,8 @@ sed -i -e "/\"version\": /s/: .*,/: \"$VERSION\",/" ${root_dirpath}/typescript-c
 sed -i -e "/\"version\": /s/: .*,/: \"$VERSION\",/" ${root_dirpath}/frontend/package.json
 sed -i -e "/^version =/s/= .*/= \"$VERSION\"/" ${root_dirpath}/python-client/wmill/pyproject.toml
 sed -i -e "/^windmill-api =/s/= .*/= \"\\^$VERSION\"/" ${root_dirpath}/python-client/wmill/pyproject.toml
-sed -i -e "/^version =/s/= .*/= \"$VERSION\"/" ${root_dirpath}/python-client/wmill_pg/pyproject.toml
 sed -i -e "/^[[:space:]]*ModuleVersion[[:space:]]*=/s/= .*/= '$VERSION'/" ${root_dirpath}/powershell-client/WindmillClient/WindmillClient.psd1
-# sed -i -e "/^wmill =/s/= .*/= \"\\^$VERSION\"/" ${root_dirpath}/python-client/wmill_pg/pyproject.toml
 sed -i -e "/^wmill =/s/= .*/= \">=$VERSION\"/" ${root_dirpath}/lsp/Pipfile
-sed -i -e "/^wmill_pg =/s/= .*/= \">=$VERSION\"/" ${root_dirpath}/lsp/Pipfile

 sed -i -zE "s/name = \"windmill\"\nversion = \"[^\"]*\"\\n(.*)/name = \"windmill\"\nversion = \"$VERSION\"\\n\\1/" ${root_dirpath}/backend/Cargo.lock

--- a/.github/codex/pr-review.prompt.md
+++ b/.github/codex/pr-review.prompt.md
@@ -0,0 +1,23 @@
+You are reviewing a GitHub pull request for this repository.
+
+Review policy:
+- Read `CLAUDE.md` before reviewing code.
+- Only report issues you are confident are real and introduced by this pull request.
+- Focus on bugs, security problems, and clear `CLAUDE.md` violations.
+- Do not report style nits, speculative concerns, pre-existing issues, or problems that a normal linter/typechecker would obviously catch.
+- Keep the review high signal. If there is no clear issue, return no findings.
+
+Repository context:
+- Read `./.github/codex/pr-review-context.md` for the PR metadata and the exact diff commands to use.
+- Review only the changes introduced by this PR.
+- Read additional files only when the diff is not enough to validate a finding.
+- Do not modify any files.
+
+Output requirements:
+- Return a GitHub PR comment in markdown, not JSON.
+- Start with `## Codex Review`.
+- Give a short overall summary first.
+- If you found high-signal issues, list them in a short numbered list with file paths and line numbers when you know them confidently.
+- If you found no high-signal issues, say that explicitly.
+- End with a `### Reproduction instructions` section containing a short descriptive paragraph for a tester explaining how to navigate the app to observe the change. Do not make it a numbered list. If the diff is not enough to infer this safely, say that plainly.
+- Prefer at most 10 findings.
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -31,9 +31,3 @@ updates:
    directory: "/python-client/wmill"
    schedule:
      interval: "weekly"
-      
-  # Maintain dependencies for wmill_pg python client
-  - package-ecosystem: "pip"
-    directory: "/python-client/wmill_pg"
-    schedule:
-      interval: "weekly"
--- a/.github/workflows/backend-check.yml
+++ b/.github/workflows/backend-check.yml
@@ -119,6 +119,18 @@ jobs:
        with:
          cache-workspaces: backend
          toolchain: 1.93.0
+      - name: Fix stale v8 build cache
+        working-directory: ./backend
+        run: |
+          # Cargo cache may preserve v8 build fingerprints without the actual
+          # librusty_v8.a library. Since fingerprints look valid, cargo skips
+          # build.rs re-run, causing "could not find native static library rusty_v8".
+          for profile in debug release; do
+            if [ -d "target/$profile/.fingerprint" ] && [ ! -f "target/$profile/gn_out/obj/librusty_v8.a" ]; then
+              echo "Cleaning stale v8 build artifacts in target/$profile"
+              rm -rf "target/$profile/build/v8-"* "target/$profile/.fingerprint/v8-"*
+            fi
+          done
      - name: cargo check
        timeout-minutes: 16
        working-directory: ./backend
--- a/.github/workflows/backend-test-windows.yml
+++ b/.github/workflows/backend-test-windows.yml
@@ -0,0 +1,167 @@
+name: Backend integration tests (Windows)
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - "ci-windows-tests"
+    tags:
+      - "v*"
+
+env:
+  CARGO_INCREMENTAL: 0
+  SQLX_OFFLINE: true
+  DISABLE_EMBEDDING: true
+
+jobs:
+  cargo_test_windows:
+    runs-on: blacksmith-16vcpu-windows-2025
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Read EE repo commit hash
+        shell: pwsh
+        run: |
+          $ee_repo_ref = Get-Content .\backend\ee-repo-ref.txt
+          echo "ee_repo_ref=$ee_repo_ref" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+      - name: Checkout windmill-ee-private repository
+        uses: actions/checkout@v4
+        with:
+          repository: windmill-labs/windmill-ee-private
+          path: ./windmill-ee-private
+          ref: ${{ env.ee_repo_ref }}
+          token: ${{ secrets.WINDMILL_EE_PRIVATE_ACCESS }}
+          fetch-depth: 0
+
+      - name: Substitute EE code
+        shell: bash
+        run: |
+          ./backend/substitute_ee_code.sh --copy --dir ./windmill-ee-private
+
+      - name: Setup PostgreSQL
+        uses: ikalnytskyi/action-setup-postgres@v6
+        with:
+          username: postgres
+          password: changeme
+          database: windmill
+          port: 5432
+
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          cache-workspaces: backend
+          toolchain: 1.93.0
+
+      - uses: actions/setup-dotnet@v4
+        with:
+          dotnet-version: "9.0.x"
+
+      - uses: denoland/setup-deno@v2
+        with:
+          deno-version: v2.x
+
+      - uses: actions/setup-go@v2
+        with:
+          go-version: 1.21.5
+
+      - uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: 1.3.10
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - uses: astral-sh/setup-uv@v6.2.1
+        with:
+          version: "0.9.24"
+
+      - uses: shivammathur/setup-php@v2
+        with:
+          php-version: "8.3"
+          tools: composer
+
+      - name: Install windmill CLI
+        shell: bash
+        run: |
+          cd cli
+          bash gen_wm_client.sh
+          bun install
+          mkdir -p "$HOME/.local/bin"
+          printf '#!/bin/sh\nexec bun run "%s/cli/src/main.ts" "$@"\n' "$GITHUB_WORKSPACE" > "$HOME/.local/bin/wmill"
+          chmod +x "$HOME/.local/bin/wmill"
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
+      - name: Install OpenSSL via vcpkg
+        run: |
+          vcpkg.exe install openssl-windows:x64-windows
+          vcpkg.exe install openssl:x64-windows-static
+          vcpkg.exe integrate install
+
+      - name: Get runtime paths
+        id: runtime-paths
+        shell: pwsh
+        run: |
+          echo "DENO_PATH=$($(Get-Command deno).Source)" >> $env:GITHUB_OUTPUT
+          echo "BUN_PATH=$($(Get-Command bun).Source)" >> $env:GITHUB_OUTPUT
+          echo "NODE_BIN_PATH=$($(Get-Command node).Source)" >> $env:GITHUB_OUTPUT
+          echo "GO_PATH=$($(Get-Command go).Source)" >> $env:GITHUB_OUTPUT
+          echo "UV_PATH=$($(Get-Command uv).Source)" >> $env:GITHUB_OUTPUT
+          echo "PHP_PATH=$($(Get-Command php).Source)" >> $env:GITHUB_OUTPUT
+          echo "COMPOSER_PATH=$($(Get-Command composer).Source)" >> $env:GITHUB_OUTPUT
+          echo "POWERSHELL_PATH=$($(Get-Command pwsh).Source)" >> $env:GITHUB_OUTPUT
+          echo "DOTNET_PATH=$($(Get-Command dotnet).Source)" >> $env:GITHUB_OUTPUT
+
+      - name: Build DuckDB FFI module
+        working-directory: backend/windmill-duckdb-ffi-internal
+        timeout-minutes: 30
+        run: |
+          cargo build --release -p windmill_duckdb_ffi_internal
+          New-Item -ItemType Directory -Path ..\target\debug -Force
+          Copy-Item target\release\windmill_duckdb_ffi_internal.dll ..\target\debug\
+
+      - name: Print runtime versions and env
+        shell: pwsh
+        run: |
+          deno --version
+          bun -v
+          node --version
+          go version
+          python3 --version
+          php --version
+          pwsh --version
+          dotnet --version
+          echo "TEMP=$env:TEMP"
+          echo "TMP=$env:TMP"
+          echo "USERPROFILE=$env:USERPROFILE"
+          echo "HOME=$env:HOME"
+
+      - name: cargo test
+        working-directory: backend
+        timeout-minutes: 60
+        env:
+          DATABASE_URL: postgres://postgres:changeme@localhost:5432/windmill
+          RUST_LOG: "off"
+          RUST_LOG_STYLE: never
+          CARGO_NET_GIT_FETCH_WITH_CLI: true
+          CARGO_BUILD_JOBS: 12
+          VCPKGRS_DYNAMIC: 1
+          # OPENSSL_DIR is set inside `run`: the `env` context only holds workflow-defined variables, so the runner-provided VCPKG_INSTALLATION_ROOT would expand to an empty string here.
+          DENO_PATH: ${{ steps.runtime-paths.outputs.DENO_PATH }}
+          BUN_PATH: ${{ steps.runtime-paths.outputs.BUN_PATH }}
+          NODE_BIN_PATH: ${{ steps.runtime-paths.outputs.NODE_BIN_PATH }}
+          GO_PATH: ${{ steps.runtime-paths.outputs.GO_PATH }}
+          UV_PATH: ${{ steps.runtime-paths.outputs.UV_PATH }}
+          PHP_PATH: ${{ steps.runtime-paths.outputs.PHP_PATH }}
+          COMPOSER_PATH: ${{ steps.runtime-paths.outputs.COMPOSER_PATH }}
+          POWERSHELL_PATH: ${{ steps.runtime-paths.outputs.POWERSHELL_PATH }}
+          DOTNET_PATH: ${{ steps.runtime-paths.outputs.DOTNET_PATH }}
+          WMDEBUG_FORCE_V0_WORKSPACE_DEPENDENCIES: 1
+          WMDEBUG_FORCE_RUNNABLE_SETTINGS_V0: 1
+          WMDEBUG_FORCE_NO_LEGACY_DEBOUNCING_COMPAT: 1
+        run: >
+          $env:OPENSSL_DIR = "$env:VCPKG_INSTALLATION_ROOT\installed\x64-windows-static";
+          cargo test --no-fail-fast
+          --features enterprise,deno_core,duckdb,license,python,rust,scoped_cache,parquet,private,csharp,php,quickjs,mcp,run_inline
+          --all
+          -- --nocapture --test-threads=10
--- a/.github/workflows/backend-test.yml
+++ b/.github/workflows/backend-test.yml
@@ -1,6 +1,7 @@
 name: Backend only integration tests

 on:
+  workflow_dispatch:
  push:
    branches:
      - "main"
@@ -55,7 +56,7 @@ jobs:
          go-version: 1.21.5
      - uses: oven-sh/setup-bun@v2
        with:
-          bun-version: 1.3.8
+          bun-version: 1.3.10
      - uses: actions/setup-node@v4
        with:
          node-version: "20"
@@ -70,14 +71,36 @@ jobs:
        with:
          ruby-version: "3.3"
          bundler-cache: false
+      - name: Install windmill CLI from source
+        run: |
+          cd $GITHUB_WORKSPACE/cli
+          bash gen_wm_client.sh
+          bun install
+          mkdir -p "$HOME/.local/bin"
+          printf '#!/bin/sh\nexec bun run "%s/cli/src/main.ts" "$@"\n' "$GITHUB_WORKSPACE" > "$HOME/.local/bin/wmill"
+          chmod +x "$HOME/.local/bin/wmill"
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+        working-directory: /
      - name: Install PowerShell, mold and clang
        run: |
          sudo apt-get update && sudo apt-get install -y powershell mold clang libcurl4-openssl-dev
        working-directory: /
      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
-          cache: false
+          cache-workspaces: backend
          toolchain: 1.93.0
+      - name: Fix stale v8 build cache
+        working-directory: ./backend
+        run: |
+          # Cargo cache may preserve v8 build fingerprints without the actual
+          # librusty_v8.a library. Since fingerprints look valid, cargo skips
+          # build.rs re-run, causing "could not find native static library rusty_v8".
+          for profile in debug release; do
+            if [ -d "target/$profile/.fingerprint" ] && [ ! -f "target/$profile/gn_out/obj/librusty_v8.a" ]; then
+              echo "Cleaning stale v8 build artifacts in target/$profile"
+              rm -rf "target/$profile/build/v8-"* "target/$profile/.fingerprint/v8-"*
+            fi
+          done
      - name: Read EE repo commit hash
        run: |
          echo "ee_repo_ref=$(cat ./ee-repo-ref.txt)" >> "$GITHUB_ENV"
@@ -165,6 +188,12 @@ jobs:
          fi

          echo "NPM_TOKEN=${NPM_TOKEN}" >> $GITHUB_ENV
+          {
+            echo "TEST_NPMRC<<NPMRC_EOF"
+            echo "@windmill-test:registry=http://localhost:4873/"
+            echo "//localhost:4873/:_authToken=${NPM_TOKEN}"
+            echo "NPMRC_EOF"
+          } >> $GITHUB_ENV
          echo "Got NPM token successfully: ${NPM_TOKEN:0:10}..."

          # Configure npm globally with the auth token
@@ -222,4 +251,4 @@ jobs:
        run: |
          deno --version && bun -v && node --version && go version && python3 --version && php --version && ruby --version && pwsh --version && dotnet --version
          cd windmill-duckdb-ffi-internal && ./build_dev.sh && cd ..
-          DENO_PATH=$(which deno) BUN_PATH=$(which bun) NODE_BIN_PATH=$(which node) GO_PATH=$(which go) UV_PATH=$(which uv) PHP_PATH=$(which php) COMPOSER_PATH=$(which composer) RUBY_PATH=$(which ruby) RUBY_BUNDLE_PATH=$(which bundle) RUBY_GEM_PATH=$(which gem) POWERSHELL_PATH=$(which pwsh) DOTNET_PATH=$(which dotnet) cargo test --features enterprise,deno_core,duckdb,license,python,rust,scoped_cache,parquet,private,private_registry_test,csharp,php,ruby,mysql,quickjs,mcp --all -- --nocapture --test-threads=10
+          DENO_PATH=$(which deno) BUN_PATH=$(which bun) NODE_BIN_PATH=$(which node) GO_PATH=$(which go) UV_PATH=$(which uv) PHP_PATH=$(which php) COMPOSER_PATH=$(which composer) RUBY_PATH=$(which ruby) RUBY_BUNDLE_PATH=$(which bundle) RUBY_GEM_PATH=$(which gem) POWERSHELL_PATH=$(which pwsh) DOTNET_PATH=$(which dotnet) cargo test --features enterprise,deno_core,duckdb,license,python,rust,scoped_cache,parquet,private,private_registry_test,csharp,php,ruby,mysql,quickjs,mcp,run_inline --all -- --nocapture --test-threads=10
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -290,6 +290,49 @@ jobs:
          path: |
            *.json

+  benchmark_wac:
+    runs-on: ubicloud-standard-8
+    services:
+      postgres:
+        image: postgres
+        env:
+          POSTGRES_DB: windmill
+          POSTGRES_PASSWORD: changeme
+          POSTGRES_INITDB_ARGS: "-c shared_buffers=2GB -c work_mem=32MB -c effective_cache_size=4GB"
+        options: >-
+          --health-cmd pg_isready --health-interval 10s --health-timeout 5s
+          --health-retries 5
+          --shm-size=2g
+      windmill:
+        image: ghcr.io/windmill-labs/windmill-ee:main
+        env:
+          DATABASE_URL: postgres://postgres:changeme@postgres:5432/windmill
+          LICENSE_KEY: ${{ secrets.WM_LICENSE_KEY_CI }}
+          WORKER_GROUP: main
+          WORKER_TAGS: deno,bun,go,python3,bash,dependency,flow,nativets
+        options: >-
+          --pull always --health-interval 10s --health-timeout 5s
+          --health-retries 5 --health-cmd "curl
+          http://localhost:8000/api/version"
+        ports:
+          - 8000:8000
+    steps:
+      - uses: denoland/setup-deno@v2
+        with:
+          deno-version: v2.x
+      - name: benchmark
+        timeout-minutes: 30
+        run: deno run  -A -r
+          https://raw.githubusercontent.com/windmill-labs/windmill/${GITHUB_REF##ref/head/}/benchmarks/benchmark_suite.ts
+          -c
+          https://raw.githubusercontent.com/windmill-labs/windmill/${GITHUB_REF##ref/head/}/benchmarks/suite_wac.json
+      - name: Save benchmark results
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark_wac
+          path: |
+            *.json
+
  benchmark_graphs:
    runs-on: ubicloud
    needs:
@@ -297,6 +340,7 @@ jobs:
      - benchmark_dedicated
      - benchmark_4workers
      - benchmark_8workers
+      - benchmark_wac
    steps:
      - uses: denoland/setup-deno@v2
        with:
--- a/.github/workflows/build-publish-rh-image.yml
+++ b/.github/workflows/build-publish-rh-image.yml
@@ -9,7 +9,7 @@ permissions: write-all

 jobs:
  build_ee:
-    runs-on: ubicloud
+    runs-on: ubicloud-standard-4
    steps:
      - uses: actions/checkout@v4
        with:
--- a/.github/workflows/build-publish-rh8-image.yml
+++ b/.github/workflows/build-publish-rh8-image.yml
@@ -9,7 +9,7 @@ permissions: write-all

 jobs:
  build_ee:
-    runs-on: ubicloud
+    runs-on: ubicloud-standard-4
    steps:
      - uses: actions/checkout@v4
        with:
--- a/.github/workflows/check-system-prompts.yml
+++ b/.github/workflows/check-system-prompts.yml
@@ -0,0 +1,37 @@
+name: Check system prompts freshness
+
+on:
+  push:
+    paths:
+      - "system_prompts/**"
+      - "typescript-client/**"
+      - "python-client/wmill/wmill/client.py"
+      - "openflow.openapi.yaml"
+      - "backend/windmill-api/openapi.yaml"
+      - "cli/src/main.ts"
+      - "cli/src/commands/**"
+  pull_request:
+    paths:
+      - "system_prompts/**"
+      - "typescript-client/**"
+      - "python-client/wmill/wmill/client.py"
+      - "openflow.openapi.yaml"
+      - "backend/windmill-api/openapi.yaml"
+      - "cli/src/main.ts"
+      - "cli/src/commands/**"
+
+jobs:
+  check-freshness:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: pip install pyyaml
+
+      - name: Check auto-generated files are up-to-date
+        run: bash system_prompts/check-freshness.sh
--- a/.github/workflows/claude.yml
+++ b/.github/workflows/claude.yml
@@ -13,10 +13,10 @@ on:
 jobs:
  check-membership:
    if: |
-      (github.event_name == 'issue_comment' && contains(github.event.comment.body, '/ai')) ||
-      (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '/ai')) ||
-      (github.event_name == 'pull_request_review' && contains(github.event.review.body, '/ai')) ||
-      (github.event_name == 'issues' && contains(github.event.issue.body, '/ai'))
+      (github.event_name == 'issue_comment' && startsWith(github.event.comment.body, '/ai') && !startsWith(github.event.comment.body, '/ai-fast')) ||
+      (github.event_name == 'pull_request_review_comment' && startsWith(github.event.comment.body, '/ai') && !startsWith(github.event.comment.body, '/ai-fast')) ||
+      (github.event_name == 'pull_request_review' && startsWith(github.event.review.body, '/ai') && !startsWith(github.event.review.body, '/ai-fast')) ||
+      (github.event_name == 'issues' && startsWith(github.event.issue.body, '/ai') && !startsWith(github.event.issue.body, '/ai-fast'))
    uses: ./.github/workflows/check-org-membership.yml
    secrets:
      access_token: ${{ secrets.ORG_ACCESS_TOKEN }}
--- a/.github/workflows/cli-tests.yml
+++ b/.github/workflows/cli-tests.yml
@@ -1,16 +1,19 @@
 name: CLI Tests

 on:
+  workflow_dispatch:
  push:
    branches: [main]
    paths:
-      - 'cli/**'
-      - '.github/workflows/cli-tests.yml'
+      - "cli/**"
+      - "backend/migrations/**"
+      - ".github/workflows/cli-tests.yml"
  pull_request:
    branches: [main]
    paths:
-      - 'cli/**'
-      - '.github/workflows/cli-tests.yml'
+      - "cli/**"
+      - "backend/migrations/**"
+      - ".github/workflows/cli-tests.yml"

 env:
  CARGO_TERM_COLOR: always
@@ -23,15 +26,15 @@ jobs:
      - name: Checkout code
        uses: actions/checkout@v4

-      - name: Setup Deno
-        uses: denoland/setup-deno@v2
-        with:
-          deno-version: v2.x
-
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
-          node-version: '20'
+          node-version: "20"
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: latest

      - name: Generate Windmill client
        working-directory: cli
@@ -69,15 +72,10 @@ jobs:
          cache: true
          cache-workspaces: backend

-      - name: Setup Deno
-        uses: denoland/setup-deno@v2
-        with:
-          deno-version: v2.x
-
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
-          node-version: '20'
+          node-version: "20"

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
@@ -90,6 +88,10 @@ jobs:
      - name: Symlink Node to /usr/bin/node
        run: sudo ln -sf $(which node) /usr/bin/node

+      - name: Install dependencies
+        working-directory: cli
+        run: bun install
+
      - name: Generate Windmill clients
        working-directory: cli
        run: |
@@ -101,12 +103,10 @@ jobs:
        env:
          DATABASE_URL: postgres://postgres:changeme@localhost:5432
          CI_MINIMAL_FEATURES: "true"
-        run: |
-          deno test --no-check --allow-all test/ \
-            --ignore=test/cargo_backend_example.test.ts
+        run: bun test --timeout 120000 test/

  test-windows:
-    runs-on: windows-latest
+    runs-on: blacksmith-16vcpu-windows-2025

    steps:
      - name: Checkout code
@@ -126,15 +126,10 @@ jobs:
          cache: true
          cache-workspaces: backend

-      - name: Setup Deno
-        uses: denoland/setup-deno@v2
-        with:
-          deno-version: v2.x
-
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
-          node-version: '20'
+          node-version: "20"

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
@@ -150,6 +145,10 @@ jobs:
          echo "BUN_PATH=$bunPath" >> $env:GITHUB_OUTPUT
          echo "NODE_BIN_PATH=$nodePath" >> $env:GITHUB_OUTPUT

+      - name: Install dependencies
+        working-directory: cli
+        run: bun install
+
      - name: Generate Windmill clients
        working-directory: cli
        shell: bash
@@ -165,9 +164,7 @@ jobs:
          CI_MINIMAL_FEATURES: "true"
          BUN_PATH: ${{ steps.runtime-paths.outputs.BUN_PATH }}
          NODE_BIN_PATH: ${{ steps.runtime-paths.outputs.NODE_BIN_PATH }}
-        run: |
-          deno test --no-check --allow-all test/ `
-            --ignore=test/cargo_backend_example.test.ts
+        run: bun test --timeout 120000 test/

  # Combined summary job for branch protection
  test-summary:
--- a/.github/workflows/codex-pr-review.yml
+++ b/.github/workflows/codex-pr-review.yml
@@ -0,0 +1,145 @@
+name: Codex Auto Review
+
+on:
+  pull_request:
+    types: [ready_for_review, opened]
+
+concurrency:
+  group: codex-review-${{ github.event.pull_request.number }}
+  cancel-in-progress: true
+
+jobs:
+  codex-review:
+    runs-on: ubicloud-standard-2
+    timeout-minutes: 30
+    if: github.event.pull_request.draft == false && github.event.pull_request.head.repo.fork == false
+    permissions:
+      contents: read
+      issues: write
+    steps:
+      - name: Check Codex configuration
+        id: codex_config
+        env:
+          CODEX_AUTH_JSON: ${{ secrets.CODEX_AUTH_JSON }}
+        run: |
+          if [ -n "$CODEX_AUTH_JSON" ]; then
+            echo "enabled=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "enabled=false" >> "$GITHUB_OUTPUT"
+            echo "CODEX_AUTH_JSON is not configured; skipping Codex review."
+          fi
+
+      - name: Checkout repository
+        if: steps.codex_config.outputs.enabled == 'true'
+        uses: actions/checkout@v5
+        with:
+          ref: refs/pull/${{ github.event.pull_request.number }}/merge
+          fetch-depth: 1
+
+      - name: Set up Node.js
+        if: steps.codex_config.outputs.enabled == 'true'
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - name: Install Codex CLI
+        if: steps.codex_config.outputs.enabled == 'true'
+        run: npm install --global @openai/codex@0.117.0
+
+      - name: Configure file-backed Codex auth
+        if: steps.codex_config.outputs.enabled == 'true'
+        env:
+          CODEX_AUTH_JSON: ${{ secrets.CODEX_AUTH_JSON }}
+        run: |
+          CODEX_HOME="$HOME/.codex"
+          echo "CODEX_HOME=$CODEX_HOME" >> "$GITHUB_ENV"
+          mkdir -p "$CODEX_HOME"
+          chmod 700 "$CODEX_HOME"
+          cat > "$CODEX_HOME/config.toml" <<'EOF'
+          cli_auth_credentials_store = "file"
+          EOF
+          printf '%s' "$CODEX_AUTH_JSON" > "$CODEX_HOME/auth.json"
+          chmod 600 "$CODEX_HOME/auth.json"
+          node -e 'JSON.parse(require("fs").readFileSync(process.argv[1], "utf8"))' "$CODEX_HOME/auth.json"
+
+      - name: Pre-fetch base and head refs for the PR
+        if: steps.codex_config.outputs.enabled == 'true'
+        env:
+          PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        run: |
+          git fetch --no-tags origin \
+            "$PR_BASE_REF" \
+            "+refs/pull/$PR_NUMBER/head"
+
+      - name: Write Codex review context
+        if: steps.codex_config.outputs.enabled == 'true'
+        env:
+          PR_REPOSITORY: ${{ github.repository }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+          PR_TITLE: ${{ github.event.pull_request.title }}
+          PR_BODY: ${{ github.event.pull_request.body || '' }}
+        run: |
+          mkdir -p .github/codex
+          node <<'NODE'
+          const fs = require('fs');
+          const lines = [
+            `Repository: ${process.env.PR_REPOSITORY}`,
+            `PR number: ${process.env.PR_NUMBER}`,
+            `Base SHA: ${process.env.PR_BASE_SHA}`,
+            `Head SHA: ${process.env.PR_HEAD_SHA}`,
+            '',
+            'PR title:',
+            process.env.PR_TITLE || '(empty)',
+            '',
+            'PR body:',
+            process.env.PR_BODY || '(empty)',
+            '',
+            'Changed commits command:',
+            `git log --oneline ${process.env.PR_BASE_SHA}..${process.env.PR_HEAD_SHA}`, // two-dot: only commits reachable from head, not base-only commits
+            '',
+            'Changed files command:',
+            `git diff --stat ${process.env.PR_BASE_SHA}...${process.env.PR_HEAD_SHA}`,
+            '',
+            'Full review diff command:',
+            `git diff --unified=0 ${process.env.PR_BASE_SHA}...${process.env.PR_HEAD_SHA}`
+          ];
+          fs.writeFileSync('.github/codex/pr-review-context.md', `${lines.join('\n')}\n`);
+          NODE
+
+      - name: Run Codex review
+        if: steps.codex_config.outputs.enabled == 'true'
+        run: |
+          codex exec \
+            -C "$GITHUB_WORKSPACE" \
+            -m gpt-5.4 \
+            -c 'model_reasoning_effort="xhigh"' \
+            -s read-only \
+            -o codex-final-message.md \
+            - < .github/codex/pr-review.prompt.md
+
+      - name: Post Codex review comment
+        if: steps.codex_config.outputs.enabled == 'true'
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ github.token }}
+          script: |
+            const fs = require('fs');
+            const path = `${process.env.GITHUB_WORKSPACE}/codex-final-message.md`;
+            if (!fs.existsSync(path)) {
+              core.info('Codex did not produce a final message; skipping PR comment.');
+              return;
+            }
+            const body = fs.readFileSync(path, 'utf8').trim();
+            if (!body) {
+              core.info('Codex final message was empty; skipping PR comment.');
+              return;
+            }
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.payload.pull_request.number,
+              body,
+            });
--- a/.github/workflows/discord-notification.yml
+++ b/.github/workflows/discord-notification.yml
@@ -6,6 +6,10 @@ on:
      - opened
      - ready_for_review
      - closed
+  issue_comment:
+    types:
+      - created
+      - edited

 jobs:
  notify_discord_when_pr_opened:
@@ -33,3 +37,22 @@ jobs:
      PR_NUMBER: ${{ github.event.pull_request.number }}
    secrets:
      DISCORD_BOT_TOKEN: ${{ secrets.DISCORD_AI_BOT_TOKEN }}
+
+  notify_discord_on_comment:
+    if: >
+      github.event_name == 'issue_comment'
+      && github.event.issue.pull_request
+      && github.event.comment.user.login != 'cloudflare-workers-and-pages[bot]'
+      && github.event.comment.user.login != 'ellipsis-dev[bot]'
+    uses: ./.github/workflows/shareable-discord-notification.yml
+    with:
+      PR_STATUS: "comment"
+      PR_NUMBER: ${{ github.event.issue.number }}
+      COMMENT_BODY: ${{ github.event.comment.body }}
+      COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
+      COMMENT_URL: ${{ github.event.comment.html_url }}
+      COMMENT_IS_EDIT: ${{ github.event.action == 'edited' }}
+      DISCORD_CHANNEL_ID: "1372204995868491786"
+      DISCORD_GUILD_ID: "930051556043276338"
+    secrets:
+      DISCORD_BOT_TOKEN: ${{ secrets.DISCORD_AI_BOT_TOKEN }}
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -212,6 +212,59 @@ jobs:
            ${{ steps.extract-ee.outputs.destination }}/*
            ${{ steps.extract-duckdb-ffi-internal.outputs.destination }}/*

+  attach_ee_debug_to_release:
+    needs: [build_ee]
+    runs-on: ubicloud
+    if: ${{ startsWith(github.ref, 'refs/tags/v') }}
+    strategy:
+      matrix:
+        platform: [linux/amd64, linux/arm64]
+        include:
+          - platform: linux/amd64
+            arch: amd64
+          - platform: linux/arm64
+            arch: arm64
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.ref }}
+
+      - name: Read EE repo commit hash
+        run: |
+          echo "ee_repo_ref=$(cat ./backend/ee-repo-ref.txt)" >> "$GITHUB_ENV"
+
+      - uses: actions/checkout@v4
+        with:
+          repository: windmill-labs/windmill-ee-private
+          path: ./windmill-ee-private
+          ref: ${{ env.ee_repo_ref }}
+          token: ${{ secrets.WINDMILL_EE_PRIVATE_ACCESS }}
+
+      - name: Substitute EE code
+        run: |
+          ./backend/substitute_ee_code.sh --copy --dir ./windmill-ee-private
+
+      - uses: depot/setup-action@v1
+
+      - name: Extract EE debug info from builder stage (depot cache hit)
+        uses: depot/build-push-action@v1
+        with:
+          context: .
+          platforms: ${{ matrix.platform }}
+          target: debuginfo
+          build-args: |
+            features=ee
+          outputs: type=local,dest=./debuginfo
+
+      - name: Rename debug file with corresponding architecture
+        run: |
+          mv ./debuginfo/windmill.debug ./debuginfo/windmill-ee-${{ matrix.arch }}.debug
+
+      - name: Attach debug file to release
+        uses: softprops/action-gh-release@v2
+        with:
+          files: ./debuginfo/windmill-ee-${{ matrix.arch }}.debug
+
  # attach_arm64_binary_to_release:
  #   needs: [build, build_ee]
  #   runs-on: ubicoud
--- a/.github/workflows/git-commands.yaml
+++ b/.github/workflows/git-commands.yaml
@@ -106,6 +106,19 @@ jobs:
          git config --local user.name "windmill-internal-app[bot]"
          git config pull.rebase true
          git pull origin $BRANCH_NAME
+
+          # Checkout the correct windmill-ee-private commit from ee-repo-ref.txt
+          if [ -f backend/ee-repo-ref.txt ]; then
+            EE_REF=$(cat backend/ee-repo-ref.txt | tr -d '[:space:]')
+            echo "Checking out windmill-ee-private at commit: $EE_REF"
+            cd windmill-ee-private
+            git fetch origin $EE_REF
+            git checkout $EE_REF
+            cd ..
+          else
+            echo "Warning: ee-repo-ref.txt not found, using default branch"
+          fi
+
          mkdir -p frontend/build
          cd backend
          cargo install sqlx-cli --version 0.8.5
--- a/.github/workflows/git-sync-test.yml
+++ b/.github/workflows/git-sync-test.yml
@@ -0,0 +1,209 @@
+name: Git Sync Integration Tests
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [main]
+    paths:
+      - "backend/windmill-git-sync/**"
+      - "backend/windmill-api-integration-tests/tests/git_sync*"
+      - "backend/ee-repo-ref.txt"
+      - "integration_tests/test/git_sync_test.py"
+      - ".github/workflows/git-sync-test.yml"
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths:
+      - "backend/windmill-git-sync/**"
+      - "backend/windmill-api-integration-tests/tests/git_sync*"
+      - "backend/ee-repo-ref.txt"
+      - "integration_tests/test/git_sync_test.py"
+      - ".github/workflows/git-sync-test.yml"
+
+concurrency:
+  group: git-sync-test-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  check-relevance:
+    runs-on: ubuntu-latest
+    outputs:
+      should_run: ${{ steps.check.outputs.should_run }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Check if git sync related files changed
+        id: check
+        env:
+          WINDMILL_EE_PRIVATE_ACCESS: ${{ secrets.WINDMILL_EE_PRIVATE_ACCESS }}
+        run: |
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            BASE=${{ github.event.pull_request.base.sha }}
+          else
+            BASE=${{ github.event.before }}
+          fi
+
+          CHANGED_FILES=$(git diff --name-only "$BASE"..HEAD 2>/dev/null || echo "")
+          echo "Changed files:"
+          echo "$CHANGED_FILES"
+
+          # Direct git sync file changes — always relevant
+          if echo "$CHANGED_FILES" | grep -qE '^(backend/windmill-git-sync/|backend/windmill-api-integration-tests/tests/git_sync|integration_tests/test/git_sync|\.github/workflows/git-sync-test\.yml)'; then
+            echo "should_run=true" >> "$GITHUB_OUTPUT"
+            echo "Relevant: direct git sync file changes"
+            exit 0
+          fi
+
+          # If ee-repo-ref.txt changed, check if the EE diff touches windmill-git-sync/
+          if echo "$CHANGED_FILES" | grep -q '^backend/ee-repo-ref.txt$'; then
+            NEW_REF=$(cat backend/ee-repo-ref.txt)
+            OLD_REF=$(git show "$BASE:backend/ee-repo-ref.txt" 2>/dev/null || echo "")
+
+            if [ -n "$OLD_REF" ] && [ "$OLD_REF" != "$NEW_REF" ]; then
+              # Clone EE repo and check diff
+              git clone --bare "https://x-access-token:${WINDMILL_EE_PRIVATE_ACCESS}@github.com/windmill-labs/windmill-ee-private.git" /tmp/ee-repo 2>/dev/null
+              EE_CHANGED=$(git -C /tmp/ee-repo diff --name-only "$OLD_REF".."$NEW_REF" 2>/dev/null || echo "")
+              echo "EE changed files:"
+              echo "$EE_CHANGED"
+
+              if echo "$EE_CHANGED" | grep -q '^windmill-git-sync/'; then
+                echo "should_run=true" >> "$GITHUB_OUTPUT"
+                echo "Relevant: EE git sync files changed"
+                exit 0
+              fi
+            fi
+          fi
+
+          echo "should_run=false" >> "$GITHUB_OUTPUT"
+          echo "No git sync relevant changes detected, skipping tests"
+
+  git_sync_e2e:
+    needs: [check-relevance]
+    if: ${{ github.event_name == 'workflow_dispatch' || needs.check-relevance.outputs.should_run == 'true' }} # manual runs have no diff base, so check-relevance cannot mark them relevant
+    runs-on: ubicloud-standard-16
+    services:
+      postgres:
+        image: postgres:14
+        ports:
+          - 5432:5432
+        env:
+          POSTGRES_DB: windmill
+          POSTGRES_PASSWORD: changeme
+        options: >-
+          --health-cmd pg_isready --health-interval 10s --health-timeout 5s
+          --health-retries 5
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.ref }}
+          fetch-depth: 0
+
+      - name: Read EE repo commit hash
+        run: |
+          echo "ee_repo_ref=$(cat ./backend/ee-repo-ref.txt)" >> "$GITHUB_ENV"
+
+      - uses: actions/checkout@v4
+        with:
+          repository: windmill-labs/windmill-ee-private
+          path: ./windmill-ee-private
+          ref: ${{ env.ee_repo_ref }}
+          token: ${{ secrets.WINDMILL_EE_PRIVATE_ACCESS }}
+          fetch-depth: 0
+
+      - name: Substitute EE code
+        run: |
+          ./backend/substitute_ee_code.sh --copy --dir ./windmill-ee-private # run from the repo root, where windmill-ee-private was checked out
+
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          cache-workspaces: backend
+          toolchain: 1.93.0
+
+      - uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: 1.3.10
+
+      - uses: denoland/setup-deno@v2
+        with:
+          deno-version: v2.x
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install wmill CLI
+        run: |
+          cd cli && bash gen_wm_client.sh && bun install
+          mkdir -p "$HOME/.local/bin"
+          printf '#!/bin/sh\nexec bun run "%s/cli/src/main.ts" "$@"\n' "$GITHUB_WORKSPACE" > "$HOME/.local/bin/wmill"
+          chmod +x "$HOME/.local/bin/wmill"
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
+      - name: Build Windmill
+        working-directory: ./backend
+        env:
+          SQLX_OFFLINE: true
+          CARGO_BUILD_JOBS: 12
+          RUSTFLAGS: ""
+        run: |
+          cargo build --features enterprise,private,license,zip
+
+      - name: Start Gitea
+        run: |
+          docker run -d --name gitea \
+            -e GITEA__database__DB_TYPE=sqlite3 \
+            -e GITEA__security__INSTALL_LOCK=true \
+            -e GITEA__server__HTTP_PORT=3000 \
+            -e GITEA__server__ROOT_URL=http://localhost:3000 \
+            -e GITEA__service__DISABLE_REGISTRATION=false \
+            -p 3000:3000 \
+            gitea/gitea:1.22-rootless
+          echo "Waiting for Gitea to be ready..."
+          for i in $(seq 1 30); do
+            if curl -sf http://localhost:3000/api/v1/version > /dev/null 2>&1; then
+              echo "Gitea is ready"
+              break
+            fi
+            sleep 2
+          done
+          curl -sf http://localhost:3000/api/v1/version > /dev/null || { echo "Gitea failed to start"; exit 1; }
+
+      - name: Start Windmill
+        working-directory: ./backend
+        env:
+          DATABASE_URL: postgres://postgres:changeme@localhost:5432/windmill
+          LICENSE_KEY: ${{ secrets.WM_LICENSE_KEY_CI }}
+          DENO_PATH: deno
+          BUN_PATH: bun
+          NODE_BIN_PATH: node
+        run: |
+          ./target/debug/windmill &
+          echo "Waiting for Windmill to be ready..."
+          for i in $(seq 1 60); do
+            if curl -sf http://localhost:8000/api/version > /dev/null 2>&1; then
+              echo "Windmill is ready"
+              break
+            fi
+            sleep 2
+          done
+          curl -sf http://localhost:8000/api/version > /dev/null || { echo "Windmill failed to start"; exit 1; }
+
+      - name: Run git sync E2E tests
+        timeout-minutes: 10
+        env:
+          GITEA_DOCKER_URL: http://localhost:3000
+          LICENSE_KEY: ${{ secrets.WM_LICENSE_KEY_CI }}
+        run: |
+          python3 -m venv .venv
+          .venv/bin/pip install -r integration_tests/requirements.txt
+          cd integration_tests && ../.venv/bin/python -m unittest -v test.git_sync_test
+
+      - name: Archive logs
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: Git Sync Integration Tests Logs
+          path: |
+            integration_tests/logs
--- a/.github/workflows/npm_on_release.yml
+++ b/.github/workflows/npm_on_release.yml
@@ -14,7 +14,7 @@ jobs:
        with:
          node-version: "20.x"
          registry-url: "https://registry.npmjs.org"
-      - run: cd typescript-client && ./publish.sh && cd ..
+      - run: cd typescript-client && ./publish.sh --access public && cd ..
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
  publish_cli:
@@ -25,9 +25,9 @@ jobs:
        with:
          node-version: "20.x"
          registry-url: "https://registry.npmjs.org"
-      - uses: denoland/setup-deno@v2
+      - uses: oven-sh/setup-bun@v2
        with:
-          deno-version: v2.x
-      - run: cd cli && ./build.sh && cd npm && npm publish
+          bun-version: latest
+      - run: cd cli && ./build.sh && cd npm && npm publish --access public
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
--- a/.github/workflows/pr-ready-review.yml
+++ b/.github/workflows/pr-ready-review.yml
@@ -22,6 +22,15 @@ jobs:
        with:
          fetch-depth: 1

+      - name: Read review prompt
+        id: review-prompt
+        run: |
+          # Random delimiter: a literal "EOF" line in the prompt file must not end the value early.
+          delimiter="EOF_$(openssl rand -hex 16)"
+          {
+            echo "REVIEW_PROMPT<<${delimiter}"; cat .claude/review-prompt.md; echo "${delimiter}"
+          } >> "$GITHUB_ENV"
+
      - name: Automatic PR Review
        uses: anthropics/claude-code-action@v1
        with:
@@ -31,18 +40,7 @@ jobs:
            REPO: ${{ github.repository }}
            PR NUMBER: ${{ github.event.pull_request.number }}

-            Please review this pull request and provide comprehensive feedback.
-
-            Focus on:
-            - Code quality and best practices
-            - Potential bugs or issues
-            - Performance considerations
-            - Security implications
-
-            Provide detailed feedback using inline comments for specific issues.
-            Use top-level comments for general observations or praise.
-
-            At the end of your review, add complete instructions to reproduce the added changes through the app interface. These instructions will be given to a tester so he can verify the changes. It should be a short descriptive text (not a step by step or a list) on how to navigate the app (what page, what action, what input, etc) to see the changes.
+            ${{ env.REVIEW_PROMPT }}
          claude_args: |
            --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*)"
            --model opus
--- a/.github/workflows/rust-client-check.yml
+++ b/.github/workflows/rust-client-check.yml
@@ -18,10 +18,7 @@ jobs:
    runs-on: ubicloud-standard-8
    steps:
      - uses: actions/checkout@v4
-      - uses: cachix/install-nix-action@v20
-        with:
-          extra_nix_config: |
-            experimental-features = nix-command flakes
+      - uses: cachix/install-nix-action@v31
      - name: Check rust client builds
        run: cd rust-client && nix develop ../ --command ./dev.nu --check
        timeout-minutes: 16
--- a/.github/workflows/rust_on_release.yml
+++ b/.github/workflows/rust_on_release.yml
@@ -10,10 +10,7 @@ jobs:
    runs-on: ubicloud-standard-8
    steps:
      - uses: actions/checkout@v4
-      - uses: cachix/install-nix-action@v20
-        with:
-          extra_nix_config: |
-            experimental-features = nix-command flakes
+      - uses: cachix/install-nix-action@v31
      - run: cd rust-client && nix develop ../ --command ./dev.nu --check --publish 
        env:
          CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
--- a/.github/workflows/shareable-discord-notification.yml
+++ b/.github/workflows/shareable-discord-notification.yml
@@ -24,9 +24,26 @@ on:
      DISCORD_GUILD_ID:
        description: "The Discord guild ID"
        type: string
+      COMMENT_BODY:
+        description: "The comment body"
+        type: string
+        default: ""
+      COMMENT_AUTHOR:
+        description: "The comment author"
+        type: string
+        default: ""
+      COMMENT_URL:
+        description: "The comment URL"
+        type: string
+        default: ""
+      COMMENT_IS_EDIT:
+        description: "Whether this is an edit of an existing comment"
+        type: string
+        default: "false"
    secrets:
      DISCORD_WEBHOOK_URL:
        description: "Discord Webhook URL"
+        required: false
      DISCORD_BOT_TOKEN:
        description: "Discord Bot Token"

@@ -117,3 +134,81 @@ jobs:
          curl -X PUT \
            -H "Authorization: Bot $BOT_TOKEN" \
            "https://discord.com/api/v10/channels/$thread_id/messages/$message_id/reactions/%E2%9C%85/@me"
+
+  post_comment:
+    runs-on: ubuntu-latest
+    if: ${{ inputs.PR_STATUS == 'comment' }}
+    steps:
+      - name: Post or update comment in Discord thread
+        env:
+          BOT_TOKEN: ${{ secrets.DISCORD_BOT_TOKEN }}
+          CHANNEL_ID: ${{ inputs.DISCORD_CHANNEL_ID }}
+          GUILD_ID: ${{ inputs.DISCORD_GUILD_ID }}
+          PR_NUMBER: ${{ inputs.PR_NUMBER }}
+          COMMENT_BODY: ${{ inputs.COMMENT_BODY }}
+          COMMENT_AUTHOR: ${{ inputs.COMMENT_AUTHOR }}
+          COMMENT_URL: ${{ inputs.COMMENT_URL }}
+          COMMENT_IS_EDIT: ${{ inputs.COMMENT_IS_EDIT }}
+        run: |
+          # 1) Find the thread by PR number (first match wins; empty when none matches or the response has no threads)
+          threads=$(curl -s -H "Authorization: Bot $BOT_TOKEN" \
+            "https://discord.com/api/v10/guilds/${GUILD_ID}/threads/active")
+          thread_id=$(echo "$threads" | jq -r \
+            --arg cid "$CHANNEL_ID" \
+            --arg pref "#${PR_NUMBER}:" \
+            '[.threads[]? | select(.parent_id == $cid and (.name | startswith($pref))) | .id] | first // empty')
+
+          if [ -z "$thread_id" ]; then
+            echo "Thread not found for PR #${PR_NUMBER}, skipping"
+            exit 0
+          fi
+
+          # 2) Truncate comment body to fit Discord's 2000 char limit
+          # Reserve space for the author line + link (~100 chars)
+          max_body=1800
+          if [ ${#COMMENT_BODY} -gt $max_body ]; then
+            # For bot comments, show the tail (conclusions/code tend to be at the end)
+            if [[ "$COMMENT_AUTHOR" == *"[bot]"* ]] || [[ "$COMMENT_AUTHOR" == *"-bot"* ]]; then
+              truncated_body="...${COMMENT_BODY: -$max_body}"
+            else
+              truncated_body="${COMMENT_BODY:0:$max_body}..."
+            fi
+          else
+            truncated_body="$COMMENT_BODY"
+          fi
+
+          # 3) Build the message content
+          if [ "$COMMENT_IS_EDIT" = "true" ]; then
+            message=$(printf '**%s** [edited comment](%s):\n%s' "$COMMENT_AUTHOR" "$COMMENT_URL" "$truncated_body")
+          else
+            message=$(printf '**%s** [commented](%s):\n%s' "$COMMENT_AUTHOR" "$COMMENT_URL" "$truncated_body")
+          fi
+          payload=$(jq -n --arg content "$message" '{content: $content, flags: 4, allowed_mentions: {parse: []}}')
+
+          # 4) If this is an edit, try to find and update the existing Discord message
+          if [ "$COMMENT_IS_EDIT" = "true" ]; then
+            # Search recent messages in the thread for one containing the comment URL
+            messages=$(curl -s -H "Authorization: Bot $BOT_TOKEN" \
+              "https://discord.com/api/v10/channels/${thread_id}/messages?limit=100")
+            existing_msg_id=$(echo "$messages" | jq -r \
+              --arg url "$COMMENT_URL" \
+              '[.[] | select(.content | contains($url))] | first | .id // empty')
+
+            if [ -n "$existing_msg_id" ]; then
+              echo "Updating existing Discord message $existing_msg_id"
+              curl -s -X PATCH \
+                -H "Authorization: Bot $BOT_TOKEN" \
+                -H "Content-Type: application/json" \
+                -d "$payload" \
+                "https://discord.com/api/v10/channels/${thread_id}/messages/${existing_msg_id}"
+              exit 0
+            fi
+            echo "Original Discord message not found, posting as new message"
+          fi
+
+          # 5) Post a new message to the thread
+          curl -s -X POST \
+            -H "Authorization: Bot $BOT_TOKEN" \
+            -H "Content-Type: application/json" \
+            -d "$payload" \
+            "https://discord.com/api/v10/channels/${thread_id}/messages"
--- a/.gitignore
+++ b/.gitignore
@@ -14,9 +14,21 @@ backend/.minio-data
 !.aiderignore
 rust-client/Cargo.toml

+# Worktree-generated port isolation
+.env.local
+.webmux.local.yaml
+
+# Worktree-specific Claude Code settings (generated by scripts/worktree-env)
+.claude/settings.local.json
+
 # Symlinked cache directories (for git worktrees)
 backend/target
 frontend/node_modules
 typescript-client/node_modules
+ai_evals/node_modules
+ai_evals/results/
 frontend/.svelte-kit
 backend/chrome_profiler.json
+.fast-check/
+__pycache__/
+.playwright-mcp/
--- a/.mcp.json
+++ b/.mcp.json
@@ -3,10 +3,6 @@
    "svelte": {
      "type": "http",
      "url": "https://mcp.svelte.dev/mcp"
-    },
-    "playwright": {
-      "command": "npx",
-      "args": ["@playwright/mcp@latest"]
    }
  }
 }
--- a/.webmux.yaml
+++ b/.webmux.yaml
@@ -0,0 +1,108 @@
+# Project display name in the dashboard
+name: Windmill
+
+workspace:
+  mainBranch: main
+  worktreeRoot: ../windmill__worktrees
+  defaultAgent: claude
+
+startupEnvs:
+  CARGO_FEATURES: "quickjs"
+  WM_CLONE_DB: false
+  USE_RUST_PLUGIN: false
+
+lifecycleHooks:
+  postCreate: bash ./scripts/post-create.sh
+  preRemove: bash ./scripts/pre-remove.sh
+
+auto_name:
+  provider: claude
+  model: haiku
+
+# Each service defines a port env var that webmux injects into pane and agent
+# process environments when creating a worktree. Ports are auto-assigned:
+# base + (slot x step).
+services:
+  - name: backend
+    portEnv: BACKEND_PORT
+    portStart: 8000
+    portStep: 10
+  - name: frontend
+    portEnv: FRONTEND_PORT
+    portStart: 3000
+    portStep: 10
+
+profiles:
+  full:
+    runtime: host
+    yolo: true
+    envPassthrough: []
+    systemPrompt: >
+      You are running inside a tmux session with other panes running services.
+      Pane layout (current window):
+      - Pane 0: this pane (claude agent)
+      - Pane 1: backend (cargo watch -x run)
+      - Pane 2: frontend (npm run dev)
+      To check logs, use: \`tmux capture-pane -t $(tmux display-message -t "$TMUX_PANE" -p '#{session_name}:#{window_name}').1 -p -S -50\` (backend) or \`tmux capture-pane -t $(tmux display-message -t "$TMUX_PANE" -p '#{session_name}:#{window_name}').2 -p -S -50\` (frontend).
+      For this window specifically, backend is running on: ${BACKEND_PORT} and frontend is running on: ${FRONTEND_PORT}.
+      To connect to the database, use this connection string: ${DATABASE_URL}
+      Because we are running backend with cargo watch, to verify your changes, just check the logs in the backend pane. No need for cargo check.
+      IMPORTANT: Read docs/autonomous-mode.md before starting any work.
+    panes:
+      - id: agent
+        kind: agent
+        focus: true
+      - id: backend
+        kind: command
+        split: right
+        workingDir: backend
+        command: PORT=${BACKEND_PORT:-8000} cargo watch -x "run ${CARGO_FEATURES:+--features $CARGO_FEATURES}"
+      - id: frontend
+        kind: command
+        split: bottom
+        workingDir: frontend
+        command: npm run generate-backend-client && REMOTE=${REMOTE:-http://localhost:${BACKEND_PORT:-8000}} npm run dev -- --port ${FRONTEND_PORT:-3000} --host 0.0.0.0
+        
+  frontendOnly:
+    runtime: host
+    yolo: true
+    envPassthrough: []
+    systemPrompt: >
+      You are running inside a tmux session with other panes running services.
+      Pane layout (current window):
+      - Pane 0: this pane (claude agent)
+      - Pane 1: frontend (npm run dev)
+      To check logs, use: \`tmux capture-pane -t $(tmux display-message -t "$TMUX_PANE" -p '#{session_name}:#{window_name}').1 -p -S -50\` (frontend).
+      On this window specifically, frontend is running on: ${FRONTEND_PORT}.
+      To connect to the database, use this connection string: ${DATABASE_URL}
+      Because we are running frontend with npm run dev, to verify your changes, just check the logs in the frontend pane. No need for npm run build.
+      IMPORTANT: Read docs/autonomous-mode.md before starting any work.
+    panes:
+      - id: agent
+        kind: agent
+        focus: true
+      - id: frontend
+        kind: command
+        split: right
+        workingDir: frontend
+        command: npm run generate-backend-client && npm run dev -- --port ${FRONTEND_PORT:-3000} --host 0.0.0.0
+
+  agentOnly:
+    runtime: host
+    yolo: true
+    envPassthrough: []
+    systemPrompt: >
+      IMPORTANT: Read docs/autonomous-mode.md before starting any work.
+    panes:
+      - id: agent
+        kind: agent
+        focus: true
+
+integrations:
+  github:
+    linkedRepos:
+      - repo: windmill-labs/windmill-ee-private
+        alias: ee-private
+        dir: ../windmill-ee-private__worktrees
+  linear:
+    enabled: true
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,68 +1,87 @@
-# Windmill Development Guide
+# Windmill

-## Overview
+Open-source platform for internal tools, workflows, API integrations, background jobs, and UIs. Rust backend + Svelte 5 frontend.

-Windmill is an open-source developer platform for building internal tools, workflows, API integrations, background jobs, workflows, and user interfaces. See @windmill-overview.mdc for full platform details.
+## Workflow

-## New Feature Implementation Guidelines
+1. **Understand**: Before coding, explore the codebase (see Code Navigation below). Use `outline` to understand file structure, `body` to read specific symbols, `def`/`callers`/`callees` to trace code, `Grep` to find usages. Read `docs/` for domain context.
+2. **Plan**: For non-trivial changes, use plan mode. For large features, break into reviewable stages.
+3. **Execute**: Follow coding patterns from skills (`rust-backend`, `svelte-frontend`)
+4. **Validate**: After every change, run the appropriate checks per `docs/validation.md`

-When implementing new features in Windmill, follow these best practices:
+## Documentation

- **Clean Code First**: Write clean, readable, and maintainable code. Prioritize clarity over cleverness.
- **Avoid Duplication at All Costs**: Before writing new code, thoroughly search for existing implementations that can be reused or extended.
- **Adapt Existing Code**: Refactor and generalize existing code when necessary to avoid logic duplication. Extract common patterns into reusable utilities.
- **Follow Established Patterns**: Study existing code patterns in the codebase and maintain consistency with established conventions.
- **Single Responsibility**: Each function, component, and module should have a single, well-defined responsibility.
- **Incremental Implementation**: Break large features into smaller, reviewable chunks that can be implemented and tested incrementally.
-
-## Language-Specific Guides
-
- Backend (Rust): see `backend/CLAUDE.md` and the `rust-backend` skill: `.claude/skills/rust-backend/SKILL.md`
- Frontend (Svelte 5): see `frontend/CLAUDE.md` and the `svelte-frontend` skill: `.claude/skills/svelte-frontend/SKILL.md`
+- **Validation**: `docs/validation.md` — what checks to run based on what you changed
+- **Enterprise**: `docs/enterprise.md` — EE file conventions and PR workflow
+- **Backend patterns**: use the `rust-backend` skill when writing Rust code
+- **Frontend patterns**: use the `svelte-frontend` skill when writing Svelte code. Do NOT edit svelte files unless you have read that skill.
+- **Code review**: use `/local-review` to review a PR for bugs and CLAUDE.md compliance
+- **Domain guides**: `.claude/skills/native-trigger/` and `frontend/tutorial-system-guide.mdc`
+- **Brand/UI guidelines**: `frontend/brand-guidelines.md`

 ## Dev Environment

 - **Backend**: `cargo run` from `backend/` (API at http://localhost:8000)
- **Frontend**: `REMOTE=http://localhost:8000 npm run dev` from `frontend/`
-  - The `REMOTE` env var configures the Vite proxy target. Without it, API calls proxy to `https://app.windmill.dev` instead of the local backend.
-  - The dev server starts on port 3000 (or 3001+ if 3000 is in use).
- **Default login**: `admin@windmill.dev` / `changeme`
- **Instance settings**: navigate to `/#superadmin-settings` (opens the drawer overlay)
+- **Frontend**: `REMOTE=http://localhost:8000 npm run dev` from `frontend/` (port 3000+)
+- **DB**: `psql postgres://postgres:changeme@localhost:5432/windmill`
+- **Login**: `admin@windmill.dev` / `changeme`
+- **Instance settings**: navigate to `/#superadmin-settings`
+- **Migrations**: use `cargo sqlx migrate add -r <name>` from `backend/` to create new migrations (never generate timestamps manually)

-## UI Testing with Playwright MCP
+## Banned Patterns

-When testing the frontend with the Playwright MCP tools:
+### `$bindable(default_value)` on optional props

-1. **Start servers**: Launch backend (`cargo run`) and frontend (`REMOTE=http://localhost:8000 npm run dev`) as background tasks
-2. **Wait for readiness**: Backend takes ~60s to compile; check output for `health check completed`. Frontend starts in ~5s.
-3. **Login flow**: Navigate to `/user/login`, click "Log in without third-party", fill email/password, submit
-4. **Instance settings drawer**: Navigate to `/#superadmin-settings` to open the drawer directly
-5. **Toggle components**: The YAML toggle uses a custom `<Toggle>` component where the checkbox is visually hidden (`sr-only`). Click the wrapper `<label>` element (the parent container with `cursor=pointer`), not the checkbox ref directly.
-6. **Console errors to ignore**: `critical_alerts` 404s are expected on CE builds (EE-only endpoint). VSCode worker 404s are dev-mode artifacts.
+Using `$bindable(default_value)` on props that can be `undefined` is **banned**. This pattern causes subtle bugs because the default value masks the `undefined` state.

-## Code Validation (MUST DO)
+**Bad:**

-After making code changes, you MUST run the appropriate checks and fix all errors before considering the work done:
-
- **Backend**: Run `cargo check` from the `backend/` directory. Only enable the feature flags needed for the code you changed — check `backend/Cargo.toml` `[features]` section to identify which flags gate the crates/modules you modified. For example: `cargo check --features enterprise,parquet` if you only touched enterprise and parquet code.
- **Frontend**: Run `npm run check` from the `frontend/` directory.
-
-## Querying the Database
-
-`backend/summarized_schema.txt` provides a compact overview of all tables, columns, types, ENUMs, and foreign keys. Use it to quickly understand the data model and relationships. Note: this file is a simplified summary — it omits indexes, constraints details, and other metadata.
-
-For exact table definitions (indexes, constraints, column defaults, etc.), query the database directly:
-
-```bash
-psql postgres://postgres:changeme@localhost:5432/windmill
+```svelte
+let { my_prop = $bindable(default_value) }: { my_prop?: string } = $props()
 ```

-Useful psql commands:
- `\d <table_name>` — full table definition with indexes and constraints
- `\di <table_name>*` — list indexes for a table
- `\d+ <table_name>` — extended table info including storage and descriptions
+**Correct alternatives:**

-This is also helpful for:
- Inspecting database state during development
- Testing queries before implementing them in Rust
- Debugging data-related issues
+1. **Use `$derived` with nullish coalescing** — handle the potential `undefined` at the usage site:
+
+   ```svelte
+   let { my_prop = $bindable() }: { my_prop?: string } = $props()
+   let effective_value = $derived(my_prop ?? default_value)
+   ```
+
+2. **Create a `useMyPropState()` helper** — encapsulate the undefined-handling logic in a reusable function and call it higher in the component tree, so the child component always receives a defined value.
+
+## Code Navigation
+
+`wm-ts-nav` is an AST-aware code navigator. Use **wm-ts-nav** for structural queries — it skips comments/strings and understands symbol boundaries.
+
+**MUST use `outline` before `Read`** on unfamiliar files — a 500-line file costs ~500 lines of context, while `outline` costs ~20. Then **MUST use `body "X"`** instead of reading a full file to see one function/struct. Use `Read` with offset/limit only when you need surrounding context that `body` doesn't capture.
+- `refs "X" --caller` instead of reading files to find which function contains each reference
+- `callers "X"` / `callees "X"` for call-graph questions
+
+EE files (`*_ee.rs`, `*_ee.ts`, `*_ee.svelte`) are indexed — you can `outline`, `def`, `body`, `refs` etc. on them just like regular files.
+
+```bash
+NAV="sh wm-ts-nav/nav"
+# Use --root backend for Rust, --root frontend/src for TS/Svelte
+$NAV --root backend outline backend/path/to/file.rs      # file structure
+$NAV --root backend def "ServiceName"                     # find definition
+$NAV --root backend body "decrypt_oauth_data"             # extract source code
+$NAV --root backend search "%" --parent ServiceName       # methods on a type
+$NAV --root backend search "Trigger" --kind struct        # find by kind
+$NAV --root backend refs "X" --file handler.rs --caller   # scoped refs with caller
+$NAV --root backend callers "X"                           # who calls X?
+$NAV --root backend callees "X"                           # what does X call?
+```
+
+**Limitations** — syntax-level analysis, no type inference. Use **Grep** instead when completeness matters (finding all usages, exhaustiveness checks):
+- `refs`/`callers`/`callees` can't follow re-exports, glob imports, or different import paths to the same symbol
+- Trait impls, macro-generated symbols (`sqlx::FromRow`), and namespace member access (`ns.X`) are invisible
+- `callees` shows all identifiers in a function body, not just actual calls
+
+## Core Principles
+
+- **MUST `outline` before `Read`** on unfamiliar files — then `body` or `Read` with offset/limit for specifics
+- Search for existing code to reuse before writing new code
+- Follow established patterns in the codebase
+- Keep changes focused — don't refactor beyond what's asked
--- a/14
+++ b/14
@@ -11,18 +11,10 @@
 {$BASE_URL} {
        bind {$ADDRESS}

-        # LSP - Language Server Protocol for code intelligence (windmill_extra:3001)
-        reverse_proxy /ws/* http://windmill_extra:3001
-
-        # Multiplayer - Real-time collaboration, Enterprise Edition (windmill_extra:3002)
-        # Uncomment and set ENABLE_MULTIPLAYER=true in docker-compose.yml
-        # reverse_proxy /ws_mp/* http://windmill_extra:3002
-
-        # Debugger - Interactive debugging via DAP WebSocket (windmill_extra:3003)
-        # Set ENABLE_DEBUGGER=true in docker-compose.yml to enable
-        handle_path /ws_debug/* {
-                reverse_proxy http://windmill_extra:3003
-        }
+        # Extra services: LSP, Multiplayer, Debugger (windmill_extra gateway)
+        # reverse_proxy takes a single matcher token, so the three paths share a named matcher.
+        @windmill_extra path /ws/* /ws_mp/* /ws_debug/*
+        reverse_proxy @windmill_extra http://windmill_extra:3000

        # Search indexer, Enterprise Edition (windmill_indexer:8002)
        # reverse_proxy /api/srch/* http://windmill_indexer:8002
--- a/44
+++ b/44
@@ -58,7 +58,7 @@ FROM node:24-alpine as frontend

 # install dependencies
 WORKDIR /frontend
-COPY ./frontend/package.json ./frontend/package-lock.json ./
+COPY ./frontend/package.json ./frontend/package-lock.json ./frontend/.npmrc ./
 COPY ./frontend/scripts/ ./scripts/
 RUN npm ci

@@ -118,6 +118,18 @@ RUN --mount=type=cache,target=/usr/local/cargo/registry \
    --mount=type=cache,target=$SCCACHE_DIR,sharing=locked \
    CARGO_NET_GIT_FETCH_WITH_CLI=true cargo build --release --features "$features"

+# Split debug info into a separate file, then strip the binary.
+# The .debug file can be extracted as a CI artifact for production debugging.
+# The debuglink allows gdb to auto-discover the debug file when placed next to the binary.
+RUN objcopy --only-keep-debug /windmill/target/release/windmill /windmill/target/release/windmill.debug \
+    && strip /windmill/target/release/windmill \
+    && objcopy --add-gnu-debuglink=/windmill/target/release/windmill.debug /windmill/target/release/windmill
+
+# Standalone stage for extracting the .debug file without including it in the final image.
+# Build with: docker build --target debuginfo --output type=local,dest=./out .
+FROM scratch AS debuginfo
+COPY --from=builder /windmill/target/release/windmill.debug /windmill.debug
+
 FROM ${DEBIAN_IMAGE}

 ARG TARGETPLATFORM
@@ -126,7 +138,7 @@ ARG POWERSHELL_DEB_VERSION=7.5.0-1
 ARG KUBECTL_VERSION=1.28.7
 ARG HELM_VERSION=3.14.3
 # NOTE: If changing, also change go version in workspace dependencies template at WorkspaceDependenciesEditor.svelte
-ARG GO_VERSION=1.25.0
+ARG GO_VERSION=1.26.0
 ARG APP=/usr/src/app
 ARG WITH_POWERSHELL=true
 ARG WITH_KUBECTL=true
@@ -150,11 +162,19 @@ ENV PATH /usr/local/bin:/root/.local/bin:/tmp/.local/bin:$PATH


 RUN apt-get update \
-    && apt-get install -y --no-install-recommends netbase tzdata ca-certificates wget curl jq unzip build-essential unixodbc xmlsec1 software-properties-common tini \
+    && apt-get install -y --no-install-recommends netbase tzdata ca-certificates wget curl jq unzip build-essential unixodbc xmlsec1 software-properties-common tini gnupg lsb-release \
    && if echo "$features" | grep -q "ee"; then apt-get install -y --no-install-recommends libsasl2-modules-gssapi-mit krb5-user; fi \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

+# Install latest PostgreSQL client (pg_dump) from official PostgreSQL apt repository
+RUN curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-archive-keyring.gpg \
+    && echo "deb [signed-by=/usr/share/keyrings/postgresql-archive-keyring.gpg] https://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends postgresql-client \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN if [ "$WITH_GIT" = "true" ]; then \
    apt-get update  -y \
    && apt-get install -y git \
@@ -256,13 +276,23 @@ COPY --from=windmill_duckdb_ffi_internal_builder /windmill-duckdb-ffi-internal/t

 COPY --from=denoland/deno:2.2.1 --chmod=755 /usr/bin/deno /usr/bin/deno

-COPY --from=oven/bun:1.3.8 /usr/local/bin/bun /usr/bin/bun
+COPY --from=oven/bun:1.3.10 /usr/local/bin/bun /usr/bin/bun

-COPY --from=php:8.3.7-cli /usr/local/bin/php /usr/bin/php
-COPY --from=composer:2.7.6 /usr/bin/composer /usr/bin/composer
+# Install windmill CLI
+RUN bun install -g windmill-cli \
+    && ln -s $(bun pm bin -g)/wmill /usr/bin/wmill
+
+# Install Claude Code CLI (used by claude sandbox scripts)
+# The installer puts the binary in ~/.local/bin/claude (symlink to ~/.local/share/claude/versions/*)
+# Copy it to /usr/bin/claude so it's accessible inside nsjail sandbox (which mounts /usr but not /root)
+RUN curl -fsSL https://claude.ai/install.sh | bash \
+    && cp /root/.local/share/claude/versions/* /usr/bin/claude
+
+COPY --from=php:8.3.30-cli-bookworm /usr/local/bin/php /usr/bin/php
+COPY --from=composer:2.9.5 /usr/bin/composer /usr/bin/composer

 # add the docker client to call docker from a worker if enabled
-COPY --from=docker:dind /usr/local/bin/docker /usr/local/bin/
+COPY --from=docker:29-dind /usr/local/bin/docker /usr/local/bin/

 ENV RUSTUP_HOME="/tmp/windmill/cache/rustup"
 ENV CARGO_HOME="/tmp/windmill/cache/cargo"
--- a/README.md
+++ b/README.md
@@ -257,6 +257,7 @@ On self-hosted instances, you might want to import all the approved resource typ
 | BASE_URL                            | http://localhost:8000            | The base url that is exposed publicly to access your instance. Is overriden by the instance settings if any.                                                                                       | Server                |
 | ZOMBIE_JOB_TIMEOUT                  | 30                               | The timeout after which a job is considered to be zombie if the worker did not send pings about processing the job (every server check for zombie jobs every 30s)                                  | Server                |
 | RESTART_ZOMBIE_JOBS                 | true                             | If true then a zombie job is restarted (in-place with the same uuid and some logs), if false the zombie job is failed                                                                              | Server                |
+| NATIVE_MODE                         | false                            | Enable native mode: sets NUM_WORKERS=8 and accepts only native jobs (nativets, postgresql, mysql, etc.); all other jobs are rejected                                                               | Worker                |
 | SLEEP_QUEUE                         | 50                               | The number of ms to sleep in between the last check for new jobs in the DB. It is multiplied by NUM_WORKERS such that in average, for one worker instance, there is one pull every SLEEP_QUEUE ms. | Worker                |
 | KEEP_JOB_DIR                        | false                            | Keep the job directory after the job is done. Useful for debugging.                                                                                                                                | Worker                |
 | LICENSE_KEY (EE only)               | None                             | License key checked at startup for the Enterprise Edition of Windmill                                                                                                                              | Worker                |
--- a/README_WORKMUX_DEV.md
+++ b/README_WORKMUX_DEV.md
@@ -0,0 +1,211 @@
+# Windmill Development with workmux
+
+This guide covers the workmux-based development setup for Windmill. Each worktree gets its own tmux window with a Claude Code agent, a backend server (with auto-reload), and a frontend dev server — all on isolated ports.
+
+## Prerequisites
+
+- tmux
+- Rust toolchain (rustup)
+- Node.js + npm
+- PostgreSQL running locally (see `backend/.env`)
+
+## Installation
+
+### 1. Install workmux
+
+```bash
+cargo install workmux
+```
+
+### 2. Install the Claude Code plugin
+
+```bash
+workmux claude install
+```
+
+This lets workmux manage Claude Code agents in worktree panes.
+
+### 3. Install cargo-watch
+
+Used for auto-recompiling the backend on file changes:
+
+```bash
+cargo install cargo-watch
+```
+
+### 4. Install llm CLI (required for auto branch naming)
+
+workmux uses the `llm` CLI to automatically generate branch names from prompts. Install it with:
+
+```bash
+uv tool install llm
+llm install llm-anthropic
+```
+
+Then set your Anthropic API key:
+
+```bash
+llm keys set anthropic
+# paste your API key when prompted
+```
+
+### 5. Recommended: shell alias and autocomplete
+
+Set up a `wm` alias for convenience:
+
+```bash
+# Add to your ~/.zshrc
+alias wm="workmux"
+```
+
+Setting up zsh autocomplete is also recommended — see the [workmux docs](https://github.com/rubenfiszel/workmux) for instructions.
+
+## Port Slot System
+
+Each worktree is assigned a **slot** that determines its ports:
+
+| Slot | Backend | Frontend |
+| ---- | ------- | -------- |
+| 0    | 8000    | 3000     |
+| 1    | 8010    | 3010     |
+| 2    | 8020    | 3020     |
+| 3    | 8030    | 3030     |
+| ...  | ...     | ...      |
+
+- **Slot 0** is reserved for the main worktree (default `cargo run` / `npm run dev`).
+- Without `WM_SLOT`, `scripts/worktree-env` (run by the `post_create` hook) auto-assigns the first available slot (starting from 1) and prints it.
+- With `WM_SLOT=N`, it uses that slot and errors if the ports are taken.
+
+## SSH Port Forwarding
+
+If you develop over SSH, add this to `~/.ssh/config` on your **local machine** to pre-configure tunnels for each slot:
+
+```
+Host windmill-dev
+  HostName <remote-ip>
+  User <username>
+  # Slot 0 (main worktree)
+  LocalForward 8000 localhost:8000
+  LocalForward 3000 localhost:3000
+  # Slot 1
+  LocalForward 8010 localhost:8010
+  LocalForward 3010 localhost:3010
+  # Slot 2
+  LocalForward 8020 localhost:8020
+  LocalForward 3020 localhost:3020
+  # Slot 3
+  LocalForward 8030 localhost:8030
+  LocalForward 3030 localhost:3030
+```
+
+Then connect once and all tunnels are active:
+
+```bash
+ssh windmill-dev
+```
+
+Access the frontend at `http://localhost:<frontend-port>` in your local browser.
+
+## Quickstart
+
+```bash
+# Create a new worktree (auto-assigns slot, prints ports)
+workmux add my-feature
+
+# Or with an explicit slot
+WM_SLOT=2 workmux add my-feature
+
+# Create a worktree and immediately send a prompt to the agent
+workmux add -A -p "fix the login bug in auth.rs"
+```
+
+The `add` command creates the worktree but does **not** open it. To open the tmux window and start working:
+
+```bash
+workmux open my-feature
+```
+
+This will open a tmux window with three panes:
+
+- **Claude Code agent** (focused)
+- **Backend**: `cargo watch -x run` on the assigned port (auto-reloads on save)
+- **Frontend**: `npm run dev` proxying to the backend
+
+When using `-A` with `add`, the worktree is created and opened automatically, and the prompt is sent to the agent right away.
+
+Check which ports were assigned:
+
+```bash
+cat <worktree-path>/.env.local
+```
+
+### Sending work to the agent
+
+```bash
+# Send a prompt to the agent in a worktree
+workmux send my-feature "fix the login bug in auth.rs"
+
+# Check agent status
+workmux status
+```
+
+### Merging and cleaning up
+
+We never merge worktrees directly — always create a PR on GitHub and let it be merged there. Once the PR is merged, clean up the worktree:
+
+```bash
+# Close the tmux window but keep the worktree
+workmux close my-feature
+
+# After your PR is merged, remove the worktree, branch, and tmux window
+workmux rm my-feature
+```
+
+> **Note**: Do not use `workmux merge`. Always go through a PR to get your changes into main. You can ask the Claude Code agent in the worktree to create the PR for you.
+
+## Configuration
+
+The setup is defined in `.workmux.yaml` at the repo root. Key sections:
+
+- **`post_create`**: Runs `scripts/worktree-env` to generate `.env.local` with port assignments
+- **`panes`**: Defines the tmux layout (agent, backend, frontend)
+- **`files.copy`**: Copies `backend/.env` and `scripts/` into each worktree
+
+The `post_create` hook also copies `frontend/node_modules` using `cp -a` (preserves `.bin/` symlinks that `cp -r` would dereference).
+
+## Enterprise (EE) Code Access
+
+The enterprise source code lives in the `windmill-ee-private` repository (sibling to this repo). When you create a worktree, `scripts/worktree-env` automatically creates a matching EE worktree on the same branch and configures Claude Code's `additionalDirectories` to grant access.
+
+### Sandbox setup
+
+When using sandbox mode, the container needs explicit mounts to access the EE repo. Add the following to your global workmux config (`~/.config/workmux/config.yaml`):
+
+```yaml
+sandbox:
+  extra_mounts:
+    - host_path: ~/windmill-ee-private
+      writable: true
+    - host_path: ~/windmill-ee-private__worktrees
+      writable: true
+```
+
+This mounts both the main EE repo (used by the main worktree) and the EE worktrees directory (used by feature worktrees) into every sandbox container.
+
+## Cargo Features
+
+To build the backend with specific Cargo features (e.g., `enterprise`, `parquet`), pass them via `CARGO_FEATURES`. The backend pane reads this from `.env.local` and appends `--features <value>` to the `cargo watch` command.
+
+**With `wm` (workmux):**
+
+Set `CARGO_FEATURES` as an environment variable before creating the worktree:
+
+```bash
+CARGO_FEATURES="enterprise,parquet" wm add my-feature
+```
+
+This gets written to `.env.local` by the `post_create` hook (`scripts/worktree-env`), and the backend pane picks it up automatically.
+
+## Login
+
+Default credentials: `admin@windmill.dev` / `changeme`
--- a/ai_evals/.gitignore
+++ b/ai_evals/.gitignore
@@ -0,0 +1,2 @@
+.env
+results/
--- a/ai_evals/AGENTS.md
+++ b/ai_evals/AGENTS.md
@@ -0,0 +1,172 @@
+# AI Evals Authoring Guide
+
+This folder contains black-box benchmark cases for:
+
+- `flow`
+- `app`
+- `script`
+- `cli`
+
+The goal is to test the current production prompts and guidance with realistic user requests, not to test one exact implementation shape.
+
+## Core rules
+
+1. Write prompts like a real user request.
+2. Prefer behavior, inputs, constraints, and outcomes over internal implementation details.
+3. Keep deterministic validation narrow and hard.
+4. Put semantic expectations in `judgeChecklist`.
+5. Use `expected` fixtures only when exact structure really matters.
+
+## Prompt writing
+
+Prompts should sound like something a user would naturally ask.
+
+Good:
+
+- "Create a flow that routes support requests based on customer tier."
+- "Add a reset button that sets the counter back to 0."
+- "Create a flow that reuses the existing greeting script instead of duplicating the logic."
+
+Bad:
+
+- "Use `branchone` with 3 branches and a default branch."
+- "Create a `rawscript` step with this exact topology."
+- "This is a benchmark harness."
+
+Do not write prompts as if the user knows Windmill internals unless the case is explicitly testing a power-user workflow.
+
+## Flow-specific rules
+
+The main principle for flow cases:
+
+- flow prompts should read like requests from a user who does not know the product internals
+- the user should ask for behavior, not for `branchone`, `branchall`, `rawscript`, `preprocessor_module`, `failure_module`, exact graph topology, or other internal constructs
+
+That means:
+
+- creation cases should describe the business behavior and expected result
+- modification cases may mention existing step names, because the user can see the current flow
+- only mention special Windmill constructs when the case is explicitly about those constructs
+
+Examples:
+
+- acceptable creation prompt:
+  "Create a purchase approval flow that pauses for approval and asks the approver for a comment."
+- avoid:
+  "Create a suspend step with one required event and a resume form."
+
+For flow cases, do not fail a case just because the model chose a different valid topology.
+
+## App-specific rules
+
+App prompts should focus on user-visible behavior:
+
+- what the UI should let the user do
+- what should persist
+- what backend behavior is needed
+
+Avoid prompting in terms of React structure, component names, or implementation unless the case is specifically about editing an existing app.
+
+## CLI-specific rules
+
+CLI prompts can be more explicit about paths and file names because real CLI users often do specify them.
+
+Still, avoid benchmark phrasing. The prompt should read like a repo task, not a harness instruction.
+
+When relevant, ask the assistant to tell the user which `wmill` commands to run next. That is part of the benchmarked behavior.
+
+## Deterministic validation
+
+Use deterministic validation only for hard failures such as:
+
+- missing required files
+- unexpected extra files when the prompt says not to create them
+- syntax errors
+- unresolved flow refs
+- missing required special modules or suspend config
+- obvious artifact corruption
+
+Do not use deterministic validation to enforce one preferred implementation for broad creation tasks.
+
+Examples of bad hard checks:
+
+- exact step topology for a creation flow
+- exact branch structure when the prompt only asked for routing behavior
+- exact input shape when multiple reasonable shapes are acceptable
+
+## Judge checklist
+
+Every non-trivial case should have a `judgeChecklist`.
+
+The checklist should capture:
+
+- the user-visible behavior that must be present
+- important constraints
+- key completion criteria
+
+The checklist should not duplicate low-level implementation details unless they are truly required by the task.
+
+Good checklist items:
+
+- "the flow calculates the order total with 8% tax"
+- "the app persists recipes appropriately for a raw Windmill app"
+- "the flow reuses the existing workspace script instead of rewriting the logic"
+
+Bad checklist items:
+
+- "uses `branchone`"
+- "contains a `rawscript` node"
+
+## When to use `expected`
+
+Use `expected` fixtures when the case is structure-sensitive, for example:
+
+- exact file creation
+- exact script content
+- modification cases where a specific file must change in a specific way
+- cases where preserving an existing structure is part of the requirement
+
+Do not use a full `expected` artifact as the semantic oracle for broad creation tasks when multiple valid outputs should pass.
+
+## When to use `initial`
+
+Use `initial` when the benchmark is about:
+
+- editing an existing artifact
+- reusing existing workspace assets
+- preserving existing behavior while adding a change
+
+If the case is greenfield, prefer no `initial`.
+
+## Case design ladder
+
+Prefer suites that get gradually harder:
+
+1. trivial create case
+2. realistic create case
+3. reuse-existing-assets case
+4. modification case
+5. refactor case
+6. edge-case or niche product behavior
+
+The last cases in a suite should cover unusual or product-specific behavior.
+
+## Anti-patterns
+
+Avoid these:
+
+- benchmark framing in prompts
+- over-specified internal topology for creation tasks
+- judge checklists that just restate implementation details
+- deterministic validation that encodes one preferred solution
+- fixtures that are so minimal or brittle that they create false negatives
+
+## Before adding a case
+
+Ask:
+
+1. Would a real user plausibly write this prompt?
+2. If the model solves it in a different valid way, would the case still pass?
+3. Are the hard deterministic checks only catching objectively broken output?
+4. Does the `judgeChecklist` describe the real success criteria?
+5. If this case fails, will the reason be understandable from the saved artifacts?
--- a/ai_evals/CLAUDE.md
+++ b/ai_evals/CLAUDE.md
@@ -0,0 +1 @@
+@AGENTS.md
--- a/ai_evals/README.md
+++ b/ai_evals/README.md
@@ -0,0 +1,197 @@
+# AI Evals
+
+Small benchmark runner for the four Windmill AI generation modes:
+
+- `cli`
+- `flow`
+- `script`
+- `app`
+
+The benchmark always tests the current production prompts, tools, and guidance in this checkout.
+
+Each attempt runs:
+
+1. the real production path
+2. deterministic validation
+3. LLM judging
+
+## Install
+
+```bash
+cd ai_evals
+bun install
+```
+
+Frontend modes also require frontend dependencies:
+
+```bash
+cd frontend
+bun install
+```
+
+## Commands
+
+List model aliases:
+
+```bash
+cd ai_evals
+bun run cli -- models
+```
+
+List cases:
+
+```bash
+cd ai_evals
+bun run cli -- cases
+bun run cli -- cases flow
+```
+
+Run benchmarks:
+
+```bash
+cd ai_evals
+bun run cli -- run flow
+bun run cli -- run flow flow-test4-order-processing-loop --model opus
+bun run cli -- run flow flow-test0-sum-two-numbers --models haiku,opus,4o
+bun run cli -- run flow flow-test0-sum-two-numbers --runs 3 --verbose
+bun run cli -- run flow --record
+WMILL_AI_EVAL_BACKEND_URL=http://127.0.0.1:8000 bun run cli -- run flow --backend-validation preview
+bun run cli -- run cli bun-hello-script
+```
+
+Public CLI surface:
+
+- `models`
+- `cases [mode]`
+- `run <mode> [caseIds...]`
+
+`run` options:
+
+- `--runs <n>`: repeat each case `n` times
+- `--output <path>`: custom result JSON path
+- `--model <alias>`: choose the model under test
+- `--models <a,b,c>`: run the same cases sequentially against several model aliases
+- `--verbose`: stream assistant output for frontend runs
+- `--record`: append a compact tracked summary line to `ai_evals/history/<mode>.jsonl` for full-suite runs only
+- `--backend-validation <mode>`: optional backend smoke validation (`off` or `preview`) for `script` and `flow` evals
+
+## Models
+
+Use `bun run cli -- models` to see the current aliases.
+
+Today:
+
+- `haiku`
+- `sonnet`
+- `opus`
+- `4o`
+- `gemini-flash`
+- `gemini-pro`
+- `gemini-3-flash-preview`
+- `gemini-3.1-pro-preview`
+
+Notes:
+
+- the command also prints accepted alias spellings such as `gpt-4o`, `claude-opus-4.6`, and `claude-haiku-4.5`
+- frontend modes (`flow`, `script`, `app`) can use Anthropic, OpenAI, and Gemini-backed aliases
+- `cli` mode always uses the Anthropic agent SDK, so only Anthropic aliases are valid there
+- the judge model is separate and currently defaults to `claude-sonnet-4-6`
+
+## Case Format
+
+Cases live in one YAML file per mode under `ai_evals/cases/`.
+
+Typical shape (`initial` and `expected` are optional):
+
+```yaml
+- id: flow-test0-sum-two-numbers
+  prompt: |-
+    Create a flow that takes two numbers, `a` and `b`, and returns their sum.
+  initial: ai_evals/fixtures/...
+  expected: ai_evals/fixtures/...
+```
+
+Optional fields:
+
+- `initial`: starting state fixture
+- `expected`: expected artifact fixture
+- `validate`: extra deterministic validation rules
+- `runtime.backendPreview`: optional real backend preview config for smoke validation
+
+For `flow` mode, `validate` can express requirements such as:
+
+- accepted input schema shapes
+- required `results.*` reference validity
+- required module/code/input characteristics
+
+For `flow` mode, an `initial` fixture can also include a benchmark workspace catalog of
+existing scripts and flows. That lets the real `search_workspace` and
+`get_runnable_details` tools discover reusable workspace runnables during evals.
+
+If `--backend-validation preview` is enabled:
+
+- `script` evals run a real backend script preview in an isolated temp workspace
+- `flow` evals run a real backend flow preview only for cases that define `runtime.backendPreview`
+- `flow` cases with `initial.workspace` fixtures seed those scripts and flows into the preview workspace before preview
+- when `WMILL_AI_EVAL_BACKEND_WORKSPACE` is set, `ai_evals` treats that workspace as a dedicated test workspace, clears managed eval assets under `f/evals/*` before each preview run, and then reseeds the current case fixtures
+
+Supported backend validation env vars:
+
+- `WMILL_AI_EVAL_BACKEND_VALIDATION=preview`
+- `WMILL_AI_EVAL_BACKEND_URL=http://127.0.0.1:8000`
+- `WMILL_AI_EVAL_BACKEND_EMAIL=admin@windmill.dev`
+- `WMILL_AI_EVAL_BACKEND_PASSWORD=changeme`
+- `WMILL_AI_EVAL_BACKEND_WORKSPACE=integration-tests` to reuse an existing workspace on CE installs with low workspace limits
+- `WMILL_AI_EVAL_KEEP_WORKSPACES=1`
+- `WMILL_AI_EVAL_WORKSPACE_PREFIX=ai-evals`
+
+## Results And Artifacts
+
+Every run writes:
+
+- a summary JSON under `ai_evals/results/`
+- generated artifacts in a sibling directory
+
+If `--record` is used, the CLI also appends one compact JSON line to:
+
+- `ai_evals/history/flow.jsonl`
+- `ai_evals/history/script.jsonl`
+- `ai_evals/history/app.jsonl`
+- `ai_evals/history/cli.jsonl`
+
+Each recorded line contains:
+
+- run metadata (`createdAt`, `gitSha`, `mode`, `runModel`, `judgeModel`)
+- suite totals (`caseCount`, `attemptCount`, `passedAttempts`, `passRate`, `averageDurationMs`, `averageJudgeScore`)
+- average token usage (`averageTokenUsagePerAttempt`)
+- per-case metrics under `cases[]` (`averageDurationMs`, `averageJudgeScore`, `averageTokenUsagePerAttempt`, pass rate)
+- `failedCaseIds`
+
+Example:
+
+- summary: `ai_evals/results/2026-04-09T09-40-33.051Z__flow.json`
+- artifacts: `ai_evals/results/2026-04-09T09-40-33.051Z__flow/`
+
+Typical artifacts by mode:
+
+- `flow`: `flow.json`
+- `script`: `script.json` plus the generated script file
+- `app`: `app.json` plus frontend/backend files
+- `cli`: `assistant-output.txt` plus generated workspace files
+- backend-validated attempts also include `backend-preview.json`
+
+## Layout
+
+- `cases/`: one YAML file per mode
+- `fixtures/`: initial and expected fixtures
+- `core/`: shared loading, model resolution, validation, judging, and result writing
+- `modes/`: one runner per mode
+- `history/`: optional tracked pass-rate history written by `run --record`, one JSONL file per mode
+- `results/`: local benchmark output and artifacts
+
+## Notes
+
+- Frontend modes reuse the production frontend chat code through the Vitest bridge.
+- CLI mode creates an isolated workspace, writes the current checkout guidance into it, and benchmarks the real skills / `AGENTS.md` flow.
+- Frontend progress streams live while the benchmark is running.
+- Deterministic validators should stay focused on real correctness constraints, not one exact implementation shape.
--- a/ai_evals/adapters/cli/runtime.test.ts
+++ b/ai_evals/adapters/cli/runtime.test.ts
@@ -0,0 +1,72 @@
+import { describe, expect, it } from "bun:test";
+import {
+  anthropicUsageToBenchmarkTokenUsage,
+  extractCliResultTokenUsage,
+} from "./runtime";
+
+describe("anthropicUsageToBenchmarkTokenUsage", () => {
+  it("includes cache tokens in prompt usage", () => {
+    expect(
+      anthropicUsageToBenchmarkTokenUsage({
+        input_tokens: 120,
+        output_tokens: 45,
+        cache_creation_input_tokens: 30,
+        cache_read_input_tokens: 5,
+      })
+    ).toEqual({
+      prompt: 155,
+      completion: 45,
+      total: 200,
+    });
+  });
+
+  it("returns null when usage is absent", () => {
+    expect(anthropicUsageToBenchmarkTokenUsage(null)).toBeNull();
+  });
+});
+
+describe("extractCliResultTokenUsage", () => {
+  it("reads aggregate usage from the SDK result event", () => {
+    expect(
+      extractCliResultTokenUsage({
+        type: "result",
+        usage: {
+          input_tokens: 400,
+          output_tokens: 120,
+          cache_creation_input_tokens: 50,
+          cache_read_input_tokens: 25,
+        },
+      })
+    ).toEqual({
+      prompt: 475,
+      completion: 120,
+      total: 595,
+    });
+  });
+
+  it("falls back to modelUsage when aggregate usage is unavailable", () => {
+    expect(
+      extractCliResultTokenUsage({
+        type: "result",
+        modelUsage: {
+          opus: {
+            inputTokens: 200,
+            outputTokens: 60,
+            cacheCreationInputTokens: 10,
+            cacheReadInputTokens: 5,
+          },
+          haiku: {
+            inputTokens: 80,
+            outputTokens: 20,
+            cacheCreationInputTokens: 0,
+            cacheReadInputTokens: 15,
+          },
+        },
+      })
+    ).toEqual({
+      prompt: 310,
+      completion: 80,
+      total: 390,
+    });
+  });
+});
--- a/ai_evals/adapters/cli/runtime.ts
+++ b/ai_evals/adapters/cli/runtime.ts
@@ -0,0 +1,199 @@
+import { query, type Options } from "@anthropic-ai/claude-agent-sdk";
+import { join } from "path";
+import { fileURLToPath } from "url";
+import { getCliEvalModel, resolveEvalModel, type CliEvalModelConfig } from "../../core/models";
+import type { BenchmarkTokenUsage } from "../../core/types";
+
+export interface ToolInvocation {
+  tool: string;
+  input: Record<string, unknown>;
+  timestamp: number;
+}
+
+export interface PromptRunResult {
+  toolsUsed: ToolInvocation[];
+  skillsInvoked: string[];
+  output: string;
+  durationMs: number;
+  assistantMessageCount: number;
+  tokenUsage: BenchmarkTokenUsage | null;
+}
+
+interface AnthropicUsageLike {
+  input_tokens?: number | null;
+  output_tokens?: number | null;
+  cache_creation_input_tokens?: number | null;
+  cache_read_input_tokens?: number | null;
+}
+
+interface AnthropicModelUsageLike {
+  inputTokens?: number | null;
+  outputTokens?: number | null;
+  cacheCreationInputTokens?: number | null;
+  cacheReadInputTokens?: number | null;
+}
+
+interface CliResultMessageLike {
+  type?: string;
+  usage?: AnthropicUsageLike | null;
+  modelUsage?: Record<string, AnthropicModelUsageLike> | null;
+}
+
+const REPO_ROOT = fileURLToPath(new URL("../../../", import.meta.url));
+export const DEFAULT_CLI_EVAL_MODEL: CliEvalModelConfig = getCliEvalModel(resolveEvalModel("cli"));
+
+export function getGeneratedSkillsSource(): string {
+  return join(REPO_ROOT, "system_prompts", "auto-generated", "skills");
+}
+
+export function anthropicUsageToBenchmarkTokenUsage(
+  usage: AnthropicUsageLike | null | undefined
+): BenchmarkTokenUsage | null {
+  if (!usage) {
+    return null;
+  }
+
+  const prompt =
+    (usage.input_tokens ?? 0) +
+    (usage.cache_creation_input_tokens ?? 0) +
+    (usage.cache_read_input_tokens ?? 0);
+  const completion = usage.output_tokens ?? 0;
+
+  return {
+    prompt,
+    completion,
+    total: prompt + completion,
+  };
+}
+
+export function extractCliResultTokenUsage(message: unknown): BenchmarkTokenUsage | null {
+  if (!message || typeof message !== "object") {
+    return null;
+  }
+
+  const resultMessage = message as CliResultMessageLike;
+  if (resultMessage.type !== "result") {
+    return null;
+  }
+
+  const usage = anthropicUsageToBenchmarkTokenUsage(resultMessage.usage);
+  if (usage) {
+    return usage;
+  }
+
+  if (!resultMessage.modelUsage || typeof resultMessage.modelUsage !== "object") {
+    return null;
+  }
+
+  let prompt = 0;
+  let completion = 0;
+  let sawModelUsage = false;
+
+  for (const modelUsage of Object.values(resultMessage.modelUsage)) {
+    if (!modelUsage || typeof modelUsage !== "object") {
+      continue;
+    }
+
+    prompt +=
+      (modelUsage.inputTokens ?? 0) +
+      (modelUsage.cacheCreationInputTokens ?? 0) +
+      (modelUsage.cacheReadInputTokens ?? 0);
+    completion += modelUsage.outputTokens ?? 0;
+    sawModelUsage = true;
+  }
+
+  if (!sawModelUsage) {
+    return null;
+  }
+
+  return {
+    prompt,
+    completion,
+    total: prompt + completion,
+  };
+}
+
+export async function runPromptAndCapture(
+  prompt: string,
+  cwd: string,
+  maxTurns: number = 3,
+  modelConfig: CliEvalModelConfig = DEFAULT_CLI_EVAL_MODEL
+): Promise<PromptRunResult> {
+  const toolsUsed: ToolInvocation[] = [];
+  const skillsInvoked: string[] = [];
+  let output = "";
+  let assistantMessageCount = 0;
+  let tokenUsage: BenchmarkTokenUsage | null = null;
+  const startedAt = Date.now();
+
+  const options: Options = {
+    cwd,
+    model: modelConfig.model,
+    maxTurns,
+    settingSources: ["project"],
+    allowedTools: ["Skill", "Read", "Glob", "Grep", "Bash", "Write", "Edit"]
+  };
+
+  for await (const message of query({ prompt, options })) {
+    if (message.type === "assistant") {
+      assistantMessageCount += 1;
+      const content = message.message?.content;
+      if (Array.isArray(content)) {
+        for (const block of content) {
+          if (block.type === "tool_use") {
+            toolsUsed.push({
+              tool: block.name,
+              input: block.input as Record<string, unknown>,
+              timestamp: Date.now()
+            });
+
+            if (block.name === "Skill" && typeof block.input === "object" && block.input !== null) {
+              const skillInput = block.input as { skill?: string };
+              if (skillInput.skill) {
+                skillsInvoked.push(skillInput.skill);
+              }
+            }
+          } else if (block.type === "text") {
+            output += block.text;
+          }
+        }
+      }
+    } else if (message.type === "result") {
+      const resultMessage = message as { result?: string };
+      tokenUsage = extractCliResultTokenUsage(message) ?? tokenUsage;
+      if (typeof resultMessage.result === "string") {
+        output += resultMessage.result;
+      }
+    }
+  }
+
+  return {
+    toolsUsed,
+    skillsInvoked,
+    output,
+    durationMs: Date.now() - startedAt,
+    assistantMessageCount,
+    tokenUsage,
+  };
+}
+
+export function wasSkillInvoked(result: PromptRunResult, skillName: string): boolean {
+  return result.skillsInvoked.some((skill) => skill === skillName || skill.includes(skillName));
+}
+
+export function wasToolUsed(result: PromptRunResult, toolName: string): boolean {
+  return result.toolsUsed.some((tool) => tool.tool === toolName);
+}
+
+export function formatCliRunModelLabel(modelConfig: CliEvalModelConfig): string {
+  return `${modelConfig.provider}:${modelConfig.model}`;
+}
+
+export function getToolInputs(
+  result: PromptRunResult,
+  toolName: string
+): Record<string, unknown>[] {
+  return result.toolsUsed
+    .filter((tool) => tool.tool === toolName)
+    .map((tool) => tool.input);
+}
--- a/ai_evals/adapters/frontend/backendPreview.test.ts
+++ b/ai_evals/adapters/frontend/backendPreview.test.ts
@@ -0,0 +1,246 @@
+import { afterEach, describe, expect, it } from 'bun:test'
+import type { BackendValidationSettings } from '../../core/backendValidation'
+import { BackendPreviewClient } from './backendPreview'
+
+const ORIGINAL_FETCH = globalThis.fetch
+
+afterEach(() => { // Tests overwrite globalThis.fetch; put the original back after each one.
+	globalThis.fetch = ORIGINAL_FETCH
+})
+
+describe('BackendPreviewClient', () => { // Each test swaps globalThis.fetch for a stub and checks the requests the client sends.
+	it('updates an existing seeded script on path conflict and waits for deployment', async () => {
+		const requests: Array<{ url: string; init?: RequestInit }> = []
+		globalThis.fetch = mockFetch( // queued responses are consumed in request order (see the URL list asserted below)
+			requests,
+			textResponse(200, 'token'), // login
+			textResponse(200, ''), // folders/create
+			textResponse(400, 'Path conflict for f/evals/add_two_numbers with non-archived hash 123'), // first scripts/create: conflict
+			jsonResponse(200, { hash: '123' }), // scripts/get: hash of the existing script
+			textResponse(200, '456'), // second scripts/create: body is the new hash
+			jsonResponse(200, { lock: 'script.lock', lock_error_logs: null }) // deployment_status: lock present
+		)
+
+		const client = new BackendPreviewClient(
+			buildSettings({ baseUrl: 'http://backend.test/script-upsert' })
+		)
+
+		await client.createScript({
+			workspaceId: 'test',
+			path: 'f/evals/add_two_numbers',
+			summary: 'Add two numbers',
+			content: 'export async function main(a: number, b: number) { return a + b }',
+			language: 'bun'
+		})
+
+		expect(requests.map((entry) => entry.url)).toEqual([
+			'http://backend.test/script-upsert/api/auth/login',
+			'http://backend.test/script-upsert/api/w/test/folders/create',
+			'http://backend.test/script-upsert/api/w/test/scripts/create',
+			'http://backend.test/script-upsert/api/w/test/scripts/get/p/f/evals/add_two_numbers',
+			'http://backend.test/script-upsert/api/w/test/scripts/create',
+			'http://backend.test/script-upsert/api/w/test/scripts/deployment_status/h/456'
+		])
+
+		const updateRequest = requests[4] // the second scripts/create call
+		expect(updateRequest.init?.method).toBe('POST')
+		expect(JSON.parse(String(updateRequest.init?.body))).toMatchObject({
+			path: 'f/evals/add_two_numbers',
+			parent_hash: '123', // must reference the hash returned by scripts/get
+			language: 'bun'
+		})
+	})
+
+	it('updates an existing seeded flow on create conflict', async () => {
+		const requests: Array<{ url: string; init?: RequestInit }> = []
+		globalThis.fetch = mockFetch(
+			requests,
+			textResponse(200, 'token'), // login
+			textResponse(200, ''), // folders/create
+			textResponse(400, 'Flow f/evals/add_numbers_flow already exists'), // flows/create: conflict
+			textResponse(200, '') // flows/update
+		)
+
+		const client = new BackendPreviewClient(
+			buildSettings({ baseUrl: 'http://backend.test/flow-upsert' })
+		)
+
+		await client.createFlow({
+			workspaceId: 'test',
+			path: 'f/evals/add_numbers_flow',
+			summary: 'Add numbers',
+			value: { modules: [] }
+		})
+
+		expect(requests.map((entry) => entry.url)).toEqual([
+			'http://backend.test/flow-upsert/api/auth/login',
+			'http://backend.test/flow-upsert/api/w/test/folders/create',
+			'http://backend.test/flow-upsert/api/w/test/flows/create',
+			'http://backend.test/flow-upsert/api/w/test/flows/update/f/evals/add_numbers_flow'
+		])
+
+		const updateRequest = requests[3] // the flows/update call
+		expect(updateRequest.init?.method).toBe('POST')
+		expect(JSON.parse(String(updateRequest.init?.body))).toMatchObject({
+			path: 'f/evals/add_numbers_flow',
+			value: { modules: [] }
+		})
+	})
+
+	it('serializes shared-workspace validations inside the overridden workspace', async () => {
+		globalThis.fetch = async (input) => { // routes by URL suffix instead of a queue, since call order across the two runs is not fixed here
+			const url = String(input)
+			if (url.endsWith('/api/auth/login')) {
+				return textResponse(200, 'token')
+			}
+			if (url.endsWith('/api/workspaces/exists')) {
+				return textResponse(200, 'true')
+			}
+			if (url.endsWith('/api/w/shared-preview/flows/list_paths')) {
+				return jsonResponse(200, []) // nothing to clean up
+			}
+			if (url.endsWith('/api/w/shared-preview/scripts/list_paths')) {
+				return jsonResponse(200, []) // nothing to clean up
+			}
+			throw new Error(`Unexpected fetch: ${url}`)
+		}
+
+		const client = new BackendPreviewClient(
+			buildSettings({
+				baseUrl: 'http://backend.test/shared-lock',
+				workspaceOverride: 'shared-preview'
+			})
+		)
+
+		const order: string[] = [] // records start/end events of both bodies
+		let releaseFirst: (() => void) | undefined
+		let notifyFirstStart: (() => void) | undefined
+		const firstStarted = new Promise<void>((resolve) => {
+			notifyFirstStart = resolve
+		})
+
+		const first = client.withWorkspace('flow-test1', 1, async () => {
+			order.push('first:start')
+			notifyFirstStart?.()
+			await new Promise<void>((resolve) => { // hold the workspace until the test calls releaseFirst
+				releaseFirst = resolve
+			})
+			order.push('first:end')
+		})
+
+		const second = client.withWorkspace('flow-test2', 1, async () => {
+			order.push('second:start')
+			order.push('second:end')
+		})
+
+		await firstStarted
+		expect(order).toEqual(['first:start']) // second has not started while first is still running
+
+		releaseFirst?.()
+		await Promise.all([first, second])
+
+		expect(order).toEqual(['first:start', 'first:end', 'second:start', 'second:end'])
+	})
+
+	it('clears managed shared-workspace assets before preview runs', async () => {
+		const requests: Array<{ url: string; init?: RequestInit }> = []
+		globalThis.fetch = mockFetch(
+			requests,
+			textResponse(200, 'token'), // login
+			textResponse(200, 'true'), // workspaces/exists
+			jsonResponse(200, ['f/evals/old_subflow', 'u/admin/keep_flow']), // flows/list_paths: only the f/evals/ entry is expected to be deleted
+			textResponse(200, ''), // flows/delete
+			jsonResponse(200, ['f/evals/old_script', 'f/shared/keep_script']), // scripts/list_paths: only the f/evals/ entry is expected to be deleted
+			textResponse(200, '') // scripts/delete
+		)
+
+		const client = new BackendPreviewClient(
+			buildSettings({
+				baseUrl: 'http://backend.test/shared-cleanup',
+				workspaceOverride: 'shared-preview'
+			})
+		)
+
+		await client.withWorkspace('flow-test1', 1, async () => undefined)
+
+		expect(requests.map((entry) => entry.url)).toEqual([
+			'http://backend.test/shared-cleanup/api/auth/login',
+			'http://backend.test/shared-cleanup/api/workspaces/exists',
+			'http://backend.test/shared-cleanup/api/w/shared-preview/flows/list_paths',
+			'http://backend.test/shared-cleanup/api/w/shared-preview/flows/delete/f/evals/old_subflow',
+			'http://backend.test/shared-cleanup/api/w/shared-preview/scripts/list_paths',
+			'http://backend.test/shared-cleanup/api/w/shared-preview/scripts/delete/p/f/evals/old_script'
+		])
+	})
+
+	it('retries login after a cached login failure', async () => {
+		const requests: Array<{ url: string; init?: RequestInit }> = []
+		globalThis.fetch = mockFetch(
+			requests,
+			textResponse(503, 'backend starting'), // first login attempt fails
+			textResponse(200, 'token'), // second login attempt succeeds
+			textResponse(200, 'true'), // workspaces/exists
+			jsonResponse(200, []), // flows/list_paths
+			jsonResponse(200, []) // scripts/list_paths
+		)
+
+		const client = new BackendPreviewClient(
+			buildSettings({
+				baseUrl: 'http://backend.test/login-retry',
+				workspaceOverride: 'shared-preview'
+			})
+		)
+
+		await expect(client.withWorkspace('flow-test1', 1, async () => undefined)).rejects.toThrow(
+			'login for backend validation failed'
+		)
+		await expect(client.withWorkspace('flow-test1', 1, async () => 'ok')).resolves.toBe('ok') // must log in again rather than reuse the rejected promise
+
+		expect(
+			requests.filter((entry) => entry.url === 'http://backend.test/login-retry/api/auth/login')
+		).toHaveLength(2)
+	})
+})
+
+function buildSettings( // Default preview settings for these tests; overrides are spread last so they win.
+	overrides: Partial<BackendValidationSettings> = {}
+): BackendValidationSettings {
+	return {
+		mode: 'preview',
+		baseUrl: 'http://backend.test/default',
+		email: 'admin@windmill.dev',
+		password: 'changeme',
+		keepWorkspaces: true,
+		workspacePrefix: 'ai-evals',
+		pollIntervalMs: 1, // short poll interval and wait budget keep the tests fast
+		maxWaitMs: 50,
+		...overrides
+	}
+}
+
+function mockFetch( // Fetch stub: records every call in `requests` and answers with the queued responses in order.
+	requests: Array<{ url: string; init?: RequestInit }>,
+	...responses: Response[]
+): typeof fetch {
+	const queue = [...responses] // copy so the caller's array is not consumed
+	return async (input, init) => {
+		const url = String(input)
+		requests.push({ url, init }) // recorded before the queue check, so an unexpected call is still visible
+		const next = queue.shift()
+		if (!next) {
+			throw new Error(`Unexpected fetch: ${url}`) // more calls than queued responses
+		}
+		return next
+	}
+}
+
+function jsonResponse(status: number, body: unknown): Response { // Response with a JSON-serialized body and a JSON content type.
+	return new Response(JSON.stringify(body), {
+		status,
+		headers: { 'Content-Type': 'application/json' }
+	})
+}
+
+function textResponse(status: number, body: string): Response { // Response with the given status and a raw string body.
+	return new Response(body, { status })
+}
--- a/ai_evals/adapters/frontend/backendPreview.ts
+++ b/ai_evals/adapters/frontend/backendPreview.ts
@@ -0,0 +1,502 @@
+import { randomUUID } from 'node:crypto'
+import type { BackendValidationSettings } from '../../core/backendValidation'
+
+interface CompletedJobResultMaybe { // Shape the client reads from the get_result_maybe polling response.
+	completed: boolean // polling stops once this is true
+	result: unknown
+	success?: boolean // a missing value is coerced to false by the client
+	started?: boolean
+}
+
+interface ScriptDeploymentStatus { // Shape the client reads from the deployment_status response.
+	lock?: unknown // any non-null value is treated as "deployed"
+	lock_error_logs?: string | null // a truthy value (with no lock) is treated as a failed deployment
+}
+
+export interface CompletedPreviewJob { // Outcome of a preview job once polling has seen it complete.
+	id: string
+	success: boolean
+	result: unknown // taken from the polling response, not from the completed job record
+	logs?: string | null // from the completed job record; null when that field is neither a string nor null
+	raw: Record<string, unknown> // the completed job record as returned by the backend
+}
+
+const tokenCache = new Map<string, Promise<string>>() // login promises keyed by `${baseUrl}|${email}`, shared by every client in this module
+const sharedWorkspaceQueue = new Map<string, Promise<void>>() // per shared workspace id: tail promise of the queue of runs
+const managedSharedWorkspacePrefixes = ['f/evals/'] // flows/scripts under these prefixes are deleted from a shared workspace before each run
+
+export class BackendPreviewClient { // Client for `${baseUrl}/api`: prepares workspaces, seeds scripts/flows, and runs preview jobs.
+	constructor(private readonly settings: BackendValidationSettings) {}
+
+	async withWorkspace<T>( // Runs body with a workspace id: the configured override (queued and cleaned first) or a freshly generated per-case id.
+		caseId: string,
+		attempt: number,
+		body: (workspaceId: string) => Promise<T>
+	): Promise<T> {
+		const workspaceId =
+			this.settings.workspaceOverride ?? // an explicit override wins over a generated id
+			buildWorkspaceId(this.settings.workspacePrefix, caseId, attempt)
+
+		const run = async () => {
+			await this.ensureWorkspace(workspaceId)
+			if (this.settings.workspaceOverride) { // shared workspace: remove leftover managed assets first
+				await this.clearManagedSharedWorkspaceAssets(workspaceId)
+			}
+
+			try {
+				return await body(workspaceId)
+			} finally {
+				if (!this.settings.keepWorkspaces && !this.settings.workspaceOverride) { // only generated workspaces are deleted, and only when keepWorkspaces is off
+					await this.deleteWorkspace(workspaceId).catch(() => undefined) // best-effort: a failed delete does not fail the run
+				}
+			}
+		}
+
+		if (this.settings.workspaceOverride) { // runs against the same override workspace execute one at a time
+			return await withSharedWorkspaceLock(workspaceId, run)
+		}
+
+		return await run()
+	}
+
+	async createScript(input: { // Creates a script and waits for its deployment; on a conflict response it re-posts with the existing script's hash as parent_hash.
+		workspaceId: string
+		path: string
+		summary: string
+		description?: string
+		schema?: Record<string, unknown>
+		content: string
+		language: string
+	}): Promise<void> {
+		await this.ensureFolderForPath(input.workspaceId, input.path)
+
+		const payload = {
+			path: input.path,
+			summary: input.summary,
+			description: input.description ?? '',
+			content: input.content,
+			schema: input.schema ?? { type: 'object', properties: {}, required: [] }, // default: empty object schema
+			is_template: false,
+			language: input.language,
+			kind: 'script'
+		}
+
+		const response = await this.request(`/w/${encodeURIComponent(input.workspaceId)}/scripts/create`, {
+			method: 'POST',
+			headers: { 'Content-Type': 'application/json' },
+			body: JSON.stringify(payload)
+		})
+
+		if (response.ok) {
+			await this.waitForScriptDeployment(input.workspaceId, input.path, (await response.text()).trim()) // the trimmed response body is used as the script hash
+			return
+		}
+
+		const message = await response.text()
+		if (!isConflictMessage(message)) { // anything other than a conflict is fatal
+			throw new Error(`create script ${input.path} failed: ${response.status} ${response.statusText} - ${message}`)
+		}
+
+		const currentScript = await this.getScriptByPath(input.workspaceId, input.path) // conflict: look up the script already at this path
+		const currentHash = readStringField(currentScript, 'hash', `script ${input.path}`)
+		const updateResponse = await this.request(
+			`/w/${encodeURIComponent(input.workspaceId)}/scripts/create`, // same endpoint as the first attempt; parent_hash is the only payload difference
+			{
+				method: 'POST',
+				headers: { 'Content-Type': 'application/json' },
+				body: JSON.stringify({
+					...payload,
+					parent_hash: currentHash
+				})
+			}
+		)
+		await expectOk(updateResponse, `update script ${input.path}`)
+		await this.waitForScriptDeployment(input.workspaceId, input.path, (await updateResponse.text()).trim())
+	}
+
+	async createFlow(input: { // Creates a flow; on a conflict response it updates the existing flow at the same path instead.
+		workspaceId: string
+		path: string
+		summary: string
+		description?: string
+		schema?: Record<string, unknown>
+		value: Record<string, unknown>
+	}): Promise<void> {
+		await this.ensureFolderForPath(input.workspaceId, input.path)
+
+		const payload = {
+			path: input.path,
+			summary: input.summary,
+			description: input.description ?? '',
+			schema: input.schema ?? { type: 'object', properties: {}, required: [] }, // default: empty object schema
+			value: input.value
+		}
+
+		const response = await this.request(`/w/${encodeURIComponent(input.workspaceId)}/flows/create`, {
+			method: 'POST',
+			headers: { 'Content-Type': 'application/json' },
+			body: JSON.stringify(payload)
+		})
+
+		if (response.ok) {
+			return // unlike createScript, nothing is awaited after a successful flow create
+		}
+
+		const message = await response.text()
+		if (!isConflictMessage(message)) { // anything other than a conflict is fatal
+			throw new Error(`create flow ${input.path} failed: ${response.status} ${response.statusText} - ${message}`)
+		}
+
+		const updateResponse = await this.request(
+			`/w/${encodeURIComponent(input.workspaceId)}/flows/update/${input.path}`, // NOTE(review): input.path is interpolated without URL-encoding — confirm paths never need escaping
+			{
+				method: 'POST',
+				headers: { 'Content-Type': 'application/json' },
+				body: JSON.stringify(payload)
+			}
+		)
+		await expectOk(updateResponse, `update flow ${input.path}`)
+	}
+
+	async runScriptPreview(input: { // Starts a script preview job and polls until it completes or the wait budget runs out.
+		workspaceId: string
+		content: string
+		args: Record<string, unknown>
+		language: string
+		path?: string
+		timeoutSeconds?: number
+	}): Promise<CompletedPreviewJob> {
+		const response = await this.request(
+			withQuery(`/w/${encodeURIComponent(input.workspaceId)}/jobs/run/preview`, {
+				timeout: input.timeoutSeconds // left out of the query string when undefined
+			}),
+			{
+				method: 'POST',
+				headers: { 'Content-Type': 'application/json' },
+				body: JSON.stringify({
+					content: input.content,
+					args: input.args,
+					language: input.language,
+					path: input.path
+				})
+			}
+		)
+
+		await expectOk(response, 'start script preview')
+		const jobId = (await response.text()).trim() // the trimmed response body is used as the job id
+		return await this.waitForCompletedJob(input.workspaceId, jobId)
+	}
+
+	async runFlowPreview(input: { // Starts a flow preview job and polls until it completes or the wait budget runs out.
+		workspaceId: string
+		value: Record<string, unknown>
+		args: Record<string, unknown>
+		timeoutSeconds?: number
+		path?: string
+	}): Promise<CompletedPreviewJob> {
+		const response = await this.request(
+			withQuery(`/w/${encodeURIComponent(input.workspaceId)}/jobs/run/preview_flow`, {
+				timeout: input.timeoutSeconds // left out of the query string when undefined
+			}),
+			{
+				method: 'POST',
+				headers: { 'Content-Type': 'application/json' },
+				body: JSON.stringify({
+					value: input.value,
+					args: input.args,
+					path: input.path
+				})
+			}
+		)
+
+		await expectOk(response, 'start flow preview')
+		const jobId = (await response.text()).trim() // the trimmed response body is used as the job id
+		return await this.waitForCompletedJob(input.workspaceId, jobId)
+	}
+
+	private async ensureWorkspace(workspaceId: string): Promise<void> { // Creates the workspace unless the backend reports that it already exists.
+		const existsResponse = await this.request('/workspaces/exists', {
+			method: 'POST',
+			headers: { 'Content-Type': 'application/json' },
+			body: JSON.stringify({ id: workspaceId })
+		})
+		await expectOk(existsResponse, `check workspace ${workspaceId}`)
+
+		if ((await existsResponse.text()).trim() === 'true') { // only the literal text "true" counts as existing
+			return
+		}
+
+		const createResponse = await this.request('/workspaces/create', {
+			method: 'POST',
+			headers: { 'Content-Type': 'application/json' },
+			body: JSON.stringify({ id: workspaceId, name: workspaceId }) // the id doubles as the display name
+		})
+		try {
+			await expectOk(createResponse, `create workspace ${workspaceId}`)
+		} catch (error) {
+			const message = error instanceof Error ? error.message : String(error)
+			if (message.includes('maximum number of workspaces')) { // for this failure, append a hint about reusing a workspace
+				throw new Error(
+					`${message}. Reuse an existing workspace with WMILL_AI_EVAL_BACKEND_WORKSPACE=<workspace-id>.`
+				)
+			}
+			throw error
+		}
+	}
+
+	private async deleteWorkspace(workspaceId: string): Promise<void> { // Deletes the workspace; throws when the backend answers with a non-OK status.
+		const response = await this.request(`/workspaces/delete/${encodeURIComponent(workspaceId)}`, {
+			method: 'DELETE'
+		})
+		await expectOk(response, `delete workspace ${workspaceId}`)
+	}
+
+	private async ensureFolderForPath(workspaceId: string, path: string): Promise<void> { // Creates the folder an f/… path lives in; an "already exists" failure is tolerated.
+		const folderName = extractFolderName(path)
+		if (!folderName) { // the path has no folder part, so there is nothing to create
+			return
+		}
+
+		const response = await this.request(`/w/${encodeURIComponent(workspaceId)}/folders/create`, {
+			method: 'POST',
+			headers: { 'Content-Type': 'application/json' },
+			body: JSON.stringify({ name: folderName })
+		})
+
+		if (response.ok) {
+			return
+		}
+
+		const message = await response.text()
+		if (!message.toLowerCase().includes('already exists')) { // any other failure is fatal
+			throw new Error(`Failed to create folder ${folderName}: ${message}`)
+		}
+	}
+
+	private async waitForCompletedJob( // Polls get_result_maybe until the job completes or maxWaitMs elapses, then fetches the completed job record.
+		workspaceId: string,
+		jobId: string
+	): Promise<CompletedPreviewJob> {
+		const deadline = Date.now() + this.settings.maxWaitMs
+
+		while (Date.now() < deadline) {
+			const maybeResponse = await this.request(
+				`/w/${encodeURIComponent(workspaceId)}/jobs_u/completed/get_result_maybe/${encodeURIComponent(jobId)}?get_started=false`
+			)
+			await expectOk(maybeResponse, `poll job ${jobId}`)
+			const maybeResult = (await maybeResponse.json()) as CompletedJobResultMaybe // unvalidated cast of the response JSON
+
+			if (maybeResult.completed) {
+				const completedResponse = await this.request( // second request: the full completed-job record (used for logs and raw)
+					`/w/${encodeURIComponent(workspaceId)}/jobs_u/completed/get/${encodeURIComponent(jobId)}`
+				)
+				await expectOk(completedResponse, `get completed job ${jobId}`)
+				const completedJob = (await completedResponse.json()) as Record<string, unknown>
+				return {
+					id: jobId,
+					success: Boolean(maybeResult.success), // a missing success flag counts as failure
+					result: maybeResult.result,
+					logs: // kept only when it is a string or null; any other type becomes null
+						typeof completedJob.logs === 'string' || completedJob.logs === null
+							? (completedJob.logs as string | null)
+							: null,
+					raw: completedJob
+				}
+			}
+
+			await new Promise((resolve) => setTimeout(resolve, this.settings.pollIntervalMs)) // wait before polling again
+		}
+
+		throw new Error(`Timed out waiting for preview job ${jobId} to complete`)
+	}
+
+	private async getScriptByPath(workspaceId: string, path: string): Promise<Record<string, unknown>> { // Fetches the script at path as an untyped JSON record.
+		const response = await this.request(`/w/${encodeURIComponent(workspaceId)}/scripts/get/p/${path}`) // NOTE(review): path is not URL-encoded — confirm paths never need escaping
+		await expectOk(response, `get script ${path}`)
+		return (await response.json()) as Record<string, unknown>
+	}
+
+	private async clearManagedSharedWorkspaceAssets(workspaceId: string): Promise<void> { // Deletes every flow, then every script, whose path starts with a managed prefix.
+		const flowPaths = await this.listFlowPaths(workspaceId)
+		for (const path of flowPaths.filter(isManagedSharedWorkspacePath)) {
+			await this.deleteFlowByPath(workspaceId, path) // sequential: one delete request at a time
+		}
+
+		const scriptPaths = await this.listScriptPaths(workspaceId)
+		for (const path of scriptPaths.filter(isManagedSharedWorkspacePath)) {
+			await this.deleteScriptByPath(workspaceId, path)
+		}
+	}
+
+	private async listFlowPaths(workspaceId: string): Promise<string[]> { // Lists flow paths in the workspace; the response JSON is returned without validation.
+		const response = await this.request(`/w/${encodeURIComponent(workspaceId)}/flows/list_paths`)
+		await expectOk(response, `list flows in workspace ${workspaceId}`)
+		return await response.json()
+	}
+
+	private async listScriptPaths(workspaceId: string): Promise<string[]> { // Lists script paths in the workspace; the response JSON is returned without validation.
+		const response = await this.request(`/w/${encodeURIComponent(workspaceId)}/scripts/list_paths`)
+		await expectOk(response, `list scripts in workspace ${workspaceId}`)
+		return await response.json()
+	}
+
+	private async deleteFlowByPath(workspaceId: string, path: string): Promise<void> { // Deletes the flow at path with a DELETE request.
+		const response = await this.request(`/w/${encodeURIComponent(workspaceId)}/flows/delete/${path}`, {
+			method: 'DELETE'
+		})
+		await expectOk(response, `delete flow ${path}`)
+	}
+
+	private async deleteScriptByPath(workspaceId: string, path: string): Promise<void> { // Deletes the script at path; note this endpoint is called with POST, not DELETE.
+		const response = await this.request(`/w/${encodeURIComponent(workspaceId)}/scripts/delete/p/${path}`, {
+			method: 'POST'
+		})
+		await expectOk(response, `delete script ${path}`)
+	}
+
+	private async waitForScriptDeployment( // Polls deployment_status for the hash until a lock appears, error logs are reported, or maxWaitMs elapses.
+		workspaceId: string,
+		path: string,
+		hash: string
+	): Promise<void> {
+		const deadline = Date.now() + this.settings.maxWaitMs
+
+		while (Date.now() < deadline) {
+			const response = await this.request(
+				`/w/${encodeURIComponent(workspaceId)}/scripts/deployment_status/h/${encodeURIComponent(hash)}`
+			)
+			await expectOk(response, `check deployment status for script ${path}`)
+			const deployment = (await response.json()) as ScriptDeploymentStatus
+			if (deployment.lock != null) { // a non-null lock is treated as "deployed"; checked before the error logs
+				return
+			}
+			if (deployment.lock_error_logs) { // error logs without a lock: the deployment failed
+				throw new Error(`Script deployment failed for ${path}: ${deployment.lock_error_logs}`)
+			}
+			await new Promise((resolve) => setTimeout(resolve, this.settings.pollIntervalMs)) // wait before polling again
+		}
+
+		throw new Error(`Timed out waiting for script ${path} (${hash}) to deploy`)
+	}
+
+	private async request(path: string, init?: RequestInit): Promise<Response> { // Sends a request to `${baseUrl}/api${path}` with a bearer token.
+		const token = await this.getToken()
+		return await fetch(`${this.settings.baseUrl}/api${path}`, {
+			...init,
+			headers: {
+				Authorization: `Bearer ${token}`,
+				...(init?.headers ?? {}) // caller headers are spread last, so they can override Authorization
+			}
+		})
+	}
+
+	private async getToken(): Promise<string> { // Returns the login token, reusing one login promise per baseUrl+email from the module-level cache.
+		const cacheKey = `${this.settings.baseUrl}|${this.settings.email}`
+		let tokenPromise = tokenCache.get(cacheKey)
+		if (!tokenPromise) {
+			tokenPromise = this.login().catch((error) => { // on failure, evict the cached promise so a later call logs in again
+				if (tokenCache.get(cacheKey) === tokenPromise) { // evict only if the entry is still ours
+					tokenCache.delete(cacheKey)
+				}
+				throw error
+			})
+			tokenCache.set(cacheKey, tokenPromise)
+		}
+		return await tokenPromise
+	}
+
+	private async login(): Promise<string> { // Logs in with email/password; the trimmed response body is returned as the token.
+		const response = await fetch(`${this.settings.baseUrl}/api/auth/login`, { // plain fetch: no token exists yet
+			method: 'POST',
+			headers: { 'Content-Type': 'application/json' },
+			body: JSON.stringify({
+				email: this.settings.email,
+				password: this.settings.password
+			})
+		})
+		await expectOk(response, 'login for backend validation')
+		return (await response.text()).trim()
+	}
+}
+
+async function withSharedWorkspaceLock<T>(workspaceId: string, body: () => Promise<T>): Promise<T> { // Runs body only after the previously queued run for the same workspaceId has finished.
+	const previous = sharedWorkspaceQueue.get(workspaceId) ?? Promise.resolve() // nothing queued: start immediately
+	let releaseCurrent: (() => void) | undefined
+	const current = new Promise<void>((resolve) => { // resolved in the finally block below, once body has settled
+		releaseCurrent = resolve
+	})
+	const tail = previous.catch(() => undefined).then(() => current) // settles only after both the previous run and this one are done
+	sharedWorkspaceQueue.set(workspaceId, tail) // later callers wait on this tail
+
+	await previous.catch(() => undefined) // a rejected predecessor does not block or fail this run
+
+	try {
+		return await body()
+	} finally {
+		releaseCurrent?.() // let the next queued run proceed, whether body succeeded or threw
+		if (sharedWorkspaceQueue.get(workspaceId) === tail) { // no one queued behind us: drop the map entry
+			sharedWorkspaceQueue.delete(workspaceId)
+		}
+	}
+}
+
+function buildWorkspaceId(prefix: string, caseId: string, attempt: number): string { // Builds `${prefix}-${slug}-a${attempt}-${suffix}` with a random suffix, so repeated calls yield different ids.
+	const caseSlug = caseId
+		.toLowerCase()
+		.replace(/[^a-z0-9-]+/g, '-') // each run of other characters becomes a single hyphen
+		.replace(/^-+|-+$/g, '') // trim leading/trailing hyphens
+		.slice(0, 30) // cap the slug at 30 characters (applied after trimming, so it may end in a hyphen)
+	const suffix = randomUUID().slice(0, 8) // first 8 characters of a random UUID
+	return `${prefix}-${caseSlug || 'case'}-a${attempt}-${suffix}` // 'case' stands in for an empty slug
+}
+
+function extractFolderName(path: string): string | null { // Name of the folder owning an `f/<folder>/<item…>` path, or null when the path has no folder part.
+	if (!path.startsWith('f/')) {
+		return null // only `f/`-prefixed paths live in a folder
+	}
+	const [folder] = path.split('/').slice(1, -1) // drop the `f` prefix and the final item segment; the first remaining segment is the folder
+	return folder || null // items nested deeper (f/a/b/item) still belong to folder `a`, not `a/b`; an empty segment counts as no folder
+}
+
+function withQuery( // Appends the defined params to path as a query string; returns path unchanged when none are defined.
+	path: string,
+	params: Record<string, string | number | undefined>
+): string {
+	const query = new URLSearchParams()
+	for (const [key, value] of Object.entries(params)) {
+		if (value === undefined) { // undefined params are omitted entirely
+			continue
+		}
+		query.set(key, String(value))
+	}
+	const suffix = query.toString()
+	return suffix ? `${path}?${suffix}` : path
+}
+
+async function expectOk(response: Response, context: string): Promise<void> { // Throws an Error carrying context, status, statusText and the response body unless response.ok.
+	if (response.ok) {
+		return
+	}
+	throw new Error(`${context} failed: ${response.status} ${response.statusText} - ${await response.text()}`) // reads the body, so callers cannot read it again afterwards
+}
+
+function readStringField( // Returns value[field] when it is a non-empty string; throws an Error naming context and field otherwise.
+	value: Record<string, unknown>,
+	field: string,
+	context: string
+): string {
+	const candidate = value[field]
+	if (typeof candidate === 'string' && candidate.length > 0) { // an empty string is rejected like a missing field
+		return candidate
+	}
+	throw new Error(`${context} is missing string field ${field}`)
+}
+
+function isConflictMessage(message: string): boolean { // True when the message contains "already exists" or "path conflict", ignoring case.
+	const normalized = message.toLowerCase()
+	return normalized.includes('already exists') || normalized.includes('path conflict')
+}
+
+function isManagedSharedWorkspacePath(path: string): boolean { // True when path starts with one of managedSharedWorkspacePrefixes.
+	return managedSharedWorkspacePrefixes.some((prefix) => path.startsWith(prefix))
+}
--- a/ai_evals/adapters/frontend/benchmarkRunner.ts
+++ b/ai_evals/adapters/frontend/benchmarkRunner.ts
@@ -0,0 +1,93 @@
+import { loadSelectedCases } from "../../core/cases";
+import { resolveBackendValidationSettings } from "../../core/backendValidation";
+import {
+  formatRunModelLabel,
+  getFrontendEvalModel,
+  resolveEvalModel,
+} from "../../core/models";
+import { buildRunResult } from "../../core/results";
+import { runSuite } from "../../core/runSuite";
+import type { BenchmarkRunResult, ModeRunner } from "../../core/types";
+import { emitFrontendBenchmarkProgress } from "./progress";
+import { createAppModeRunner } from "../../modes/app";
+import { createFlowModeRunner } from "../../modes/flow";
+import { createScriptModeRunner } from "../../modes/script";
+import { DEFAULT_JUDGE_MODEL } from "../../core/judge";
+
+export type FrontendBenchmarkMode = "flow" | "app" | "script"; // the benchmark modes this runner accepts
+
+export async function runFrontendBenchmarkFromEnv(): Promise<BenchmarkRunResult> { // Reads WMILL_FRONTEND_AI_EVAL_* env vars, runs the selected cases, and returns the assembled run result.
+  const mode = parseMode(process.env.WMILL_FRONTEND_AI_EVAL_MODE); // throws on an unset or unsupported mode
+  const caseIds = parseOptionalJsonStringArray(process.env.WMILL_FRONTEND_AI_EVAL_CASE_IDS); // [] when unset
+  const runs = parsePositiveInteger(process.env.WMILL_FRONTEND_AI_EVAL_RUNS, "WMILL_FRONTEND_AI_EVAL_RUNS"); // required: throws when unset
+  const emitProgress = process.env.WMILL_FRONTEND_AI_EVAL_PROGRESS === "1"; // flags are on only for the exact value "1"
+  const verbose = process.env.WMILL_FRONTEND_AI_EVAL_VERBOSE === "1";
+  const model = resolveEvalModel(mode, process.env.WMILL_FRONTEND_AI_EVAL_MODEL);
+  const backendValidation = resolveBackendValidationSettings({
+    evalMode: mode,
+    requestedMode: process.env.WMILL_FRONTEND_AI_EVAL_BACKEND_VALIDATION,
+  });
+
+  const selectedCases = await loadSelectedCases(mode, caseIds);
+  const modeRunner = getModeRunner(mode, getFrontendEvalModel(model), backendValidation);
+  const runModel = formatRunModelLabel(mode, model);
+  const caseResults = await runSuite({
+    modeRunner,
+    cases: selectedCases,
+    runs,
+    runModel,
+    judgeModel: DEFAULT_JUDGE_MODEL,
+    concurrency: verbose ? 1 : undefined, // verbose runs are forced to a concurrency of 1
+    verbose,
+    onProgress: emitProgress ? (event) => emitFrontendBenchmarkProgress(event) : undefined, // progress events are forwarded only when enabled
+  });
+
+  return buildRunResult({
+    mode,
+    runs,
+    runModel,
+    judgeModel: DEFAULT_JUDGE_MODEL,
+    caseResults,
+  });
+}
+
+function getModeRunner( // Picks the mode-specific runner factory for the given benchmark mode.
+  mode: FrontendBenchmarkMode,
+  model: ReturnType<typeof getFrontendEvalModel>,
+  backendValidation: ReturnType<typeof resolveBackendValidationSettings>
+): ModeRunner<any, any, any> {
+  switch (mode) {
+    case "flow":
+      return createFlowModeRunner(model, backendValidation);
+    case "app":
+      return createAppModeRunner(model); // the app runner is created without backend validation settings
+    case "script":
+      return createScriptModeRunner(model, backendValidation);
+  }
+}
+
+function parseMode(value: string | undefined): FrontendBenchmarkMode { // Returns value when it is a supported mode; throws otherwise (including when it is undefined).
+  if (value === "flow" || value === "app" || value === "script") {
+    return value;
+  }
+  throw new Error(`Unsupported frontend benchmark mode: ${String(value)}`);
+}
+
+function parseOptionalJsonStringArray(value: string | undefined): string[] { // Parses a JSON array of strings; an unset or empty value yields [].
+  if (!value) {
+    return [];
+  }
+  const parsed = JSON.parse(value) as unknown; // malformed JSON throws here, before the shape check below
+  if (!Array.isArray(parsed) || parsed.some((entry) => typeof entry !== "string")) {
+    throw new Error("WMILL_FRONTEND_AI_EVAL_CASE_IDS must be a JSON string array");
+  }
+  return parsed;
+}
+
+function parsePositiveInteger(value: string | undefined, envName: string): number { // Converts value with Number() and requires an integer > 0; envName is used only in the error message.
+  const parsed = Number(value); // undefined becomes NaN and "" becomes 0, so both are rejected below
+  if (!Number.isInteger(parsed) || parsed <= 0) {
+    throw new Error(`${envName} must be a positive integer`);
+  }
+  return parsed;
+}
--- a/ai_evals/adapters/frontend/core/app/appEvalRunner.ts
+++ b/ai_evals/adapters/frontend/core/app/appEvalRunner.ts
@@ -0,0 +1,92 @@
+import { mkdtemp } from 'fs/promises'
+import { tmpdir } from 'os'
+import { join } from 'path'
+import type {
+	AppFiles,
+	BackendRunnable,
+	AppAIChatHelpers
+} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
+import {
+	getAppTools,
+	prepareAppSystemMessage,
+	prepareAppUserMessage
+} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
+import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import { createAppFileHelpers } from './fileHelpers'
+import { runEval } from '../shared'
+import type { AIProvider } from '$lib/gen/types.gen'
+import type { ModeRunContext } from '../../../../core/types'
+import type { TokenUsage } from '../shared/types'
+
+export interface AppEvalResult { // What runAppEval returns for one app eval run.
+	success: boolean
+	files: AppFiles // output collected through the file helpers' getFiles
+	error?: string
+	assistantMessageCount: number // filled from the raw result's `iterations`
+	toolCallCount: number // filled from the raw result's `toolCallsCount`
+	toolsUsed: string[] // filled from the raw result's `toolsCalled`
+	tokenUsage: TokenUsage
+}
+
+export interface AppEvalOptions { // Optional inputs to runAppEval.
+	initialFrontend?: Record<string, string> // defaults to {} when omitted
+	initialBackend?: Record<string, BackendRunnable> // defaults to {} when omitted
+	model?: string // defaults to 'claude-haiku-4-5-20251001' in runAppEval
+	maxIterations?: number
+	provider?: AIProvider
+	workspaceRoot?: string // when omitted, runAppEval creates a temp directory
+	runContext?: ModeRunContext // source of the assistant-message callbacks passed to runEval
+}
+
+export async function runAppEval( // Runs one app-mode eval for userPrompt and returns the resulting files plus run statistics.
+	userPrompt: string,
+	apiKey: string,
+	options?: AppEvalOptions
+): Promise<AppEvalResult> {
+	const workspaceRoot =
+		options?.workspaceRoot ??
+		(await mkdtemp(join(tmpdir(), 'wmill-frontend-app-benchmark-'))) // no root supplied: create a fresh temp directory
+	const { helpers, getFiles, cleanup } = await createAppFileHelpers(
+		options?.initialFrontend ?? {},
+		options?.initialBackend ?? {},
+		workspaceRoot
+	)
+
+	try {
+		const systemMessage = prepareAppSystemMessage()
+		const tools = getAppTools() as ProductionTool<AppAIChatHelpers>[]
+		const model = options?.model ?? 'claude-haiku-4-5-20251001'
+		const userMessage = prepareAppUserMessage(userPrompt, helpers.getSelectedContext())
+
+		const rawResult = await runEval({
+			userPrompt,
+			systemMessage,
+			userMessage,
+			tools,
+			helpers,
+			apiKey,
+			getOutput: getFiles,
+			onAssistantMessageStart: options?.runContext?.onAssistantMessageStart,
+			onAssistantToken: options?.runContext?.onAssistantChunk, // the run context's chunk callback is passed under runEval's token-callback name
+			onAssistantMessageEnd: options?.runContext?.onAssistantMessageEnd,
+			options: {
+				maxIterations: options?.maxIterations,
+				model,
+				workspace: workspaceRoot,
+				provider: options?.provider
+			}
+		})
+
+		return { // rename the raw runEval fields to the AppEvalResult shape
+			files: rawResult.output,
+			success: rawResult.success,
+			error: rawResult.error,
+			assistantMessageCount: rawResult.iterations,
+			toolCallCount: rawResult.toolCallsCount,
+			toolsUsed: rawResult.toolsCalled,
+			tokenUsage: rawResult.tokenUsage
+		}
+	} finally {
+		await cleanup() // always runs, even when runEval throws
+	}
+}
--- a/frontend/src/lib/components/copilot/chat/tests/app/appFixtureLoader.ts
+++ b/frontend/src/lib/components/copilot/chat/tests/app/appFixtureLoader.ts
@@ -1,4 +1,8 @@
-import type { AppFiles, BackendRunnable, InlineScript } from '../../app/core'
+import type {
+	AppFiles,
+	BackendRunnable,
+	InlineScript
+} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'

 /**
 * Backend runnable metadata stored in meta.json files.
--- a/ai_evals/adapters/frontend/core/app/fileHelpers.ts
+++ b/ai_evals/adapters/frontend/core/app/fileHelpers.ts
@@ -0,0 +1,255 @@
+import { mkdir, rm, writeFile } from 'fs/promises'
+import { dirname, join } from 'path'
+import type {
+	AppAIChatHelpers,
+	AppFiles,
+	BackendRunnable,
+	DataTableSchema,
+	LintResult,
+	SelectedContext
+} from '../../../../../frontend/src/lib/components/copilot/chat/app/core'
+
+/**
+ * Produces a lint result that reports no errors and no warnings.
+ * A new object (with new nested maps) is built on every call.
+ */
+function createEmptyLintResult(): LintResult {
+	const emptyResult: LintResult = {
+		errorCount: 0,
+		warningCount: 0,
+		errors: { frontend: {}, backend: {} },
+		warnings: { frontend: {}, backend: {} }
+	}
+	return emptyResult
+}
+
+/**
+ * Mirrors one frontend file to `<workspaceRoot>/frontend/<path>`.
+ *
+ * @param workspaceRoot Root of the on-disk workspace; when falsy nothing is written.
+ * @param path File path inside the app; a single leading `/` is dropped before joining.
+ * @param content Text written to the file as UTF-8.
+ */
+async function writeFrontendFile(
+	workspaceRoot: string | undefined,
+	path: string,
+	content: string
+): Promise<void> {
+	if (workspaceRoot) {
+		const target = join(workspaceRoot, 'frontend', path.replace(/^\//, ''))
+		// Parent directories are created on demand so nested paths can be written directly.
+		await mkdir(dirname(target), { recursive: true })
+		await writeFile(target, content, 'utf8')
+	}
+}
+
+/**
+ * Deletes `<workspaceRoot>/frontend/<path>` from disk.
+ * Does nothing when `workspaceRoot` is falsy; a single leading `/` in `path` is dropped.
+ */
+async function removeFrontendFile(workspaceRoot: string | undefined, path: string): Promise<void> {
+	if (workspaceRoot) {
+		const target = join(workspaceRoot, 'frontend', path.replace(/^\//, ''))
+		// `force` is passed so that a missing file is not treated as an error.
+		await rm(target, { force: true })
+	}
+}
+
+/**
+ * Mirrors one backend runnable to `<workspaceRoot>/backend/<key>/`.
+ *
+ * An inline runnable with a script is written as `main.py` (language `python3`) or
+ * `main.ts` (any other language), and its language is recorded in `meta.json`.
+ * Every other runnable only gets a `meta.json` holding its type and, when set, its path.
+ *
+ * @param workspaceRoot Root of the on-disk workspace; when falsy nothing is written.
+ * @param key Runnable key, used as the directory name.
+ * @param runnable Runnable to persist.
+ */
+async function writeBackendRunnable(
+	workspaceRoot: string | undefined,
+	key: string,
+	runnable: BackendRunnable
+): Promise<void> {
+	if (!workspaceRoot) return
+
+	const targetDir = join(workspaceRoot, 'backend', key)
+	await mkdir(targetDir, { recursive: true })
+
+	const metadata: { name: string; language?: string; type?: string; path?: string } = {
+		name: runnable.name
+	}
+	const inlineScript = runnable.type === 'inline' ? runnable.inlineScript : undefined
+
+	if (inlineScript) {
+		metadata.language = inlineScript.language
+		const sourceFile = inlineScript.language === 'python3' ? 'main.py' : 'main.ts'
+		await writeFile(join(targetDir, sourceFile), inlineScript.content, 'utf8')
+	} else {
+		metadata.type = runnable.type
+		if (runnable.path) {
+			metadata.path = runnable.path
+		}
+	}
+
+	await writeFile(join(targetDir, 'meta.json'), `${JSON.stringify(metadata, null, 2)}\n`, 'utf8')
+}
+
+/**
+ * Deletes the whole `<workspaceRoot>/backend/<key>/` directory.
+ * Does nothing when `workspaceRoot` is falsy.
+ */
+async function removeBackendRunnable(workspaceRoot: string | undefined, key: string): Promise<void> {
+	if (workspaceRoot) {
+		const runnableDir = join(workspaceRoot, 'backend', key)
+		await rm(runnableDir, { recursive: true, force: true })
+	}
+}
+
+/**
+ * Writes the datatable list to `<workspaceRoot>/datatables.json` as indented JSON
+ * followed by a newline. Does nothing when `workspaceRoot` is falsy.
+ */
+async function persistDatatables(
+	workspaceRoot: string | undefined,
+	datatables: DataTableSchema[]
+): Promise<void> {
+	if (workspaceRoot) {
+		const serialized = `${JSON.stringify(datatables, null, 2)}\n`
+		await writeFile(join(workspaceRoot, 'datatables.json'), serialized, 'utf8')
+	}
+}
+
+/**
+ * Builds the in-memory `AppAIChatHelpers` implementation used by app evals.
+ *
+ * Frontend files, backend runnables and datatable metadata are kept in memory. When
+ * `workspaceRoot` is given, the state is also mirrored to disk under `frontend/`,
+ * `backend/<key>/` and `datatables.json`.
+ *
+ * @param initialFrontend Frontend files (path -> content) to start from; shallow-copied.
+ * @param initialBackend Backend runnables (key -> runnable) to start from; shallow-copied.
+ * @param workspaceRoot Optional directory for the on-disk mirror. `cleanup` removes it recursively.
+ * @returns The helpers, accessors returning shallow copies of the current state, a
+ * `cleanup` function, and `workspaceDir` (`null` when no workspace root was given).
+ */
+export async function createAppFileHelpers(
+	initialFrontend: Record<string, string> = {},
+	initialBackend: Record<string, BackendRunnable> = {},
+	workspaceRoot?: string
+): Promise<{
+	helpers: AppAIChatHelpers
+	getFiles: () => AppFiles
+	getFrontend: () => Record<string, string>
+	getBackend: () => Record<string, BackendRunnable>
+	cleanup: () => Promise<void>
+	workspaceDir: string | null
+}> {
+	let frontend = { ...initialFrontend }
+	let backend = { ...initialBackend }
+	let snapshotId = 0
+	const snapshots = new Map<
+		number,
+		{ frontend: Record<string, string>; backend: Record<string, BackendRunnable> }
+	>()
+	const datatables: DataTableSchema[] = []
+
+	/** Shallow copy of the current frontend/backend state. */
+	const currentFiles = (): AppFiles => ({
+		frontend: { ...frontend },
+		backend: { ...backend }
+	})
+
+	/**
+	 * Records `schemaName.tableName` under the entry for `datatableName`, creating the
+	 * entry or schema when missing. An existing entry is reused so a datatable name
+	 * never appears twice in `datatables`.
+	 */
+	function registerTable(datatableName: string, schemaName: string, tableName: string): void {
+		const existing = datatables.find((entry) => entry.datatable_name === datatableName)
+		if (existing) {
+			existing.schemas[schemaName] ??= {}
+			existing.schemas[schemaName][tableName] ??= {}
+			return
+		}
+		datatables.push({
+			datatable_name: datatableName,
+			schemas: {
+				[schemaName]: {
+					[tableName]: {}
+				}
+			}
+		})
+	}
+
+	/** Rewrites the on-disk mirror from scratch so that it matches the in-memory state. */
+	async function syncWorkspace(): Promise<void> {
+		if (!workspaceRoot) {
+			return
+		}
+		await rm(join(workspaceRoot, 'frontend'), { recursive: true, force: true })
+		await rm(join(workspaceRoot, 'backend'), { recursive: true, force: true })
+		for (const [path, content] of Object.entries(frontend)) {
+			await writeFrontendFile(workspaceRoot, path, content)
+		}
+		for (const [key, runnable] of Object.entries(backend)) {
+			await writeBackendRunnable(workspaceRoot, key, runnable)
+		}
+		await persistDatatables(workspaceRoot, datatables)
+	}
+
+	// Seed the on-disk mirror with the initial state (no-ops without a workspace root).
+	for (const [path, content] of Object.entries(frontend)) {
+		await writeFrontendFile(workspaceRoot, path, content)
+	}
+	for (const [key, runnable] of Object.entries(backend)) {
+		await writeBackendRunnable(workspaceRoot, key, runnable)
+	}
+	await persistDatatables(workspaceRoot, datatables)
+
+	const helpers: AppAIChatHelpers = {
+		listFrontendFiles: () => Object.keys(frontend),
+		getFrontendFile: (path: string) => frontend[path],
+		getFrontendFiles: () => ({ ...frontend }),
+		setFrontendFile: (path: string, content: string) => {
+			frontend[path] = content
+			// The disk write is not awaited: this helper returns its lint result synchronously.
+			void writeFrontendFile(workspaceRoot, path, content)
+			return createEmptyLintResult()
+		},
+		deleteFrontendFile: (path: string) => {
+			delete frontend[path]
+			void removeFrontendFile(workspaceRoot, path)
+		},
+		listBackendRunnables: () =>
+			Object.entries(backend).map(([key, runnable]) => ({
+				key,
+				name: runnable.name
+			})),
+		getBackendRunnable: (key: string) => backend[key],
+		getBackendRunnables: () => ({ ...backend }),
+		setBackendRunnable: async (key: string, runnable: BackendRunnable) => {
+			backend[key] = runnable
+			await writeBackendRunnable(workspaceRoot, key, runnable)
+			return createEmptyLintResult()
+		},
+		deleteBackendRunnable: (key: string) => {
+			delete backend[key]
+			void removeBackendRunnable(workspaceRoot, key)
+		},
+		getFiles: currentFiles,
+		getSelectedContext: (): SelectedContext => ({ type: 'none' }),
+		snapshot: () => {
+			const id = ++snapshotId
+			snapshots.set(id, currentFiles())
+			return id
+		},
+		revertToSnapshot: (id: number) => {
+			const snapshot = snapshots.get(id)
+			if (!snapshot) {
+				// Unknown snapshot ids are ignored.
+				return
+			}
+			frontend = { ...snapshot.frontend }
+			backend = { ...snapshot.backend }
+			void syncWorkspace()
+		},
+		// Linting is stubbed out: every lint call reports a clean result.
+		lint: () => createEmptyLintResult(),
+		getDatatables: async () => structuredClone(datatables),
+		getAvailableDatatableNames: () => datatables.map((datatable) => datatable.datatable_name),
+		execDatatableSql: async (
+			datatableName: string,
+			sql: string,
+			newTable?: { schema: string; name: string }
+		) => {
+			if (newTable) {
+				// Merge into an existing entry instead of appending a duplicate for the same datatable.
+				registerTable(datatableName, newTable.schema, newTable.name)
+				await persistDatatables(workspaceRoot, datatables)
+			}
+			// No SQL is executed; the call is echoed back as a successful result.
+			return {
+				success: true,
+				result: [
+					{
+						datatableName,
+						sql
+					}
+				]
+			}
+		},
+		addTableToWhitelist: (datatableName: string, schemaName: string, tableName: string) => {
+			registerTable(datatableName, schemaName, tableName)
+			void persistDatatables(workspaceRoot, datatables)
+		}
+	}
+
+	return {
+		helpers,
+		getFiles: currentFiles,
+		getFrontend: () => ({ ...frontend }),
+		getBackend: () => ({ ...backend }),
+		cleanup: async () => {
+			if (workspaceRoot) {
+				await rm(workspaceRoot, { recursive: true, force: true })
+			}
+		},
+		workspaceDir: workspaceRoot ?? null
+	}
+}
--- a/ai_evals/adapters/frontend/core/flow/fileHelpers.ts
+++ b/ai_evals/adapters/frontend/core/flow/fileHelpers.ts
@@ -0,0 +1,169 @@
+import { mkdir, rm, writeFile } from 'fs/promises'
+import { dirname, join } from 'path'
+import type { FlowModule, InputTransform } from '../../../../../frontend/src/lib/gen'
+import type { ExtendedOpenFlow } from '../../../../../frontend/src/lib/components/flows/types'
+import type { FlowAIChatHelpers } from '../../../../../frontend/src/lib/components/copilot/chat/flow/core'
+import type { ScriptLintResult } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import { getSubModules } from '../../../../../frontend/src/lib/components/flows/flowExplorer'
+import {
+	createInlineScriptSession
+} from '../../../../../frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils'
+import {
+	applyFlowJsonUpdate,
+	getFlowModuleById,
+	updateRawScriptModuleContent
+} from '../../../../../frontend/src/lib/components/copilot/chat/flow/helperUtils'
+import {
+	registerBenchmarkWorkspace,
+	registerBenchmarkWorkspaceRunnables,
+	unregisterBenchmarkWorkspaceRunnables,
+	createBenchmarkCompletedJob,
+	type BenchmarkWorkspaceFlow,
+	type BenchmarkWorkspaceScript
+} from '../../mockBackend'
+
+/**
+ * Lint result with no errors and no warnings.
+ * The flow helpers' `getLintErrors` returns this same object on every call.
+ */
+const EMPTY_SCRIPT_LINT_RESULT: ScriptLintResult = {
+	errorCount: 0,
+	warningCount: 0,
+	errors: [],
+	warnings: []
+}
+
+/**
+ * Workspace scripts and flows made available to a flow eval.
+ * When provided together with a workspace root, they are passed to
+ * `registerBenchmarkWorkspaceRunnables` by `createFlowFileHelpers`.
+ */
+export interface FlowWorkspaceFixtures {
+	/** Scripts to register with the benchmark mock backend. */
+	scripts?: BenchmarkWorkspaceScript[]
+	/** Flows to register with the benchmark mock backend. */
+	flows?: BenchmarkWorkspaceFlow[]
+}
+
+/**
+ * Builds the in-memory `FlowAIChatHelpers` implementation used by flow evals.
+ *
+ * The flow lives in memory; when `workspaceRoot` is given it is also written to
+ * `<workspaceRoot>/flow.json` after each change, and the workspace (plus any fixtures)
+ * is registered with the benchmark mock backend.
+ *
+ * @param initialModules Modules the flow starts with (deep-cloned).
+ * @param initialSchema Flow input schema; defaults to an empty object schema.
+ * @param initialPreprocessorModule Optional preprocessor module (deep-cloned).
+ * @param initialFailureModule Optional failure module (deep-cloned).
+ * @param workspaceRoot Optional directory for the on-disk mirror. `cleanup` removes it recursively.
+ * @param workspaceFixtures Optional scripts/flows registered for the workspace.
+ * @returns The helpers, live accessors for the flow and its modules, a `cleanup`
+ * function, and `workspaceDir` (`null` when no workspace root was given).
+ */
+export async function createFlowFileHelpers(
+	initialModules: FlowModule[] = [],
+	initialSchema?: Record<string, any>,
+	initialPreprocessorModule?: FlowModule,
+	initialFailureModule?: FlowModule,
+	workspaceRoot?: string,
+	workspaceFixtures?: FlowWorkspaceFixtures
+): Promise<{
+	helpers: FlowAIChatHelpers
+	getFlow: () => ExtendedOpenFlow
+	getModules: () => FlowModule[]
+	cleanup: () => Promise<void>
+	workspaceDir: string | null
+}> {
+	const defaultSchema = {
+		$schema: 'https://json-schema.org/draft/2020-12/schema',
+		properties: {},
+		required: [],
+		type: 'object'
+	}
+	const flow: ExtendedOpenFlow = {
+		value: {
+			modules: structuredClone(initialModules),
+			preprocessor_module: structuredClone(initialPreprocessorModule),
+			failure_module: structuredClone(initialFailureModule)
+		},
+		summary: '',
+		schema: initialSchema ?? defaultSchema
+	}
+	const inlineScriptSession = createInlineScriptSession()
+	const flowFilePath = workspaceRoot ? join(workspaceRoot, 'flow.json') : null
+
+	/** Writes the current flow to `flow.json`; a no-op without a workspace root. */
+	const persistFlow = async (): Promise<void> => {
+		if (flowFilePath) {
+			await mkdir(dirname(flowFilePath), { recursive: true })
+			await writeFile(flowFilePath, JSON.stringify(flow, null, 2) + '\n', 'utf8')
+		}
+	}
+
+	/** Summary of a mocked test run: the requested args and the ids of the current modules. */
+	const describeTestRun = (args?: Record<string, any>) => ({
+		requestedArgs: args ?? {},
+		modules: flow.value.modules.map((module) => module.id),
+		preprocessor_module: flow.value.preprocessor_module?.id ?? null,
+		failure_module: flow.value.failure_module?.id ?? null
+	})
+
+	await persistFlow()
+
+	if (workspaceRoot) {
+		registerBenchmarkWorkspace(workspaceRoot)
+		if (workspaceFixtures) {
+			registerBenchmarkWorkspaceRunnables(workspaceRoot, workspaceFixtures)
+		}
+	}
+
+	const helpers: FlowAIChatHelpers = {
+		getFlowAndSelectedId: () => ({ flow, selectedId: '' }),
+		getModules: (id?: string) => {
+			if (!id) {
+				return flow.value.modules
+			}
+			const parent = getFlowModuleById(flow, id)
+			return parent ? getSubModules(parent).flat() : []
+		},
+		inlineScriptSession,
+		// Snapshots and the review/selection UI hooks below are stubbed out as no-ops.
+		setSnapshot: () => {},
+		revertToSnapshot: () => {},
+		setCode: async (id: string, code: string) => {
+			updateRawScriptModuleContent(flow, id, code)
+			inlineScriptSession.set(id, code)
+			await persistFlow()
+		},
+		setFlowJson: async (
+			modules: FlowModule[] | undefined,
+			schema: Record<string, any> | undefined,
+			preprocessorModule: FlowModule | null | undefined,
+			failureModule: FlowModule | null | undefined
+		) => {
+			const update = { modules, schema, preprocessorModule, failureModule }
+			applyFlowJsonUpdate(flow, inlineScriptSession, update)
+			await persistFlow()
+		},
+		getFlowInputsSchema: async () => flow.schema ?? {},
+		updateExprsToSet: (_id: string, _inputTransforms: Record<string, InputTransform>) => {},
+		acceptAllModuleActions: () => {},
+		rejectAllModuleActions: () => {},
+		hasPendingChanges: () => false,
+		selectStep: (_id: string) => {},
+		testFlow: async (args?: Record<string, any>) => {
+			// The flow is not executed: the request is recorded and a mocked completed job is returned.
+			if (workspaceRoot) {
+				const runRecord = JSON.stringify(describeTestRun(args), null, 2) + '\n'
+				await writeFile(join(workspaceRoot, 'test-run.json'), runRecord, 'utf8')
+			}
+			return createBenchmarkCompletedJob({
+				workspace: workspaceRoot ?? 'benchmark',
+				jobKind: 'flowpreview',
+				result: { ...describeTestRun(args), mocked: true },
+				logs: 'Mock benchmark flow test run completed successfully.'
+			})
+		},
+		getLintErrors: async () => EMPTY_SCRIPT_LINT_RESULT
+	}
+
+	return {
+		helpers,
+		getFlow: () => flow,
+		getModules: () => flow.value.modules,
+		cleanup: async () => {
+			if (!workspaceRoot) {
+				return
+			}
+			unregisterBenchmarkWorkspaceRunnables(workspaceRoot)
+			await rm(workspaceRoot, { recursive: true, force: true })
+		},
+		workspaceDir: workspaceRoot ?? null
+	}
+}
--- a/ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts
+++ b/ai_evals/adapters/frontend/core/flow/flowEvalRunner.ts
@@ -0,0 +1,107 @@
+import { mkdtemp } from 'fs/promises'
+import { tmpdir } from 'os'
+import { join } from 'path'
+import type { FlowModule } from '$lib/gen'
+import type { AIProvider } from '$lib/gen/types.gen'
+import type { ExtendedOpenFlow } from '$lib/components/flows/types'
+import {
+	flowTools,
+	prepareFlowSystemMessage,
+	prepareFlowUserMessage,
+	type FlowAIChatHelpers
+} from '../../../../../frontend/src/lib/components/copilot/chat/flow/core'
+import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import { createFlowFileHelpers, type FlowWorkspaceFixtures } from './fileHelpers'
+import { runEval } from '../shared'
+import type { ModeRunContext } from '../../../../core/types'
+import type { TokenUsage } from '../shared/types'
+
+/**
+ * Initial flow state for a flow eval.
+ * `runFlowEval` forwards these fields to `createFlowFileHelpers`.
+ */
+export interface FlowFixture {
+	value?: {
+		/** Modules the flow starts with; treated as an empty list when omitted. */
+		modules?: FlowModule[]
+		/** Optional preprocessor module of the initial flow. */
+		preprocessor_module?: FlowModule
+		/** Optional failure module of the initial flow. */
+		failure_module?: FlowModule
+	}
+	/** Flow input schema passed as the initial schema. */
+	schema?: Record<string, unknown>
+}
+
+/** Outcome of `runFlowEval`, mapped from the raw result returned by `runEval`. */
+export interface FlowEvalResult {
+	/** `success` flag of the raw eval result. */
+	success: boolean
+	/** The flow as returned by the helpers' `getFlow` accessor once the run ended. */
+	flow: ExtendedOpenFlow
+	/** `error` of the raw eval result, if any. */
+	error?: string
+	/** `iterations` of the raw eval result. */
+	assistantMessageCount: number
+	/** `toolCallsCount` of the raw eval result. */
+	toolCallCount: number
+	/** `toolsCalled` of the raw eval result. */
+	toolsUsed: string[]
+	/** `tokenUsage` of the raw eval result. */
+	tokenUsage: TokenUsage
+}
+
+/** Optional settings for `runFlowEval`. */
+export interface FlowEvalOptions {
+	/** Flow to start from; an empty flow is used when omitted. */
+	initialFlow?: FlowFixture
+	/** Scripts/flows registered for the benchmark workspace. */
+	workspaceFixtures?: FlowWorkspaceFixtures
+	/** Model id; `runFlowEval` falls back to 'claude-haiku-4-5-20251001'. */
+	model?: string
+	/** Forwarded to `runEval` as the iteration limit. */
+	maxIterations?: number
+	/** Provider forwarded to `runEval`. */
+	provider?: AIProvider
+	/**
+	 * Workspace directory; a fresh temp directory is created when omitted.
+	 * `runFlowEval` always calls the helpers' `cleanup` when it finishes.
+	 */
+	workspaceRoot?: string
+	/** Callbacks receiving assistant message start / chunk / end events. */
+	runContext?: ModeRunContext
+}
+
+/**
+ * Runs one flow-chat eval: prepares the flow helpers, drives the chat loop through
+ * `runEval` with the production flow tools and prompts, and maps the raw result.
+ *
+ * @param userPrompt Instruction given to the assistant.
+ * @param apiKey API key passed through to `runEval`.
+ * @param options Optional initial flow, fixtures, model/provider and run callbacks.
+ * @returns The resulting flow together with success, error and usage statistics.
+ */
+export async function runFlowEval(
+	userPrompt: string,
+	apiKey: string,
+	options?: FlowEvalOptions
+): Promise<FlowEvalResult> {
+	const opts: FlowEvalOptions = options ?? {}
+	const workspaceRoot =
+		opts.workspaceRoot ?? (await mkdtemp(join(tmpdir(), 'wmill-frontend-flow-benchmark-')))
+	const initialValue = opts.initialFlow?.value
+	const { helpers, getFlow, cleanup } = await createFlowFileHelpers(
+		initialValue?.modules ?? [],
+		opts.initialFlow?.schema,
+		initialValue?.preprocessor_module,
+		initialValue?.failure_module,
+		workspaceRoot,
+		opts.workspaceFixtures
+	)
+
+	try {
+		const systemMessage = prepareFlowSystemMessage()
+		const tools = flowTools as ProductionTool<FlowAIChatHelpers>[]
+		const model = opts.model ?? 'claude-haiku-4-5-20251001'
+		const userMessage = prepareFlowUserMessage(
+			userPrompt,
+			helpers.getFlowAndSelectedId(),
+			[],
+			helpers.inlineScriptSession
+		)
+
+		const raw = await runEval({
+			userPrompt,
+			systemMessage,
+			userMessage,
+			tools,
+			helpers,
+			apiKey,
+			getOutput: getFlow,
+			onAssistantMessageStart: opts.runContext?.onAssistantMessageStart,
+			onAssistantToken: opts.runContext?.onAssistantChunk,
+			onAssistantMessageEnd: opts.runContext?.onAssistantMessageEnd,
+			options: {
+				maxIterations: opts.maxIterations,
+				model,
+				workspace: workspaceRoot,
+				provider: opts.provider
+			}
+		})
+
+		const result: FlowEvalResult = {
+			flow: raw.output,
+			success: raw.success,
+			error: raw.error,
+			assistantMessageCount: raw.iterations,
+			toolCallCount: raw.toolCallsCount,
+			toolsUsed: raw.toolsCalled,
+			tokenUsage: raw.tokenUsage
+		}
+		return result
+	} finally {
+		// Always release the workspace, whether the run succeeded or threw.
+		await cleanup()
+	}
+}
--- a/ai_evals/adapters/frontend/core/script/fileHelpers.ts
+++ b/ai_evals/adapters/frontend/core/script/fileHelpers.ts
@@ -0,0 +1,73 @@
+import { mkdir, rm, writeFile } from 'fs/promises'
+import { dirname, join } from 'path'
+import type { ScriptLang } from '../../../../../frontend/src/lib/gen/types.gen'
+import type { ReviewChangesOpts } from '../../../../../frontend/src/lib/components/copilot/chat/monaco-adapter'
+import type { ScriptChatHelpers } from '../../../../../frontend/src/lib/components/copilot/chat/script/core'
+import { buildScriptLintResult } from './preview'
+import { registerBenchmarkWorkspace, unregisterBenchmarkWorkspace } from '../../mockBackend'
+
+/** State of the script being edited during a script eval. */
+export interface ScriptEvalState {
+	/** Current source code of the script. */
+	code: string
+	/** Script language; also selects which lint checks `buildScriptLintResult` applies. */
+	lang: ScriptLang | 'bunnative'
+	/** Script path; joined onto the workspace root to locate the on-disk copy. */
+	path: string
+	/** Script arguments, exposed through `getScriptOptions`. */
+	args: Record<string, any>
+}
+
+/**
+ * Builds the in-memory `ScriptChatHelpers` implementation used by script evals.
+ *
+ * The script state is a private deep clone of `initialScript`. When `workspaceRoot`
+ * is given, the code is also written to `<workspaceRoot>/<script.path>` and the
+ * workspace is registered with the benchmark mock backend.
+ *
+ * @param initialScript Script to start from (deep-cloned).
+ * @param workspaceRoot Optional directory for the on-disk copy. `cleanup` removes it recursively.
+ * @returns The helpers, a `getScript` accessor returning a deep clone of the state,
+ * a `cleanup` function, and `workspaceDir` (`null` when no workspace root was given).
+ */
+export async function createScriptFileHelpers(
+	initialScript: ScriptEvalState,
+	workspaceRoot?: string
+): Promise<{
+	helpers: ScriptChatHelpers
+	getScript: () => ScriptEvalState
+	cleanup: () => Promise<void>
+	workspaceDir: string | null
+}> {
+	// Private copy: it is only handed out as copies, so updating it in place is not observable.
+	const script = structuredClone(initialScript)
+	const scriptFilePath = workspaceRoot ? join(workspaceRoot, script.path) : null
+
+	/** Writes the current code to disk; a no-op without a workspace root. */
+	const persistScript = async (): Promise<void> => {
+		if (scriptFilePath) {
+			await mkdir(dirname(scriptFilePath), { recursive: true })
+			await writeFile(scriptFilePath, script.code, 'utf8')
+		}
+	}
+
+	await persistScript()
+
+	if (workspaceRoot) {
+		registerBenchmarkWorkspace(workspaceRoot)
+	}
+
+	const helpers: ScriptChatHelpers = {
+		getScriptOptions: () => {
+			const { code, lang, path } = script
+			return { code, lang, path, args: structuredClone(script.args) }
+		},
+		applyCode: async (code: string, opts?: ReviewChangesOpts) => {
+			// Revert requests are ignored; every other mode replaces the code.
+			if (opts?.mode !== 'revert') {
+				script.code = code
+				await persistScript()
+			}
+		},
+		getLintErrors: () => buildScriptLintResult(script.code, script.lang)
+	}
+
+	const cleanup = async (): Promise<void> => {
+		if (!workspaceRoot) {
+			return
+		}
+		unregisterBenchmarkWorkspace(workspaceRoot)
+		await rm(workspaceRoot, { recursive: true, force: true })
+	}
+
+	return {
+		helpers,
+		getScript: () => structuredClone(script),
+		cleanup,
+		workspaceDir: workspaceRoot ?? null
+	}
+}
--- a/ai_evals/adapters/frontend/core/script/preview.ts
+++ b/ai_evals/adapters/frontend/core/script/preview.ts
@@ -0,0 +1,96 @@
+import ts from 'typescript'
+import type { ScriptLang } from '../../../../../frontend/src/lib/gen/types.gen'
+import type { ScriptLintResult } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+
+/** Languages accepted by the script preview lint helpers. */
+export type ScriptPreviewLanguage = ScriptLang | 'bunnative'
+
+// Languages whose code is checked with the TypeScript compiler.
+const TS_LIKE_LANGUAGES = new Set<ScriptPreviewLanguage>(['bun', 'deno', 'nativets', 'bunnative'])
+// NOTE(review): currently lists exactly the same languages as TS_LIKE_LANGUAGES.
+const JS_LIKE_LANGUAGES = new Set<ScriptPreviewLanguage>(['bun', 'deno', 'nativets', 'bunnative'])
+
+/**
+ * Tells whether `code` contains an exported (optionally async) function declaration
+ * named `main` or `preprocessor`.
+ */
+function hasSupportedEntrypoint(code: string): boolean {
+	const exportedEntrypoint = /export\s+(async\s+)?function\s+(?:main|preprocessor)\s*\(/
+	return exportedEntrypoint.test(code)
+}
+
+/**
+ * Returns the TypeScript compiler options used to check a script, or `null` when
+ * `lang` is not one of the TypeScript-like languages.
+ */
+function compilerOptionsForLanguage(lang: ScriptPreviewLanguage): ts.CompilerOptions | null {
+	if (TS_LIKE_LANGUAGES.has(lang)) {
+		const options: ts.CompilerOptions = {
+			target: ts.ScriptTarget.ES2022,
+			module: ts.ModuleKind.ESNext,
+			moduleResolution: ts.ModuleResolutionKind.Bundler,
+			noEmit: true,
+			allowJs: true,
+			checkJs: false,
+			strict: false,
+			skipLibCheck: true
+		}
+		return options
+	}
+
+	return null
+}
+
+/**
+ * Converts a character offset into a 1-based line and column.
+ * Negative offsets are treated as 0; only `\n` counts as a line break.
+ */
+function getLineAndColumn(sourceText: string, start: number): { line: number; column: number } {
+	const before = sourceText.slice(0, start > 0 ? start : 0)
+	let line = 1
+	let lineStart = 0
+	// Walk every newline before the offset, remembering where the last line begins.
+	for (let idx = before.indexOf('\n'); idx !== -1; idx = before.indexOf('\n', idx + 1)) {
+		line += 1
+		lineStart = idx + 1
+	}
+	return { line, column: before.length - lineStart + 1 }
+}
+
+/**
+ * Lints a script for the eval harness.
+ *
+ * For TypeScript-like languages the code is run through `ts.transpileModule` and each
+ * reported diagnostic becomes an error; a missing exported `main`/`preprocessor`
+ * function is reported as an additional error. For every other language no checks
+ * are available here and a clean result is returned.
+ *
+ * @param code Script source to check.
+ * @param lang Language of the script.
+ * @returns Errors found (severity 8); warnings are never produced.
+ */
+export function buildScriptLintResult(
+	code: string,
+	lang: ScriptPreviewLanguage
+): ScriptLintResult {
+	const diagnostics: ScriptLintResult['errors'] = []
+	const compilerOptions = compilerOptionsForLanguage(lang)
+
+	if (compilerOptions) {
+		// The parsed source file is only used for its file name below.
+		const sourceFile = ts.createSourceFile(
+			'script.ts',
+			code,
+			ts.ScriptTarget.ES2022,
+			true,
+			JS_LIKE_LANGUAGES.has(lang) ? ts.ScriptKind.TS : ts.ScriptKind.JS
+		)
+		const output = ts.transpileModule(code, {
+			compilerOptions,
+			fileName: sourceFile.fileName,
+			reportDiagnostics: true
+		})
+
+		for (const diagnostic of output.diagnostics ?? []) {
+			const start = diagnostic.start ?? 0
+			const length = diagnostic.length ?? 1
+			const { line, column } = getLineAndColumn(code, start)
+			const message = ts.flattenDiagnosticMessageText(diagnostic.messageText, '\n')
+			diagnostics.push({
+				startLineNumber: line,
+				startColumn: column,
+				endLineNumber: line,
+				// Always mark at least one character so the range is never empty.
+				endColumn: column + Math.max(1, length),
+				message,
+				severity: 8
+			} as ScriptLintResult['errors'][number])
+		}
+
+		// The entrypoint pattern is TypeScript syntax, so it is only meaningful for
+		// TypeScript-like scripts; other languages must not be flagged by it.
+		if (!hasSupportedEntrypoint(code)) {
+			diagnostics.push({
+				startLineNumber: 1,
+				startColumn: 1,
+				endLineNumber: 1,
+				endColumn: 1,
+				message: 'Script must export a main or preprocessor function.',
+				severity: 8
+			} as ScriptLintResult['errors'][number])
+		}
+	}
+
+	return {
+		errorCount: diagnostics.length,
+		warningCount: 0,
+		errors: diagnostics,
+		warnings: []
+	}
+}
--- a/ai_evals/adapters/frontend/core/script/scriptEvalRunner.ts
+++ b/ai_evals/adapters/frontend/core/script/scriptEvalRunner.ts
@@ -0,0 +1,109 @@
+import { mkdtemp } from 'fs/promises'
+import { tmpdir } from 'os'
+import { join } from 'path'
+import type { AIProvider, AIProviderModel, ScriptLang } from '$lib/gen/types.gen'
+import type { ContextElement } from '../../../../../frontend/src/lib/components/copilot/chat/context'
+import {
+	prepareScriptSystemMessage,
+	prepareScriptTools,
+	prepareScriptUserMessage,
+	type ScriptChatHelpers
+} from '../../../../../frontend/src/lib/components/copilot/chat/script/core'
+import type { Tool as ProductionTool } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import { createScriptFileHelpers, type ScriptEvalState } from './fileHelpers'
+import { runEval } from '../shared'
+import type { ModeRunContext } from '../../../../core/types'
+import type { TokenUsage } from '../shared/types'
+
+/** Outcome of `runScriptEval`, mapped from the raw result returned by `runEval`. */
+export interface ScriptEvalResult {
+	/** `success` flag of the raw eval result. */
+	success: boolean
+	/** The script state as returned by the helpers' `getScript` accessor once the run ended. */
+	script: ScriptEvalState
+	/** `error` of the raw eval result, if any. */
+	error?: string
+	/** `iterations` of the raw eval result. */
+	assistantMessageCount: number
+	/** `toolCallsCount` of the raw eval result. */
+	toolCallCount: number
+	/** `toolsCalled` of the raw eval result. */
+	toolsUsed: string[]
+	/** `tokenUsage` of the raw eval result. */
+	tokenUsage: TokenUsage
+}
+
+/** Settings for `runScriptEval`. */
+export interface ScriptEvalOptions {
+	/** Script to start from; its language also selects the system prompt and tools. */
+	initialScript: ScriptEvalState
+	/** Model id; `runScriptEval` falls back to 'claude-haiku-4-5-20251001'. */
+	model?: string
+	/** Forwarded to `runEval` as the iteration limit. */
+	maxIterations?: number
+	/** Explicit provider; when omitted it is inferred from the model id. */
+	provider?: AIProvider
+	/**
+	 * Workspace directory; a fresh temp directory is created when omitted.
+	 * `runScriptEval` always calls the helpers' `cleanup` when it finishes.
+	 */
+	workspaceRoot?: string
+	/** Callbacks receiving assistant message start / chunk / end events. */
+	runContext?: ModeRunContext
+}
+
+/**
+ * Picks the provider for a model id.
+ *
+ * An explicit `provider` always wins. Otherwise the provider is inferred from the
+ * model id prefix: `claude*` -> anthropic, `gemini*` -> googleai, anything else -> openai.
+ * This mirrors the inference done by the shared eval runner so that prompts, tools
+ * and the chat client all target the same provider.
+ *
+ * @param model Model id.
+ * @param provider Optional explicit provider.
+ */
+function resolveModelProvider(
+	model: string,
+	provider?: AIProvider
+): AIProviderModel {
+	if (provider) {
+		return { provider, model }
+	}
+	if (model.startsWith('claude')) {
+		return { provider: 'anthropic', model }
+	}
+	if (model.startsWith('gemini')) {
+		return { provider: 'googleai', model }
+	}
+	return { provider: 'openai', model }
+}
+
+/**
+ * Runs one script-chat eval: prepares the script helpers, drives the chat loop through
+ * `runEval` with the production script tools and prompts, and maps the raw result.
+ *
+ * @param userPrompt Instruction given to the assistant.
+ * @param apiKey API key passed through to `runEval`.
+ * @param options Initial script plus optional model/provider and run callbacks.
+ * @returns The resulting script state together with success, error and usage statistics.
+ */
+export async function runScriptEval(
+	userPrompt: string,
+	apiKey: string,
+	options: ScriptEvalOptions
+): Promise<ScriptEvalResult> {
+	const { initialScript, runContext } = options
+	const workspaceRoot =
+		options.workspaceRoot ?? (await mkdtemp(join(tmpdir(), 'wmill-frontend-script-benchmark-')))
+	const { helpers, getScript, cleanup } = await createScriptFileHelpers(initialScript, workspaceRoot)
+
+	try {
+		const model = options.model ?? 'claude-haiku-4-5-20251001'
+		const modelProvider = resolveModelProvider(model, options.provider)
+		// No extra context elements are attached to the conversation.
+		const selectedContext: ContextElement[] = []
+		const systemMessage = prepareScriptSystemMessage(modelProvider, initialScript.lang, {})
+		const tools = prepareScriptTools(
+			modelProvider,
+			initialScript.lang,
+			selectedContext
+		) as ProductionTool<ScriptChatHelpers>[]
+		const userMessage = prepareScriptUserMessage(userPrompt, selectedContext)
+
+		const raw = await runEval({
+			userPrompt,
+			systemMessage,
+			userMessage,
+			tools,
+			helpers,
+			apiKey,
+			getOutput: getScript,
+			onAssistantMessageStart: runContext?.onAssistantMessageStart,
+			onAssistantToken: runContext?.onAssistantChunk,
+			onAssistantMessageEnd: runContext?.onAssistantMessageEnd,
+			options: {
+				maxIterations: options.maxIterations,
+				model,
+				workspace: workspaceRoot,
+				provider: modelProvider.provider
+			}
+		})
+
+		const result: ScriptEvalResult = {
+			script: raw.output,
+			success: raw.success,
+			error: raw.error,
+			assistantMessageCount: raw.iterations,
+			toolCallCount: raw.toolCallsCount,
+			toolsUsed: raw.toolsCalled,
+			tokenUsage: raw.tokenUsage
+		}
+		return result
+	} finally {
+		// Always release the workspace, whether the run succeeded or threw.
+		await cleanup()
+	}
+}
--- a/ai_evals/adapters/frontend/core/shared/baseEvalRunner.ts
+++ b/ai_evals/adapters/frontend/core/shared/baseEvalRunner.ts
@@ -0,0 +1,173 @@
+import type {
+	ChatCompletionMessageParam,
+	ChatCompletionSystemMessageParam
+} from 'openai/resources/chat/completions.mjs'
+import type { AIProviderModel } from '$lib/gen/types.gen'
+import type { TokenUsage, ToolCallDetail, EvalRunnerOptions, RawEvalResult } from './types'
+import { runChatLoop, type ChatClients } from '../../../../../frontend/src/lib/components/copilot/chat/chatLoop'
+import type {
+	Tool as ProductionTool,
+	ToolCallbacks
+} from '../../../../../frontend/src/lib/components/copilot/chat/shared'
+import {
+	createEvalClients,
+	type FrontendEvalProvider,
+	resolveEvalModelProvider
+} from './providerConfig'
+
+/**
+ * Parameters for running a base evaluation.
+ *
+ * @typeParam THelpers Domain-specific helper object handed to the tools.
+ * @typeParam TOutput Shape of the state returned by `getOutput`.
+ */
+export interface RunEvalParams<THelpers, TOutput> {
+	/** The user's prompt/instruction (not read by `runEval`; the conversation starts from `userMessage`) */
+	userPrompt: string
+	/** System message for the LLM */
+	systemMessage: ChatCompletionSystemMessageParam
+	/** User message for the LLM; used as the first message of the conversation */
+	userMessage: ChatCompletionMessageParam
+	/** Tool definitions for the LLM API (unused — derived from tools) */
+	toolDefs?: unknown
+	/** Full tool implementations for execution */
+	tools: ProductionTool<THelpers>[]
+	/** Domain-specific helpers for tool execution */
+	helpers: THelpers
+	/** API key for the provider */
+	apiKey: string
+	/** Function to get the current output state; called once when the run ends, on success or failure */
+	getOutput: () => TOutput
+	/** Optional configuration */
+	options?: EvalRunnerOptions
+	/** Called right before the first token of each assistant message is forwarded */
+	onAssistantMessageStart?: () => void
+	/** Called with every streamed assistant token */
+	onAssistantToken?: (token: string) => void
+	/** Called at the end of an assistant message, only if at least one token was forwarded for it */
+	onAssistantMessageEnd?: () => void
+}
+
+/**
+ * Runs a generic evaluation using the shared chat loop (same code path as production).
+ * Uses streaming via real provider SDKs instead of OpenRouter non-streaming.
+ *
+ * Every tool is wrapped so that calls are counted and their arguments recorded.
+ * Failures of the chat loop are not rethrown: they are reported through
+ * `success: false` and `error`, together with whatever output exists at that point.
+ *
+ * @param params Messages, tools, helpers, credentials and optional callbacks/options.
+ * @returns The raw eval result: output state, tool statistics, token usage and messages.
+ */
+export async function runEval<THelpers, TOutput>(
+	params: RunEvalParams<THelpers, TOutput>
+): Promise<RawEvalResult<TOutput>> {
+	const {
+		systemMessage,
+		userMessage,
+		tools,
+		helpers,
+		apiKey,
+		getOutput,
+		options,
+		onAssistantMessageStart,
+		onAssistantToken,
+		onAssistantMessageEnd
+	} = params
+	// True while no assistant message is open, i.e. the next token starts a new message.
+	let shouldEmitMessageStart = true
+
+	const model = options?.model ?? 'gpt-4o'
+	const maxIterations = options?.maxIterations ?? 20
+	const workspace = options?.workspace ?? 'test-workspace'
+	const provider = options?.provider
+
+	const modelProvider = resolveEvalModelProvider(
+		model,
+		provider as FrontendEvalProvider | undefined
+	) as AIProviderModel
+	const clients = createEvalClients(modelProvider.provider, apiKey) as ChatClients
+
+	const messages: ChatCompletionMessageParam[] = [userMessage]
+	let toolCallsCount = 0
+	const toolsCalled: string[] = []
+	const toolCallDetails: ToolCallDetail[] = []
+
+	// Wrap tools to intercept fn calls for tracking; the original fn still does the work.
+	const wrappedTools = tools.map((tool) => ({
+		...tool,
+		fn: async (p: any) => {
+			toolCallsCount++
+			toolsCalled.push(tool.def.function.name)
+			try {
+				const args =
+					typeof p.args === 'string' ? JSON.parse(p.args) : p.args
+				toolCallDetails.push({ name: tool.def.function.name, arguments: args })
+			} catch {
+				// Arguments that are not valid JSON are recorded verbatim.
+				toolCallDetails.push({
+					name: tool.def.function.name,
+					arguments: p.args
+				})
+			}
+			return tool.fn(p)
+		}
+	}))
+
+	// Tool status updates are ignored; token and message-end events are forwarded to the
+	// optional assistant callbacks, with a start event synthesized before the first token.
+	const callbacks: ToolCallbacks & {
+		onNewToken: (token: string) => void
+		onMessageEnd: () => void
+	} = {
+		setToolStatus: () => {},
+		removeToolStatus: () => {},
+		onNewToken: (token: string) => {
+			if (shouldEmitMessageStart) {
+				onAssistantMessageStart?.()
+				shouldEmitMessageStart = false
+			}
+			onAssistantToken?.(token)
+		},
+		onMessageEnd: () => {
+			if (!shouldEmitMessageStart) {
+				onAssistantMessageEnd?.()
+			}
+			shouldEmitMessageStart = true
+		}
+	}
+
+	const abortController = new AbortController()
+
+	try {
+		const result = await runChatLoop({
+			messages,
+			systemMessage,
+			tools: wrappedTools,
+			helpers,
+			abortController,
+			callbacks,
+			modelProvider,
+			clients,
+			workspace,
+			maxIterations,
+			skipResponsesApi: modelProvider.provider !== 'openai' && modelProvider.provider !== 'azure_openai'
+		})
+
+		return {
+			success: true,
+			output: getOutput(),
+			tokenUsage: result.tokenUsage,
+			toolCallsCount,
+			toolsCalled,
+			toolCallDetails,
+			// A successful run is reported with at least one iteration.
+			iterations: Math.max(1, result.addedMessages.filter((m) => m.role === 'assistant').length),
+			messages
+		}
+	} catch (err) {
+		// If the loop failed while an assistant message was still streaming, close it so
+		// every start event seen by the caller has a matching end event.
+		if (!shouldEmitMessageStart) {
+			shouldEmitMessageStart = true
+			onAssistantMessageEnd?.()
+		}
+
+		const errorMessage = err instanceof Error ? (err.stack ?? err.message) : String(err)
+
+		return {
+			success: false,
+			output: getOutput(),
+			error: errorMessage,
+			// Usage and iteration counts are not available when the loop throws.
+			tokenUsage: { prompt: 0, completion: 0, total: 0 },
+			toolCallsCount,
+			toolsCalled,
+			toolCallDetails,
+			iterations: 0,
+			messages
+		}
+	}
+}
--- a/ai_evals/adapters/frontend/core/shared/index.ts
+++ b/ai_evals/adapters/frontend/core/shared/index.ts
@@ -0,0 +1,3 @@
+export type { TokenUsage, ToolCallDetail, EvalRunnerOptions, RawEvalResult } from './types'
+export type { RunEvalParams } from './baseEvalRunner'
+export { runEval } from './baseEvalRunner'
--- a/ai_evals/adapters/frontend/core/shared/providerConfig.test.ts
+++ b/ai_evals/adapters/frontend/core/shared/providerConfig.test.ts
@@ -0,0 +1,41 @@
+import { describe, expect, it } from "bun:test";
+import {
+  buildOpenAICompatibleClientOptions,
+  resolveEvalModelProvider,
+} from "./providerConfig";
+
+// Covers the client options produced for Gemini (custom endpoint) and plain OpenAI.
+describe("buildOpenAICompatibleClientOptions", () => {
+  it("adds Gemini's OpenAI-compatible base URL and client header", () => {
+    const apiKey = "gemini-test-key";
+    const expectedSubset = {
+      apiKey,
+      baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
+      defaultHeaders: {
+        "x-goog-api-client": "windmill-ai-evals/1.0",
+      },
+    };
+
+    expect(buildOpenAICompatibleClientOptions("googleai", apiKey)).toMatchObject(expectedSubset);
+  });
+
+  it("keeps the default OpenAI-compatible config for OpenAI", () => {
+    const apiKey = "openai-test-key";
+    const options = buildOpenAICompatibleClientOptions("openai", apiKey);
+
+    // Exact match: no base URL or headers may be added for OpenAI.
+    expect(options).toEqual({ apiKey });
+  });
+});
+
+// Covers provider inference from model ids and the precedence of an explicit provider.
+describe("resolveEvalModelProvider", () => {
+  it("infers googleai from Gemini model ids", () => {
+    expect(resolveEvalModelProvider("gemini-2.5-flash")).toEqual({
+      provider: "googleai",
+      model: "gemini-2.5-flash",
+    });
+  });
+
+  it("infers anthropic from Claude model ids", () => {
+    expect(resolveEvalModelProvider("claude-haiku-4-5-20251001")).toEqual({
+      provider: "anthropic",
+      model: "claude-haiku-4-5-20251001",
+    });
+  });
+
+  it("falls back to openai for other model ids", () => {
+    expect(resolveEvalModelProvider("gpt-4o")).toEqual({
+      provider: "openai",
+      model: "gpt-4o",
+    });
+  });
+
+  it("preserves an explicit provider", () => {
+    expect(resolveEvalModelProvider("gemini-2.5-pro", "googleai")).toEqual({
+      provider: "googleai",
+      model: "gemini-2.5-pro",
+    });
+  });
+
+  it("lets an explicit provider override the inferred one", () => {
+    // The model id alone would resolve to googleai, so this only passes when the
+    // explicit provider really takes precedence.
+    expect(resolveEvalModelProvider("gemini-2.5-pro", "openai")).toEqual({
+      provider: "openai",
+      model: "gemini-2.5-pro",
+    });
+  });
+});
--- a/ai_evals/adapters/frontend/core/shared/providerConfig.ts
+++ b/ai_evals/adapters/frontend/core/shared/providerConfig.ts
@@ -0,0 +1,71 @@
+import Anthropic from "@anthropic-ai/sdk";
+import OpenAI from "openai";
+import type { FrontendEvalModelConfig } from "../../../../core/models";
+
+export type FrontendEvalProvider = FrontendEvalModelConfig["provider"];
+
+export interface EvalClients { // SDK client pair returned by createEvalClients
+  openai: OpenAI; // holds the real key unless the provider is "anthropic" (then apiKey is "unused")
+  anthropic: Anthropic; // holds the real key only when the provider is "anthropic"
+}
+
+export interface ResolvedEvalModelProvider { // return shape of resolveEvalModelProvider
+  provider: FrontendEvalProvider; // the explicit provider if given, otherwise inferred from the model id prefix
+  model: string; // the model id, passed through unchanged
+}
+
+const GEMINI_OPENAI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/";
+const GEMINI_GOOG_API_CLIENT = "windmill-ai-evals/1.0";
+
+/**
+ * Builds the OpenAI SDK constructor options for a non-Anthropic provider.
+ * "googleai" gets the Gemini OpenAI-compatible base URL plus an
+ * x-goog-api-client header; any other provider gets only the API key.
+ */
+export function buildOpenAICompatibleClientOptions(
+  provider: Exclude<FrontendEvalProvider, "anthropic">,
+  apiKey: string
+): ConstructorParameters<typeof OpenAI>[0] {
+  if (provider !== "googleai") {
+    return { apiKey };
+  }
+
+  const defaultHeaders = { "x-goog-api-client": GEMINI_GOOG_API_CLIENT };
+  return { apiKey, baseURL: GEMINI_OPENAI_BASE_URL, defaultHeaders };
+}
+
+/**
+ * Creates both SDK clients. Only the client matching `provider` receives the
+ * real key; the other one is constructed with the placeholder key "unused".
+ */
+export function createEvalClients(
+  provider: FrontendEvalProvider,
+  apiKey: string
+): EvalClients {
+  const openai =
+    provider === "anthropic"
+      ? new OpenAI({ apiKey: "unused" })
+      : new OpenAI(buildOpenAICompatibleClientOptions(provider, apiKey));
+  const anthropic = new Anthropic({ apiKey: provider === "anthropic" ? apiKey : "unused" });
+
+  return { openai, anthropic };
+}
+
+/**
+ * Picks the provider for an eval model id and returns it with the unchanged id.
+ * An explicit `provider` always wins; otherwise the id prefix decides:
+ * "claude*" -> anthropic, "gemini*" -> googleai, and every other id
+ * (including "gpt*" and "o*") falls back to openai.
+ */
+export function resolveEvalModelProvider(
+  model: string,
+  provider?: FrontendEvalProvider
+): ResolvedEvalModelProvider {
+  if (provider) {
+    return { provider, model };
+  }
+  if (model.startsWith("claude")) {
+    return { provider: "anthropic", model };
+  }
+  return { provider: model.startsWith("gemini") ? "googleai" : "openai", model };
+}
--- a/ai_evals/adapters/frontend/core/shared/types.ts
+++ b/ai_evals/adapters/frontend/core/shared/types.ts
@@ -0,0 +1,32 @@
+import type { ChatCompletionMessageParam } from 'openai/resources/chat/completions.mjs'
+import type { AIProvider } from '$lib/gen/types.gen'
+
+export interface TokenUsage { // element type of RawEvalResult.tokenUsage
+	prompt: number
+	completion: number
+	total: number // NOTE(review): presumably prompt + completion — confirm against the runner
+}
+
+export interface ToolCallDetail { // element type of RawEvalResult.toolCallDetails
+	name: string
+	arguments: Record<string, unknown> // arbitrary keyed values; no schema is enforced by this type
+}
+
+export interface EvalRunnerOptions { // every field is optional; NOTE(review): defaults are applied by the runner, not shown here
+	maxIterations?: number
+	model?: string
+	workspace?: string
+	provider?: AIProvider
+}
+
+export interface RawEvalResult<TOutput> { // generic over TOutput, the type of the `output` payload
+	success: boolean
+	output: TOutput
+	error?: string // optional; NOTE(review): presumably only set when success is false — confirm
+	tokenUsage: TokenUsage
+	toolCallsCount: number
+	toolsCalled: string[]
+	toolCallDetails: ToolCallDetail[]
+	iterations: number
+	messages: ChatCompletionMessageParam[] // OpenAI chat-completions message params
+}
--- a/ai_evals/adapters/frontend/mockBackend.ts
+++ b/ai_evals/adapters/frontend/mockBackend.ts
@@ -0,0 +1,270 @@
+import { randomUUID } from 'node:crypto'
+import type { CompletedJob, Flow, Script } from '../../../frontend/src/lib/gen'
+import type { ScriptLang } from '../../../frontend/src/lib/gen/types.gen'
+import { buildScriptLintResult } from './core/script/preview'
+
+const BENCHMARK_TIMESTAMP = '1970-01-01T00:00:00.000Z'
+
+export interface BenchmarkWorkspaceScript { // script fixture; expanded into a full Script by buildBenchmarkScript
+	path: string // also the input of the synthetic hash ("benchmark:<path>")
+	summary: string
+	description?: string // '' is used when omitted
+	language: Script['language']
+	schema?: Record<string, unknown> // {} is used when omitted
+	content: string
+}
+
+export interface BenchmarkWorkspaceFlow { // flow fixture; expanded into a Flow by buildBenchmarkFlow
+	path: string // lookup key used by getBenchmarkFlowByPath
+	summary: string
+	description?: string // '' is used when omitted
+	schema?: Record<string, unknown> // {} is used when omitted
+	value: Flow['value']
+}
+
+export interface BenchmarkWorkspaceRunnables { // fixtures registered per workspace
+	scripts?: BenchmarkWorkspaceScript[] // a missing list is treated as empty by listBenchmarkScripts
+	flows?: BenchmarkWorkspaceFlow[] // a missing list is treated as empty by listBenchmarkFlows
+}
+
+type BenchmarkCompletedJob = CompletedJob & { type: 'CompletedJob' }
+
+const benchmarkWorkspaces = new Set<string>()
+const benchmarkWorkspaceRunnables = new Map<string, BenchmarkWorkspaceRunnables>()
+const benchmarkJobs = new Map<string, { workspace: string; job: BenchmarkCompletedJob }>()
+
+/** Empties every in-memory benchmark registry: workspaces, runnables and jobs. */
+export function resetBenchmarkMockBackend(): void {
+	const registries = [benchmarkWorkspaces, benchmarkWorkspaceRunnables, benchmarkJobs]
+	registries.forEach((registry) => registry.clear())
+}
+
+export function registerBenchmarkWorkspace(workspace: string): void { // registers the id only; attaches no runnables
+	benchmarkWorkspaces.add(workspace)
+}
+
+export function registerBenchmarkWorkspaceRunnables( // registers the workspace and stores its runnables
+	workspace: string,
+	runnables: BenchmarkWorkspaceRunnables
+): void {
+	benchmarkWorkspaces.add(workspace)
+	benchmarkWorkspaceRunnables.set(workspace, runnables) // replaces any runnables stored earlier for this workspace
+}
+
+/** Forgets `workspace`: drops its registration, its runnables and every job recorded for it. */
+export function unregisterBenchmarkWorkspace(workspace: string): void {
+	benchmarkWorkspaces.delete(workspace)
+	benchmarkWorkspaceRunnables.delete(workspace)
+	const staleJobIds = [...benchmarkJobs]
+		.filter(([, entry]) => entry.workspace === workspace)
+		.map(([jobId]) => jobId)
+	staleJobIds.forEach((jobId) => benchmarkJobs.delete(jobId))
+}
+
+export function unregisterBenchmarkWorkspaceRunnables(workspace: string): void { // alias: removes the whole workspace, not only its runnables
+	unregisterBenchmarkWorkspace(workspace)
+}
+
+export function hasBenchmarkWorkspace(workspace: string): boolean { // true once the workspace was registered and not yet unregistered/reset
+	return benchmarkWorkspaces.has(workspace)
+}
+
+/**
+ * Lists the scripts of `workspace` as Script objects; null when no runnables are registered for it.
+ */
+export function listBenchmarkScripts(workspace: string): Script[] | null {
+	const registered = benchmarkWorkspaceRunnables.get(workspace)
+	return registered ? (registered.scripts ?? []).map(buildBenchmarkScript) : null
+}
+
+/**
+ * Lists the flows of `workspace` as Flow objects; null when no runnables are registered for it.
+ */
+export function listBenchmarkFlows(workspace: string): Flow[] | null {
+	const registered = benchmarkWorkspaceRunnables.get(workspace)
+	return registered ? (registered.flows ?? []).map(buildBenchmarkFlow) : null
+}
+
+/** Returns the registered script whose path equals `path`, or null if the workspace or script is unknown. */
+export function getBenchmarkScriptByPath(workspace: string, path: string): Script | null {
+	const candidates = benchmarkWorkspaceRunnables.get(workspace)?.scripts ?? []
+	const match = candidates.find((candidate) => candidate.path === path)
+
+	return match === undefined ? null : buildBenchmarkScript(match)
+}
+
+/** Returns the registered script whose synthetic hash equals `hash`, or null if nothing matches. */
+export function getBenchmarkScriptByHash(workspace: string, hash: string): Script | null {
+	const candidates = benchmarkWorkspaceRunnables.get(workspace)?.scripts ?? []
+	const match = candidates.find((candidate) => buildBenchmarkScriptHash(candidate.path) === hash)
+
+	return match === undefined ? null : buildBenchmarkScript(match)
+}
+
+/** Returns the registered flow whose path equals `path`, or null if the workspace or flow is unknown. */
+export function getBenchmarkFlowByPath(workspace: string, path: string): Flow | null {
+	const candidates = benchmarkWorkspaceRunnables.get(workspace)?.flows ?? []
+	const match = candidates.find((candidate) => candidate.path === path)
+
+	return match === undefined ? null : buildBenchmarkFlow(match)
+}
+
+export function createBenchmarkCompletedJob(input: { // stores an already-finished mock job and returns its generated id
+	workspace: string
+	jobKind: CompletedJob['job_kind']
+	success?: boolean
+	result?: unknown
+	logs?: string
+	scriptPath?: string
+	scriptHash?: string
+	args?: Record<string, unknown>
+}): string {
+	const jobId = `benchmark-job-${randomUUID()}`
+	const now = new Date().toISOString() // one timestamp shared by created_at, started_at and completed_at
+	const job: BenchmarkCompletedJob = {
+		type: 'CompletedJob',
+		id: jobId,
+		workspace_id: input.workspace,
+		created_by: 'ai-evals',
+		created_at: now,
+		started_at: now,
+		completed_at: now,
+		duration_ms: 0,
+		success: input.success ?? true, // treated as successful unless the caller says otherwise
+		script_path: input.scriptPath,
+		script_hash: input.scriptHash,
+		args: input.args,
+		result: input.result,
+		logs: input.logs,
+		canceled: false,
+		job_kind: input.jobKind,
+		permissioned_as: 'u/ai-evals',
+		is_flow_step: false,
+		is_skipped: false,
+		email: 'ai-evals@local',
+		visible_to_owner: true,
+		tag: 'benchmark'
+	}
+
+	benchmarkJobs.set(jobId, { workspace: input.workspace, job }) // workspace is kept next to the job for the lookup check in getBenchmarkCompletedJob
+	return jobId
+}
+
+/**
+ * Returns a structuredClone of the job stored under `jobId`; null if the id is unknown or belongs to another workspace.
+ */
+export function getBenchmarkCompletedJob(
+	workspace: string,
+	jobId: string
+): BenchmarkCompletedJob | null {
+	const stored = benchmarkJobs.get(jobId)
+	return stored?.workspace === workspace ? structuredClone(stored.job) : null
+}
+
+/**
+ * Mock script preview: runs buildScriptLintResult on the submitted content and
+ * records a completed 'preview' job that succeeds only when no errors are reported.
+ * Missing content/language fall back to '' and 'bun'. Returns the new job id.
+ */
+export function runBenchmarkScriptPreview(input: {
+	workspace: string
+	requestBody: {
+		content?: string
+		language?: ScriptLang | 'bunnative'
+		args?: Record<string, unknown>
+		path?: string
+	}
+}): string {
+	const { args, path } = input.requestBody
+	const lint = buildScriptLintResult(input.requestBody.content ?? '', input.requestBody.language ?? 'bun')
+	const passed = lint.errorCount === 0
+
+	// Both outcomes echo the path and args; only the rest of the payload differs.
+	const echoed = { path, args: args ?? {} }
+	const result = passed
+		? { ...echoed, validated: true }
+		: {
+				...echoed,
+				errorCount: lint.errorCount,
+				errors: lint.errors.map(({ startLineNumber, message }) => ({ line: startLineNumber, message }))
+			}
+
+	return createBenchmarkCompletedJob({
+		workspace: input.workspace,
+		jobKind: 'preview',
+		success: passed,
+		scriptPath: path,
+		args,
+		result
+	})
+}
+
+/**
+ * Mock flow run: records a completed 'flowpreview' job without executing anything.
+ * The job succeeds only when `path` is a flow registered in the workspace. Returns the job id.
+ */
+export function runBenchmarkFlowByPath(input: {
+	workspace: string
+	path: string
+	args?: Record<string, unknown>
+}): string {
+	const found = getBenchmarkFlowByPath(input.workspace, input.path) !== null
+	const outcome = found
+		? {
+				result: { path: input.path, args: input.args ?? {}, mocked: true },
+				logs: 'Mock benchmark flow run completed successfully.'
+			}
+		: {
+				result: { error: `Flow "${input.path}" not found in benchmark workspace` },
+				logs: `Flow "${input.path}" not found in benchmark workspace.`
+			}
+	return createBenchmarkCompletedJob({
+		workspace: input.workspace,
+		jobKind: 'flowpreview',
+		success: found,
+		args: input.args,
+		...outcome
+	})
+}
+
+function buildBenchmarkScriptHash(path: string): string {
+	return 'benchmark:' + path // synthetic hash: the script path behind a fixed "benchmark:" prefix
+}
+
+function buildBenchmarkScript(script: BenchmarkWorkspaceScript): Script { // expands a fixture into a full Script record
+	return {
+		workspace_id: 'benchmark', // fixed literal, not the id the fixture was registered under
+		hash: buildBenchmarkScriptHash(script.path), // derived from the path so getBenchmarkScriptByHash can recompute it
+		path: script.path,
+		parent_hashes: [],
+		summary: script.summary,
+		description: script.description ?? '',
+		content: script.content,
+		created_by: 'benchmark',
+		created_at: BENCHMARK_TIMESTAMP, // constant timestamp, identical for every script
+		archived: false,
+		schema: script.schema ?? {},
+		deleted: false,
+		is_template: false,
+		extra_perms: {},
+		language: script.language,
+		kind: 'script',
+		starred: false,
+		has_preprocessor: false,
+		modules: null
+	}
+}
+
+function buildBenchmarkFlow(flow: BenchmarkWorkspaceFlow): Flow { // expands a fixture into a Flow record
+	return {
+		path: flow.path,
+		summary: flow.summary,
+		description: flow.description ?? '',
+		value: flow.value, // same object reference as the fixture, not a copy
+		schema: flow.schema ?? {},
+		edited_by: 'benchmark',
+		edited_at: BENCHMARK_TIMESTAMP, // constant timestamp, identical for every flow
+		archived: false,
+		extra_perms: {}
+	} as Flow // NOTE(review): the cast suggests the literal does not satisfy Flow on its own — confirm which fields are missing
+}
--- a/ai_evals/adapters/frontend/progress.ts
+++ b/ai_evals/adapters/frontend/progress.ts
@@ -0,0 +1,133 @@
+export type FrontendBenchmarkProgressSurface = 'flow' | 'app' | 'script'
+
+export type FrontendBenchmarkProgressEvent = // discriminated by `type`; serialized as JSON by emitFrontendBenchmarkProgress
+	| {
+			type: 'run-start' // formatted as "Running <surface>: <totalCases> cases x <runs> run(s), concurrency <n>"
+			surface: FrontendBenchmarkProgressSurface
+			totalCases: number
+			runs: number
+			concurrency: number
+	  }
+	| {
+			type: 'attempt-start' // formatted as "[caseNumber/totalCases] <caseId> attempt <attempt>/<runs>..."
+			surface: FrontendBenchmarkProgressSurface
+			caseId: string
+			caseNumber: number
+			totalCases: number
+			attempt: number
+			runs: number
+	  }
+	| {
+			type: 'attempt-finish' // formatted with pass/fail, duration, optional judge score and optional error
+			surface: FrontendBenchmarkProgressSurface
+			caseId: string
+			caseNumber: number
+			totalCases: number
+			attempt: number
+			runs: number
+			passed: boolean
+			durationMs: number
+			judgeScore: number | null // null omits the "judge ..." part of the formatted line
+			error: string | null // when truthy, appended to the formatted line truncated to 120 characters
+	  }
+	| {
+			type: 'assistant-message-start' // formatFrontendBenchmarkProgressEvent renders this as ''
+			surface: FrontendBenchmarkProgressSurface
+			caseId: string
+			caseNumber: number
+			totalCases: number
+			attempt: number
+			runs: number
+	  }
+	| {
+			type: 'assistant-chunk' // formatFrontendBenchmarkProgressEvent renders this as ''
+			surface: FrontendBenchmarkProgressSurface
+			caseId: string
+			caseNumber: number
+			totalCases: number
+			attempt: number
+			runs: number
+			chunk: string
+	  }
+	| {
+			type: 'assistant-message-end' // formatFrontendBenchmarkProgressEvent renders this as ''
+			surface: FrontendBenchmarkProgressSurface
+			caseId: string
+			caseNumber: number
+			totalCases: number
+			attempt: number
+			runs: number
+	  }
+
+export const FRONTEND_BENCHMARK_PROGRESS_PREFIX = 'WMILL_FRONTEND_AI_EVAL_PROGRESS '
+
+/** Writes `event` to stderr as one line: the progress prefix followed by its JSON encoding. */
+export function emitFrontendBenchmarkProgress(event: FrontendBenchmarkProgressEvent): void {
+	const payload = JSON.stringify(event)
+	process.stderr.write(`${FRONTEND_BENCHMARK_PROGRESS_PREFIX}${payload}\n`)
+}
+
+/**
+ * Decodes a line written by emitFrontendBenchmarkProgress. Yields null when the
+ * prefix is missing, the JSON is malformed, or the payload has no truthy `type`.
+ */
+export function parseFrontendBenchmarkProgressLine(
+	line: string
+): FrontendBenchmarkProgressEvent | null {
+	if (!line.startsWith(FRONTEND_BENCHMARK_PROGRESS_PREFIX)) return null
+	const payload = line.slice(FRONTEND_BENCHMARK_PROGRESS_PREFIX.length)
+	try {
+		const event = JSON.parse(payload) as FrontendBenchmarkProgressEvent
+		return event?.type ? event : null
+	} catch {
+		return null
+	}
+}
+
+/**
+ * Renders a progress event as one console line. Only run-start, attempt-start and
+ * attempt-finish have a textual form; the assistant streaming events render as ''.
+ */
+export function formatFrontendBenchmarkProgressEvent(
+	event: FrontendBenchmarkProgressEvent
+): string {
+	switch (event.type) {
+		case 'run-start': {
+			const plural = event.runs === 1 ? '' : 's'
+			return `Running ${event.surface}: ${event.totalCases} cases x ${event.runs} run${plural}, concurrency ${event.concurrency}`
+		}
+		case 'attempt-start':
+			return `${formatCasePrefix(event.caseNumber, event.totalCases)} ${event.caseId} attempt ${event.attempt}/${event.runs}...`
+		case 'attempt-finish': {
+			const verdict = event.passed ? 'pass' : 'fail'
+			const head = `${formatCasePrefix(event.caseNumber, event.totalCases)} ${event.caseId} attempt ${event.attempt}/${event.runs} ${verdict}`
+			const judge = event.judgeScore !== null ? [`judge ${formatNumber(event.judgeScore)}`] : []
+			const failure = event.error ? [truncateSingleLine(event.error, 120)] : []
+			return [head, formatDuration(event.durationMs), ...judge, ...failure].join(' | ')
+		}
+		case 'assistant-message-start':
+		case 'assistant-chunk':
+		case 'assistant-message-end':
+			return ''
+	}
+}
+
+function formatCasePrefix(caseNumber: number, totalCases: number): string {
+	return '[' + caseNumber + '/' + totalCases + ']' // e.g. "[3/12]"
+}
+
+function formatDuration(durationMs: number): string {
+	return formatNumber(durationMs / 1000) + 's' // milliseconds -> seconds, suffixed with "s"
+}
+
+function formatNumber(value: number): string {
+	return !Number.isInteger(value) ? value.toFixed(1) : `${value}` // integers verbatim, everything else with one decimal
+}
+
+/** Collapses whitespace runs to single spaces, trims, and caps the text at `maxLength` with a "..." tail. */
+function truncateSingleLine(value: string, maxLength: number): string {
+	const flattened = value.split(/\s+/).join(' ').trim()
+	const keep = Math.max(0, maxLength - 3)
+
+	return flattened.length > maxLength ? `${flattened.slice(0, keep)}...` : flattened
+}
--- a/ai_evals/adapters/frontend/runtime.ts
+++ b/ai_evals/adapters/frontend/runtime.ts
@@ -0,0 +1,218 @@
+import { spawn } from 'node:child_process'
+import { mkdtemp, readFile, rm } from 'node:fs/promises'
+import { tmpdir } from 'node:os'
+import path from 'node:path'
+import { fileURLToPath } from 'node:url'
+import {
+	formatFrontendBenchmarkProgressEvent,
+	parseFrontendBenchmarkProgressLine
+} from './progress'
+import type { BenchmarkRunResult } from '../../core/types'
+
+const REPO_ROOT = fileURLToPath(new URL('../../../', import.meta.url))
+const FRONTEND_DIR = path.join(REPO_ROOT, 'frontend')
+const FRONTEND_BENCHMARK_TEST = '../ai_evals/adapters/frontend/vitestAdapter.test.ts'
+const FRONTEND_BENCHMARK_CONFIG = '../ai_evals/adapters/frontend/vitest.config.ts'
+
+export type FrontendMode = 'flow' | 'app' | 'script'
+
+/**
+ * Runs the frontend benchmark in a vitest child process and returns the parsed result.
+ * Inputs reach the child through WMILL_FRONTEND_AI_EVAL_* environment variables; the result
+ * is read back from result.json inside a temporary directory that is always removed.
+ * Any failure is rethrown as "Frontend benchmark adapter failed:\n<message>".
+ */
+export async function runFrontendBenchmarkAdapter(input: {
+	mode: FrontendMode
+	caseIds: string[]
+	runs: number
+	model?: string
+	verbose?: boolean
+	backendValidation?: string
+}): Promise<BenchmarkRunResult> {
+	const tempDir = await mkdtemp(path.join(tmpdir(), 'wmill-frontend-benchmark-'))
+	const outputPath = path.join(tempDir, 'result.json')
+
+	try {
+		const vitestBin = path.join(FRONTEND_DIR, 'node_modules', '.bin', 'vitest')
+		const vitestArgs = [
+			'run',
+			FRONTEND_BENCHMARK_TEST,
+			'--project',
+			'server',
+			'--config',
+			FRONTEND_BENCHMARK_CONFIG
+		]
+		const env: NodeJS.ProcessEnv = {
+			...process.env,
+			BROWSERSLIST_IGNORE_OLD_DATA: '1',
+			WMILL_FRONTEND_AI_EVAL_OUTPUT_PATH: outputPath,
+			WMILL_FRONTEND_AI_EVAL_MODE: input.mode,
+			WMILL_FRONTEND_AI_EVAL_CASE_IDS: JSON.stringify(input.caseIds),
+			WMILL_FRONTEND_AI_EVAL_RUNS: String(input.runs),
+			WMILL_FRONTEND_AI_EVAL_MODEL: input.model ?? '',
+			WMILL_FRONTEND_AI_EVAL_PROGRESS: '1',
+			WMILL_FRONTEND_AI_EVAL_VERBOSE: input.verbose ? '1' : '0',
+			WMILL_FRONTEND_AI_EVAL_BACKEND_VALIDATION: input.backendValidation ?? ''
+		}
+		await runVitestBenchmark(vitestBin, vitestArgs, { cwd: FRONTEND_DIR, env })
+		return JSON.parse(await readFile(outputPath, 'utf8')) as BenchmarkRunResult
+	} catch (error) {
+		throw new Error(`Frontend benchmark adapter failed:\n${toErrorMessage(error)}`)
+	} finally {
+		await rm(tempDir, { recursive: true, force: true })
+	}
+}
+
+/**
+ * Spawns `command` and resolves once it exits with code 0. stdout is buffered;
+ * stderr is fed through drainProgressLines so progress events are rendered live.
+ * Rejects with the exit code (or the terminating signal) plus the captured output.
+ */
+async function runVitestBenchmark(
+	command: string,
+	args: string[],
+	options: {
+		cwd: string
+		env: NodeJS.ProcessEnv
+	}
+): Promise<void> {
+	const child = spawn(command, args, {
+		cwd: options.cwd,
+		env: options.env,
+		stdio: ['ignore', 'pipe', 'pipe']
+	})
+
+	let stdout = ''
+	let stderr = ''
+	let pendingStderr = ''
+	let assistantStreamOpen = false
+
+	// Runs buffered stderr text through the progress parser and keeps the unfinished tail.
+	const drainStderr = (text: string): void => {
+		const drained = drainProgressLines(text, assistantStreamOpen)
+		pendingStderr = drained.remainder
+		stderr += drained.passthrough
+		assistantStreamOpen = drained.nextAssistantStreamOpen
+	}
+
+	child.stdout?.setEncoding('utf8')
+	child.stdout?.on('data', (chunk: string) => {
+		stdout += chunk
+	})
+	child.stderr?.setEncoding('utf8')
+	child.stderr?.on('data', (chunk: string) => drainStderr(pendingStderr + chunk))
+
+	await new Promise<void>((resolve, reject) => {
+		child.once('error', reject)
+		child.once('close', (code, signal) => {
+			// Flush a final line that arrived without a trailing newline.
+			if (pendingStderr.length > 0) {
+				drainStderr(`${pendingStderr}\n`)
+			}
+			// Terminate a half-printed assistant message on success and failure alike.
+			if (assistantStreamOpen) {
+				process.stderr.write('\n')
+			}
+			if (code === 0) {
+				resolve()
+				return
+			}
+
+			const outcome =
+				code === null
+					? `vitest was terminated by signal ${signal}`
+					: `vitest exited with code ${code}`
+			reject(new Error([outcome, stdout, stderr].filter(Boolean).join('\n')))
+		})
+	})
+}
+
+/**
+ * Consumes the complete lines of `buffer`: progress events are rendered to stderr, noise lines
+ * are dropped, everything else is echoed and appended to `passthrough`. `remainder` is the
+ * unterminated tail; `nextAssistantStreamOpen` carries the streaming state to the next call.
+ */
+function drainProgressLines(
+	buffer: string,
+	initialAssistantStreamOpen: boolean = false
+): {
+	remainder: string
+	passthrough: string
+	nextAssistantStreamOpen: boolean
+} {
+	let remainder = buffer
+	let passthrough = ''
+	let assistantStreamOpen = initialAssistantStreamOpen
+
+	while (true) {
+		const newlineIndex = remainder.indexOf('\n')
+		if (newlineIndex === -1) {
+			return { remainder, passthrough, nextAssistantStreamOpen: assistantStreamOpen }
+		}
+
+		const line = remainder.slice(0, newlineIndex).replace(/\r$/, '') // tolerate CRLF line endings
+		remainder = remainder.slice(newlineIndex + 1)
+
+		const progressEvent = parseFrontendBenchmarkProgressLine(line)
+		if (progressEvent) {
+			if (progressEvent.type === 'assistant-message-start') {
+				if (assistantStreamOpen) {
+					process.stderr.write('\n') // close the previous message that never got its end event
+				}
+				process.stderr.write(
+					`${formatCasePrefix(progressEvent.caseNumber, progressEvent.totalCases)} ${progressEvent.caseId} attempt ${progressEvent.attempt}/${progressEvent.runs} assistant:\n`
+				)
+				assistantStreamOpen = true
+				continue
+			}
+
+			if (progressEvent.type === 'assistant-chunk') {
+				process.stderr.write(progressEvent.chunk) // written verbatim, no newline added
+				continue
+			}
+
+			if (progressEvent.type === 'assistant-message-end') {
+				if (assistantStreamOpen) {
+					process.stderr.write('\n')
+				}
+				assistantStreamOpen = false
+				continue
+			}
+
+			if (assistantStreamOpen) {
+				process.stderr.write('\n') // any other event interrupts an open assistant message
+				assistantStreamOpen = false
+			}
+			process.stderr.write(`${formatFrontendBenchmarkProgressEvent(progressEvent)}\n`)
+			continue
+		}
+
+		if (shouldSuppressFrontendStderrLine(line)) {
+			continue
+		}
+
+		passthrough += `${line}\n`
+		process.stderr.write(`${line}\n`)
+	}
+}
+
+function formatCasePrefix(caseNumber: number, totalCases: number): string {
+	return ['[', caseNumber, '/', totalCases, ']'].join('') // e.g. "[3/12]"
+}
+
+/** True for the Browserslist / baseline-browser-mapping notice lines that are kept out of the stderr output. */
+function shouldSuppressFrontendStderrLine(line: string): boolean {
+	const noisyPrefixes = ['[baseline-browser-mapping] ', 'Browserslist: browsers data (caniuse-lite) is ']
+	const noisyFragments = ['update-browserslist-db@latest', 'update-db#readme']
+	const hasNoisyPrefix = noisyPrefixes.some((prefix) => line.startsWith(prefix))
+
+	return hasNoisyPrefix || noisyFragments.some((fragment) => line.includes(fragment))
+}
+
+/**
+ * Extracts `message` from Error instances; any other value is converted with String().
+ */
+function toErrorMessage(error: unknown): string {
+	return error instanceof Error ? error.message : String(error)
+}
--- a/ai_evals/adapters/frontend/vitest.config.ts
+++ b/ai_evals/adapters/frontend/vitest.config.ts
@@ -0,0 +1,28 @@
+import { fileURLToPath } from 'node:url'
+import frontendConfig from '../../../frontend/vite.config.js'
+
+const FRONTEND_VITE_CONFIG_PATH = fileURLToPath(new URL('../../../frontend/vite.config.js', import.meta.url))
+const FRONTEND_TEST_SETUP_PATH = fileURLToPath(
+	new URL('../../../frontend/src/lib/test-setup.ts', import.meta.url)
+)
+const ADAPTER_TEST_PATH = fileURLToPath(new URL('./vitestAdapter.test.ts', import.meta.url))
+
+const config = { // the frontend Vite config with test.projects replaced by a single project
+	...frontendConfig,
+	test: {
+		...frontendConfig.test,
+		projects: [
+			{
+				extends: FRONTEND_VITE_CONFIG_PATH,
+				test: {
+					name: 'server', // NOTE(review): presumably has to match the `--project server` flag passed by runtime.ts — confirm
+					environment: 'node',
+					include: [ADAPTER_TEST_PATH], // only the adapter test file is collected
+					setupFiles: [FRONTEND_TEST_SETUP_PATH]
+				}
+			}
+		]
+	}
+}
+
+export default config
--- a/ai_evals/adapters/frontend/vitestAdapter.test.ts
+++ b/ai_evals/adapters/frontend/vitestAdapter.test.ts
@@ -0,0 +1,165 @@
+import { expect, it, vi } from 'vitest'
+// @ts-ignore - Node.js fs/promises
+import { mkdir, writeFile } from 'fs/promises'
+// @ts-ignore - Node.js path
+import { dirname, resolve } from 'path'
+
+vi.mock('monaco-editor', () => ({ // stub module: empty namespaces plus the few values defined below
+	editor: {},
+	languages: {},
+	KeyCode: {},
+	Uri: {
+		parse: (value: string) => ({ toString: () => value }) // round-trips the input string
+	},
+	MarkerSeverity: {
+		Error: 8,
+		Warning: 4,
+		Info: 2,
+		Hint: 1
+	}
+}))
+
+vi.mock('@codingame/monaco-vscode-standalone-typescript-language-features', () => ({ // stub module
+	getTypeScriptWorker: async () => async () => ({}), // resolves to a worker getter that yields an empty object
+	typescriptVersion: 'test'
+}))
+
+vi.mock('@codingame/monaco-vscode-languages-service-override', () => ({ // stub module
+	default: () => ({})
+}))
+
+vi.mock('$lib/components/vscode', () => ({})) // replaced by an empty module
+
+vi.mock('$lib/gen', async () => { // real $lib/gen, except Script/Flow/Job services answer benchmark workspaces from mockBackend
+	const actual = await vi.importActual<any>('$lib/gen')
+	const {
+		getBenchmarkCompletedJob,
+		getBenchmarkFlowByPath,
+		getBenchmarkScriptByHash,
+		getBenchmarkScriptByPath,
+		hasBenchmarkWorkspace,
+		listBenchmarkFlows,
+		listBenchmarkScripts,
+		runBenchmarkFlowByPath,
+		runBenchmarkScriptPreview
+	} = await import('./mockBackend')
+
+	function wrapService<T extends object>(target: T, overrides: Record<string, unknown>): T { // Proxy: overridden names win, everything else reads through to target
+		return new Proxy(target, {
+			get(source, property, receiver) {
+				if (typeof property === 'string' && property in overrides) {
+					return overrides[property]
+				}
+				return Reflect.get(source, property, receiver)
+			}
+		})
+	}
+
+	return {
+		...actual,
+		ScriptService: wrapService(actual.ScriptService, {
+			listScripts: async (data: { workspace: string }) =>
+				hasBenchmarkWorkspace(data.workspace)
+					? (listBenchmarkScripts(data.workspace) ?? []) // registered workspace without runnables lists as empty
+					: actual.ScriptService.listScripts(data),
+			getScriptByPath: async (data: { workspace: string; path: string }) => {
+				if (hasBenchmarkWorkspace(data.workspace)) {
+					const script = getBenchmarkScriptByPath(data.workspace, data.path)
+					if (!script) {
+						throw new Error(`Script "${data.path}" not found in benchmark workspace`)
+					}
+					return script
+				}
+				return actual.ScriptService.getScriptByPath(data)
+			},
+			getScriptByHash: async (data: { workspace: string; hash: string }) => {
+				if (hasBenchmarkWorkspace(data.workspace)) {
+					const script = getBenchmarkScriptByHash(data.workspace, data.hash)
+					if (!script) {
+						throw new Error(`Script hash "${data.hash}" not found in benchmark workspace`)
+					}
+					return script
+				}
+				return actual.ScriptService.getScriptByHash(data)
+			}
+		}),
+		FlowService: wrapService(actual.FlowService, {
+			listFlows: async (data: { workspace: string }) =>
+				hasBenchmarkWorkspace(data.workspace)
+					? (listBenchmarkFlows(data.workspace) ?? []) // registered workspace without runnables lists as empty
+					: actual.FlowService.listFlows(data),
+			getFlowByPath: async (data: { workspace: string; path: string }) => {
+				if (hasBenchmarkWorkspace(data.workspace)) {
+					const flow = getBenchmarkFlowByPath(data.workspace, data.path)
+					if (!flow) {
+						throw new Error(`Flow "${data.path}" not found in benchmark workspace`)
+					}
+					return flow
+				}
+				return actual.FlowService.getFlowByPath(data)
+			}
+		}),
+		JobService: wrapService(actual.JobService, {
+			runScriptPreview: async (data: { // benchmark workspaces get a mock job id from runBenchmarkScriptPreview
+				workspace: string
+				requestBody?: {
+					content?: string
+					language?: string
+					args?: Record<string, unknown>
+					path?: string
+				}
+			}) =>
+				hasBenchmarkWorkspace(data.workspace)
+					? runBenchmarkScriptPreview({
+							workspace: data.workspace,
+							requestBody: data.requestBody ?? {}
+						})
+					: actual.JobService.runScriptPreview(data),
+			runFlowByPath: async (data: { // benchmark workspaces get a mock job id from runBenchmarkFlowByPath
+				workspace: string
+				path: string
+				requestBody?: Record<string, unknown>
+			}) =>
+				hasBenchmarkWorkspace(data.workspace)
+					? runBenchmarkFlowByPath({
+							workspace: data.workspace,
+							path: data.path,
+							args: data.requestBody // the request body is used as the flow args
+						})
+					: actual.JobService.runFlowByPath(data),
+			getJob: async (data: { workspace: string; id: string }) => {
+				if (hasBenchmarkWorkspace(data.workspace)) {
+					const job = getBenchmarkCompletedJob(data.workspace, data.id)
+					if (!job) {
+						throw new Error(`Job "${data.id}" not found in benchmark workspace`)
+					}
+					return job
+				}
+				return actual.JobService.getJob(data)
+			}
+		})
+	}
+})
+
+const benchmarkOutputPath = process.env.WMILL_FRONTEND_AI_EVAL_OUTPUT_PATH
+const benchmarkIt = benchmarkOutputPath ? it : it.skip // skipped unless an output path is provided via the environment
+
+benchmarkIt(
+	'runs the frontend benchmark adapter from environment input',
+	async () => {
+		const { resetBenchmarkMockBackend } = await import('./mockBackend')
+		resetBenchmarkMockBackend() // start from empty registries
+		const { runFrontendBenchmarkFromEnv } = await import('./benchmarkRunner')
+		try {
+			const payload = await runFrontendBenchmarkFromEnv()
+			const absoluteOutputPath = resolve(benchmarkOutputPath!) // non-null: this body only runs when the path is set
+			await mkdir(dirname(absoluteOutputPath), { recursive: true })
+			await writeFile(absoluteOutputPath, JSON.stringify(payload, null, 2) + '\n', 'utf8')
+
+			expect(payload.cases.length).toBeGreaterThan(0) // asserted after the result file has been written
+		} finally {
+			resetBenchmarkMockBackend() // always clear the registries again
+		}
+	},
+	600_000 // test timeout in milliseconds
+)
--- a/ai_evals/bun.lock
+++ b/ai_evals/bun.lock
@@ -0,0 +1,313 @@
+{
+  "lockfileVersion": 1,
+  "configVersion": 1,
+  "workspaces": {
+    "": {
+      "name": "windmill-ai-evals",
+      "dependencies": {
+        "@anthropic-ai/claude-agent-sdk": "^0.2.25",
+        "@anthropic-ai/sdk": "^0.39.0",
+        "commander": "^14.0.3",
+        "openai": "^6.9.1",
+        "yaml": "^2.8.3",
+      },
+      "devDependencies": {
+        "@types/bun": "latest",
+        "typescript": "^5.0.0",
+      },
+    },
+  },
+  "packages": {
+    "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.87", "", { "dependencies": { "@anthropic-ai/sdk": "^0.74.0", "@modelcontextprotocol/sdk": "^1.27.1" }, "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-WWmgBPxPhBOvNT0ujI8vPTI2lK+w5YEkEZ/y1mH0EDkK/0kBnxVJNhCtG5vnueiAViwLoUOFn66pbkDiivijdA=="],
+
+    "@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.39.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" } }, "sha512-eMyDIPRZbt1CCLErRCi3exlAvNkBtRe+kW5vvJyef93PmNr/clstYgHhtvmkxN82nlKgzyGPCyGxrm0JQ1ZIdg=="],
+
+    "@babel/runtime": ["@babel/runtime@7.29.2", "", {}, "sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g=="],
+
+    "@hono/node-server": ["@hono/node-server@1.19.12", "", { "peerDependencies": { "hono": "^4" } }, "sha512-txsUW4SQ1iilgE0l9/e9VQWmELXifEFvmdA1j6WFh/aFPj99hIntrSsq/if0UWyGVkmrRPKA1wCeP+UCr1B9Uw=="],
+
+    "@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.2.4" }, "os": "darwin", "cpu": "arm64" }, "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w=="],
+
+    "@img/sharp-darwin-x64": ["@img/sharp-darwin-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-x64": "1.2.4" }, "os": "darwin", "cpu": "x64" }, "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw=="],
+
+    "@img/sharp-libvips-darwin-arm64": ["@img/sharp-libvips-darwin-arm64@1.2.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g=="],
+
+    "@img/sharp-libvips-darwin-x64": ["@img/sharp-libvips-darwin-x64@1.2.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg=="],
+
+    "@img/sharp-libvips-linux-arm": ["@img/sharp-libvips-linux-arm@1.2.4", "", { "os": "linux", "cpu": "arm" }, "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A=="],
+
+    "@img/sharp-libvips-linux-arm64": ["@img/sharp-libvips-linux-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw=="],
+
+    "@img/sharp-libvips-linux-x64": ["@img/sharp-libvips-linux-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw=="],
+
+    "@img/sharp-libvips-linuxmusl-arm64": ["@img/sharp-libvips-linuxmusl-arm64@1.2.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw=="],
+
+    "@img/sharp-libvips-linuxmusl-x64": ["@img/sharp-libvips-linuxmusl-x64@1.2.4", "", { "os": "linux", "cpu": "x64" }, "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg=="],
+
+    "@img/sharp-linux-arm": ["@img/sharp-linux-arm@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm": "1.2.4" }, "os": "linux", "cpu": "arm" }, "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw=="],
+
+    "@img/sharp-linux-arm64": ["@img/sharp-linux-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg=="],
+
+    "@img/sharp-linux-x64": ["@img/sharp-linux-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ=="],
+
+    "@img/sharp-linuxmusl-arm64": ["@img/sharp-linuxmusl-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-arm64": "1.2.4" }, "os": "linux", "cpu": "arm64" }, "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg=="],
+
+    "@img/sharp-linuxmusl-x64": ["@img/sharp-linuxmusl-x64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-x64": "1.2.4" }, "os": "linux", "cpu": "x64" }, "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q=="],
+
+    "@img/sharp-win32-arm64": ["@img/sharp-win32-arm64@0.34.5", "", { "os": "win32", "cpu": "arm64" }, "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g=="],
+
+    "@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.34.5", "", { "os": "win32", "cpu": "x64" }, "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw=="],
+
+    "@modelcontextprotocol/sdk": ["@modelcontextprotocol/sdk@1.29.0", "", { "dependencies": { "@hono/node-server": "^1.19.9", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.2.1", "express-rate-limit": "^8.2.1", "hono": "^4.11.4", "jose": "^6.1.3", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.1" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ=="],
+
+    "@types/bun": ["@types/bun@1.3.11", "", { "dependencies": { "bun-types": "1.3.11" } }, "sha512-5vPne5QvtpjGpsGYXiFyycfpDF2ECyPcTSsFBMa0fraoxiQyMJ3SmuQIGhzPg2WJuWxVBoxWJ2kClYTcw/4fAg=="],
+
+    "@types/node": ["@types/node@18.19.130", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg=="],
+
+    "@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="],
+
+    "abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="],
+
+    "accepts": ["accepts@2.0.0", "", { "dependencies": { "mime-types": "^3.0.0", "negotiator": "^1.0.0" } }, "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng=="],
+
+    "agentkeepalive": ["agentkeepalive@4.6.0", "", { "dependencies": { "humanize-ms": "^1.2.1" } }, "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ=="],
+
+    "ajv": ["ajv@8.18.0", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A=="],
+
+    "ajv-formats": ["ajv-formats@3.0.1", "", { "dependencies": { "ajv": "^8.0.0" } }, "sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ=="],
+
+    "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="],
+
+    "body-parser": ["body-parser@2.2.2", "", { "dependencies": { "bytes": "^3.1.2", "content-type": "^1.0.5", "debug": "^4.4.3", "http-errors": "^2.0.0", "iconv-lite": "^0.7.0", "on-finished": "^2.4.1", "qs": "^6.14.1", "raw-body": "^3.0.1", "type-is": "^2.0.1" } }, "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA=="],
+
+    "bun-types": ["bun-types@1.3.11", "", { "dependencies": { "@types/node": "*" } }, "sha512-1KGPpoxQWl9f6wcZh57LvrPIInQMn2TQ7jsgxqpRzg+l0QPOFvJVH7HmvHo/AiPgwXy+/Thf6Ov3EdVn1vOabg=="],
+
+    "bytes": ["bytes@3.1.2", "", {}, "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg=="],
+
+    "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="],
+
+    "call-bound": ["call-bound@1.0.4", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "get-intrinsic": "^1.3.0" } }, "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg=="],
+
+    "combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="],
+
+    "commander": ["commander@14.0.3", "", {}, "sha512-H+y0Jo/T1RZ9qPP4Eh1pkcQcLRglraJaSLoyOtHxu6AapkjWVCy2Sit1QQ4x3Dng8qDlSsZEet7g5Pq06MvTgw=="],
+
+    "content-disposition": ["content-disposition@1.0.1", "", {}, "sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q=="],
+
+    "content-type": ["content-type@1.0.5", "", {}, "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA=="],
+
+    "cookie": ["cookie@0.7.2", "", {}, "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w=="],
+
+    "cookie-signature": ["cookie-signature@1.2.2", "", {}, "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg=="],
+
+    "cors": ["cors@2.8.6", "", { "dependencies": { "object-assign": "^4", "vary": "^1" } }, "sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw=="],
+
+    "cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="],
+
+    "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="],
+
+    "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="],
+
+    "depd": ["depd@2.0.0", "", {}, "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw=="],
+
+    "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="],
+
+    "ee-first": ["ee-first@1.1.1", "", {}, "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow=="],
+
+    "encodeurl": ["encodeurl@2.0.0", "", {}, "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg=="],
+
+    "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="],
+
+    "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="],
+
+    "es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="],
+
+    "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="],
+
+    "escape-html": ["escape-html@1.0.3", "", {}, "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow=="],
+
+    "etag": ["etag@1.8.1", "", {}, "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg=="],
+
+    "event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="],
+
+    "eventsource": ["eventsource@3.0.7", "", { "dependencies": { "eventsource-parser": "^3.0.1" } }, "sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA=="],
+
+    "eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="],
+
+    "express": ["express@5.2.1", "", { "dependencies": { "accepts": "^2.0.0", "body-parser": "^2.2.1", "content-disposition": "^1.0.0", "content-type": "^1.0.5", "cookie": "^0.7.1", "cookie-signature": "^1.2.1", "debug": "^4.4.0", "depd": "^2.0.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "finalhandler": "^2.1.0", "fresh": "^2.0.0", "http-errors": "^2.0.0", "merge-descriptors": "^2.0.0", "mime-types": "^3.0.0", "on-finished": "^2.4.1", "once": "^1.4.0", "parseurl": "^1.3.3", "proxy-addr": "^2.0.7", "qs": "^6.14.0", "range-parser": "^1.2.1", "router": "^2.2.0", "send": "^1.1.0", "serve-static": "^2.2.0", "statuses": "^2.0.1", "type-is": "^2.0.1", "vary": "^1.1.2" } }, "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw=="],
+
+    "express-rate-limit": ["express-rate-limit@8.3.2", "", { "dependencies": { "ip-address": "10.1.0" }, "peerDependencies": { "express": ">= 4.11" } }, "sha512-77VmFeJkO0/rvimEDuUC5H30oqUC4EyOhyGccfqoLebB0oiEYfM7nwPrsDsBL1gsTpwfzX8SFy2MT3TDyRq+bg=="],
+
+    "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="],
+
+    "fast-uri": ["fast-uri@3.1.0", "", {}, "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA=="],
+
+    "finalhandler": ["finalhandler@2.1.1", "", { "dependencies": { "debug": "^4.4.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "on-finished": "^2.4.1", "parseurl": "^1.3.3", "statuses": "^2.0.1" } }, "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA=="],
+
+    "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="],
+
+    "form-data-encoder": ["form-data-encoder@1.7.2", "", {}, "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="],
+
+    "formdata-node": ["formdata-node@4.4.1", "", { "dependencies": { "node-domexception": "1.0.0", "web-streams-polyfill": "4.0.0-beta.3" } }, "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ=="],
+
+    "forwarded": ["forwarded@0.2.0", "", {}, "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow=="],
+
+    "fresh": ["fresh@2.0.0", "", {}, "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A=="],
+
+    "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="],
+
+    "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="],
+
+    "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="],
+
+    "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="],
+
+    "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="],
+
+    "has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="],
+
+    "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="],
+
+    "hono": ["hono@4.12.9", "", {}, "sha512-wy3T8Zm2bsEvxKZM5w21VdHDDcwVS1yUFFY6i8UobSsKfFceT7TOwhbhfKsDyx7tYQlmRM5FLpIuYvNFyjctiA=="],
+
+    "http-errors": ["http-errors@2.0.1", "", { "dependencies": { "depd": "~2.0.0", "inherits": "~2.0.4", "setprototypeof": "~1.2.0", "statuses": "~2.0.2", "toidentifier": "~1.0.1" } }, "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ=="],
+
+    "humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="],
+
+    "iconv-lite": ["iconv-lite@0.7.2", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw=="],
+
+    "inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="],
+
+    "ip-address": ["ip-address@10.1.0", "", {}, "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q=="],
+
+    "ipaddr.js": ["ipaddr.js@1.9.1", "", {}, "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g=="],
+
+    "is-promise": ["is-promise@4.0.0", "", {}, "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ=="],
+
+    "isexe": ["isexe@2.0.0", "", {}, "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="],
+
+    "jose": ["jose@6.2.2", "", {}, "sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ=="],
+
+    "json-schema-to-ts": ["json-schema-to-ts@3.1.1", "", { "dependencies": { "@babel/runtime": "^7.18.3", "ts-algebra": "^2.0.0" } }, "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g=="],
+
+    "json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="],
+
+    "json-schema-typed": ["json-schema-typed@8.0.2", "", {}, "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA=="],
+
+    "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="],
+
+    "media-typer": ["media-typer@1.1.0", "", {}, "sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw=="],
+
+    "merge-descriptors": ["merge-descriptors@2.0.0", "", {}, "sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g=="],
+
+    "mime-db": ["mime-db@1.54.0", "", {}, "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ=="],
+
+    "mime-types": ["mime-types@3.0.2", "", { "dependencies": { "mime-db": "^1.54.0" } }, "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A=="],
+
+    "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="],
+
+    "negotiator": ["negotiator@1.0.0", "", {}, "sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg=="],
+
+    "node-domexception": ["node-domexception@1.0.0", "", {}, "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ=="],
+
+    "node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="],
+
+    "object-assign": ["object-assign@4.1.1", "", {}, "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg=="],
+
+    "object-inspect": ["object-inspect@1.13.4", "", {}, "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew=="],
+
+    "on-finished": ["on-finished@2.4.1", "", { "dependencies": { "ee-first": "1.1.1" } }, "sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg=="],
+
+    "once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="],
+
+    "openai": ["openai@6.34.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-yEr2jdGf4tVFYG6ohmr3pF6VJuveP0EA/sS8TBx+4Eq5NT10alu5zg2dmxMXMgqpihRDQlFGpRt2XwsGj+Fyxw=="],
+
+    "parseurl": ["parseurl@1.3.3", "", {}, "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="],
+
+    "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="],
+
+    "path-to-regexp": ["path-to-regexp@8.4.1", "", {}, "sha512-fvU78fIjZ+SBM9YwCknCvKOUKkLVqtWDVctl0s7xIqfmfb38t2TT4ZU2gHm+Z8xGwgW+QWEU3oQSAzIbo89Ggw=="],
+
+    "pkce-challenge": ["pkce-challenge@5.0.1", "", {}, "sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ=="],
+
+    "proxy-addr": ["proxy-addr@2.0.7", "", { "dependencies": { "forwarded": "0.2.0", "ipaddr.js": "1.9.1" } }, "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg=="],
+
+    "qs": ["qs@6.15.0", "", { "dependencies": { "side-channel": "^1.1.0" } }, "sha512-mAZTtNCeetKMH+pSjrb76NAM8V9a05I9aBZOHztWy/UqcJdQYNsf59vrRKWnojAT9Y+GbIvoTBC++CPHqpDBhQ=="],
+
+    "range-parser": ["range-parser@1.2.1", "", {}, "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg=="],
+
+    "raw-body": ["raw-body@3.0.2", "", { "dependencies": { "bytes": "~3.1.2", "http-errors": "~2.0.1", "iconv-lite": "~0.7.0", "unpipe": "~1.0.0" } }, "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA=="],
+
+    "require-from-string": ["require-from-string@2.0.2", "", {}, "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw=="],
+
+    "router": ["router@2.2.0", "", { "dependencies": { "debug": "^4.4.0", "depd": "^2.0.0", "is-promise": "^4.0.0", "parseurl": "^1.3.3", "path-to-regexp": "^8.0.0" } }, "sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ=="],
+
+    "safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="],
+
+    "send": ["send@1.2.1", "", { "dependencies": { "debug": "^4.4.3", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "fresh": "^2.0.0", "http-errors": "^2.0.1", "mime-types": "^3.0.2", "ms": "^2.1.3", "on-finished": "^2.4.1", "range-parser": "^1.2.1", "statuses": "^2.0.2" } }, "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ=="],
+
+    "serve-static": ["serve-static@2.2.1", "", { "dependencies": { "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "parseurl": "^1.3.3", "send": "^1.2.0" } }, "sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw=="],
+
+    "setprototypeof": ["setprototypeof@1.2.0", "", {}, "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw=="],
+
+    "shebang-command": ["shebang-command@2.0.0", "", { "dependencies": { "shebang-regex": "^3.0.0" } }, "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA=="],
+
+    "shebang-regex": ["shebang-regex@3.0.0", "", {}, "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A=="],
+
+    "side-channel": ["side-channel@1.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3", "side-channel-list": "^1.0.0", "side-channel-map": "^1.0.1", "side-channel-weakmap": "^1.0.2" } }, "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw=="],
+
+    "side-channel-list": ["side-channel-list@1.0.0", "", { "dependencies": { "es-errors": "^1.3.0", "object-inspect": "^1.13.3" } }, "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA=="],
+
+    "side-channel-map": ["side-channel-map@1.0.1", "", { "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", "get-intrinsic": "^1.2.5", "object-inspect": "^1.13.3" } }, "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA=="],
+
+    "side-channel-weakmap": ["side-channel-weakmap@1.0.2", "", { "dependencies": { "call-bound": "^1.0.2", "es-errors": "^1.3.0", "get-intrinsic": "^1.2.5", "object-inspect": "^1.13.3", "side-channel-map": "^1.0.1" } }, "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A=="],
+
+    "statuses": ["statuses@2.0.2", "", {}, "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw=="],
+
+    "toidentifier": ["toidentifier@1.0.1", "", {}, "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA=="],
+
+    "tr46": ["tr46@0.0.3", "", {}, "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="],
+
+    "ts-algebra": ["ts-algebra@2.0.0", "", {}, "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw=="],
+
+    "type-is": ["type-is@2.0.1", "", { "dependencies": { "content-type": "^1.0.5", "media-typer": "^1.1.0", "mime-types": "^3.0.0" } }, "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw=="],
+
+    "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
+
+    "undici-types": ["undici-types@5.26.5", "", {}, "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="],
+
+    "unpipe": ["unpipe@1.0.0", "", {}, "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ=="],
+
+    "vary": ["vary@1.1.2", "", {}, "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg=="],
+
+    "web-streams-polyfill": ["web-streams-polyfill@4.0.0-beta.3", "", {}, "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug=="],
+
+    "webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="],
+
+    "whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", "webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="],
+
+    "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="],
+
+    "wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="],
+
+    "yaml": ["yaml@2.8.3", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-AvbaCLOO2Otw/lW5bmh9d/WEdcDFdQp2Z2ZUH3pX9U2ihyUY0nvLv7J6TrWowklRGPYbB/IuIMfYgxaCPg5Bpg=="],
+
+    "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="],
+
+    "zod-to-json-schema": ["zod-to-json-schema@3.25.2", "", { "peerDependencies": { "zod": "^3.25.28 || ^4" } }, "sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA=="],
+
+    "@anthropic-ai/claude-agent-sdk/@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.74.0", "", { "dependencies": { "json-schema-to-ts": "^3.1.1" }, "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" }, "optionalPeers": ["zod"], "bin": { "anthropic-ai-sdk": "bin/cli" } }, "sha512-srbJV7JKsc5cQ6eVuFzjZO7UR3xEPJqPamHFIe29bs38Ij2IripoAhC0S5NslNbaFUYqBKypmmpzMTpqfHEUDw=="],
+
+    "@types/node-fetch/@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="],
+
+    "bun-types/@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="],
+
+    "form-data/mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="],
+
+    "@types/node-fetch/@types/node/undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
+
+    "bun-types/@types/node/undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
+
+    "form-data/mime-types/mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="],
+  }
+}
--- a/ai_evals/cases/app.yaml
+++ b/ai_evals/cases/app.yaml
@@ -0,0 +1,93 @@
+- id: app-test1-counter-create
+  prompt: |-
+    Create a simple counter app with increment and decrement buttons.
+  judgeChecklist:
+    - shows the current count in the UI
+    - includes an increment button
+    - includes a decrement button
+    - clicking the buttons updates the count correctly
+
+- id: app-test2-counter-reset
+  prompt: |-
+    Add a reset button that sets the counter back to 0
+  initial: ai_evals/fixtures/frontend/app/initial/test1_counter_app
+  judgeChecklist:
+    - adds a reset control to the existing counter app
+    - clicking reset sets the count back to 0
+    - keeps the existing increment and decrement behavior working
+
+- id: app-test3-shopping-cart-quantity
+  prompt: |-
+    Add a quantity selector (+ and - buttons) to each cart item so users can adjust quantities without removing and re-adding items
+  initial: ai_evals/fixtures/frontend/app/initial/shopping_cart
+  judgeChecklist:
+    - each cart item has visible plus and minus quantity controls
+    - users can increase quantity without re-adding the product
+    - users can decrease quantity from the cart UI
+    - cart totals stay in sync with quantity changes
+
+- id: app-test4-shopping-cart-discount
+  prompt: |-
+    Add a discount code input field in the cart.
+    When the code "SAVE10" is entered, apply a 10% discount to the total
+  initial: ai_evals/fixtures/frontend/app/initial/shopping_cart
+  judgeChecklist:
+    - adds a discount code input to the cart
+    - recognizes the code SAVE10
+    - applies a 10 percent discount to the displayed total
+    - keeps the rest of the cart behavior intact
+
+- id: app-test5-file-manager-search
+  prompt: |-
+    Add a search bar in the toolbar that filters files and folders by name as the user types
+  initial: ai_evals/fixtures/frontend/app/initial/file_manager
+  judgeChecklist:
+    - adds a search input in the toolbar
+    - filters files and folders by name as the user types
+    - updates the visible file list from the search query
+    - keeps the rest of the file manager usable
+
+- id: app-test6-file-manager-inline-rename
+  prompt: |-
+    Let users rename files and folders directly from the file list without leaving the page.
+  initial: ai_evals/fixtures/frontend/app/initial/file_manager
+  judgeChecklist:
+    - adds a visible rename action or inline edit mode in the file list
+    - lets users edit an item's name directly from the list
+    - saves the renamed item through the app's existing rename behavior
+    - refreshes the displayed name after a successful rename
+
+- id: app-test7-file-manager-select-all
+  prompt: |-
+    Add a "Select All" checkbox in the file list header and individual checkboxes for each file.
+    Add a "Delete Selected" button that appears when items are selected
+  initial: ai_evals/fixtures/frontend/app/initial/file_manager
+  judgeChecklist:
+    - adds a select-all control in the file list header
+    - adds per-item selection controls
+    - shows a delete-selected action only when there is a selection
+    - deleting selected items updates the visible list
+
+- id: app-test8-inventory-tracker-create
+  prompt: |-
+    Create an inventory tracker app for a small store.
+    Users should be able to add items with a name, sku, quantity, and price, search items by name or sku, and delete items.
+    The inventory should persist between sessions.
+  judgeChecklist:
+    - includes a form to add inventory items with name, sku, quantity, and price
+    - shows a list or table of saved inventory items
+    - supports searching or filtering by name or sku
+    - lets users delete existing inventory items
+    - persists the inventory data appropriately for a raw Windmill app
+
+- id: app-test9-recipe-book-create
+  prompt: |-
+    Create a recipe book app where users can add recipes with a name, ingredients list, and instructions.
+    Include a search bar to filter recipes by name and the ability to delete recipes.
+    Recipes should persist between sessions.
+  judgeChecklist:
+    - includes a form to add recipes with name, ingredients, and instructions
+    - shows saved recipes in the app
+    - supports searching recipes by name
+    - lets users delete recipes
+    - persists recipes appropriately for a raw Windmill app
--- a/ai_evals/cases/cli.yaml
+++ b/ai_evals/cases/cli.yaml
@@ -0,0 +1,66 @@
+- id: bun-hello-script
+  prompt: |-
+    Create a Windmill Bun script at `f/evals/hello.ts`.
+    It should take a `name` input and return a greeting object like `{ greeting: "Hello, Alice!" }`.
+  expected: ai_evals/fixtures/cli/expected/bun-hello-script
+  judgeChecklist:
+    - creates the requested Bun script at f/evals/hello.ts
+    - takes a name input
+    - returns an object containing the greeting
+
+- id: bun-hello-flow
+  prompt: |-
+    Create a Windmill flow at `f/evals/hello__flow`.
+    It should take a `name` input and return a greeting object like `{ greeting: "Hello, Alice!" }`.
+    Put the step code in `hello.ts`.
+  expected: ai_evals/fixtures/cli/expected/bun-hello-flow
+  judgeChecklist:
+    - creates the requested flow folder with flow.yaml and hello.ts
+    - wires the name input into the flow step
+    - returns the greeting object
+
+- id: python-add-numbers-script
+  prompt: |-
+    Add a Windmill Python script at `f/evals/add_numbers.py`.
+    It should take `a` and `b` as inputs and return `{ "total": a + b }`.
+  expected: ai_evals/fixtures/cli/expected/python-add-numbers-script
+  judgeChecklist:
+    - creates the requested Python script at f/evals/add_numbers.py
+    - takes `a` and `b` as inputs
+    - returns an object with total equal to a plus b
+
+- id: bun-hello-script-uppercase
+  prompt: |-
+    Update `f/evals/hello.ts` so it accepts an optional `uppercase` boolean.
+    Keep returning `{ greeting: ... }`, but when `uppercase` is true the greeting should be uppercased before returning it.
+  initial: ai_evals/fixtures/cli/initial/bun-hello-script-uppercase
+  expected: ai_evals/fixtures/cli/expected/bun-hello-script-uppercase
+  judgeChecklist:
+    - updates the existing hello.ts file rather than creating a new script
+    - accepts an optional uppercase boolean input
+    - keeps returning an object with greeting
+    - uppercases the greeting when uppercase is true
+
+- id: bun-hello-flow-punctuation
+  prompt: |-
+    Update the existing flow in `f/evals/hello__flow` so it also accepts an optional `punctuation` input.
+    The greeting should use that punctuation and default to `!` when it is missing.
+  initial: ai_evals/fixtures/cli/initial/bun-hello-flow-punctuation
+  expected: ai_evals/fixtures/cli/expected/bun-hello-flow-punctuation
+  judgeChecklist:
+    - updates the existing hello flow instead of creating a new one
+    - adds an optional punctuation input to the flow
+    - updates the step code so the returned greeting uses punctuation
+    - defaults punctuation to an exclamation mark when omitted
+
+- id: flow-reuse-existing-script
+  prompt: |-
+    There is already a reusable greeting script at `f/lib/format_greeting.ts`.
+    Create a flow at `f/evals/reuse_greeting__flow` that takes a `name` input and reuses that existing script instead of duplicating the logic inline.
+  initial: ai_evals/fixtures/cli/initial/flow-reuse-existing-script
+  expected: ai_evals/fixtures/cli/expected/flow-reuse-existing-script
+  judgeChecklist:
+    - creates the requested flow at f/evals/reuse_greeting__flow
+    - reuses the existing script from f/lib by path
+    - does not duplicate the greeting logic in a new inline script
+    - wires the name input into the reused script
--- a/ai_evals/cases/flow.yaml
+++ b/ai_evals/cases/flow.yaml
@@ -0,0 +1,335 @@
+- id: flow-test0-sum-two-numbers
+  prompt: |-
+    Create a flow that takes two numbers, `a` and `b`, and returns their sum.
+    Keep it simple and use a single step named `sum_numbers`.
+  expected: ai_evals/fixtures/frontend/flow/expected/test0_sum_two_numbers.json
+  runtime:
+    backendPreview:
+      args:
+        a: 4
+        b: 5
+  judgeChecklist:
+    - "the flow takes `a` and `b` as inputs"
+    - "the main step is named `sum_numbers`"
+    - the flow returns the sum of the two numbers
+
+- id: flow-test1-reuse-existing-script
+  prompt: |-
+    I need a flow that adds two numbers.
+    If there is already a script in the workspace that does that, reuse it instead of rewriting the logic.
+    The flow should take `a` and `b` as inputs and use a single step named `sum_numbers`.
+  initial: ai_evals/fixtures/frontend/flow/initial/test1_reuse_existing_script_initial.json
+  expected: ai_evals/fixtures/frontend/flow/expected/test1_reuse_existing_script.json
+  runtime:
+    backendPreview:
+      args:
+        a: 2
+        b: 3
+  judgeChecklist:
+    - "the flow takes `a` and `b` as inputs"
+    - "the main step is named `sum_numbers`"
+    - the flow reuses the existing workspace script instead of rewriting the addition logic
+
+- id: flow-test2-call-existing-subflow
+  prompt: |-
+    Create a parent flow that adds two numbers by reusing an existing flow in the workspace if one already exists.
+    The parent flow should take `a` and `b` as inputs and delegate the calculation instead of inlining it.
+    Use a single step named `call_add_numbers`.
+  initial: ai_evals/fixtures/frontend/flow/initial/test2_call_existing_subflow_initial.json
+  expected: ai_evals/fixtures/frontend/flow/expected/test2_call_existing_subflow.json
+  runtime:
+    backendPreview:
+      args:
+        a: 7
+        b: 8
+  judgeChecklist:
+    - "the parent flow takes `a` and `b` as inputs"
+    - "the main step is named `call_add_numbers`"
+    - the parent flow delegates to an existing workspace subflow instead of inlining the addition logic
+
+- id: flow-test3-branchone-routing
+  prompt: |-
+    Create a flow that routes incoming support requests based on the customer's tier.
+    The input should contain a string field named `tier`.
+    Free, pro, and enterprise requests should go to different queues, and unknown tiers should fall back to a default queue.
+    Name the main routing step `route_by_tier`.
+  expected: ai_evals/fixtures/frontend/flow/expected/test3_branchone_routing.json
+  judgeChecklist:
+    - "the input schema includes a string field named `tier`"
+    - "the main routing step is named `route_by_tier`"
+    - free requests go to a free queue
+    - pro requests go to a pro queue
+    - enterprise requests go to an enterprise queue
+    - unknown tiers fall back to a default queue
+
+- id: flow-test4-order-processing-loop
+  prompt: |-
+    Build an order-processing flow.
+
+    The input should include an order with:
+    - an `items` array containing `name`, `price`, and `quantity`
+    - `customer_email`
+    - `shipping_address`
+
+    The flow should:
+    - validate that every item has a positive price and quantity
+    - calculate the order total with 8% tax
+    - check inventory for each item using placeholder availability data
+    - create a shipment if everything is in stock, otherwise create a backorder
+    - send a confirmation using placeholder email logic
+    - return a final order summary with the status
+  validate:
+    schemaAnyOf:
+      - requiredPaths:
+          - order
+          - order.items
+          - order.customer_email
+          - order.shipping_address
+      - requiredPaths:
+          - items
+          - customer_email
+          - shipping_address
+    resolveResultsRefs: true
+  judgeChecklist:
+    - the flow validates that every item has a positive price and quantity
+    - the flow calculates the order total with 8% tax
+    - the flow checks inventory for each item using placeholder availability data
+    - the flow creates a shipment if everything is in stock, otherwise a backorder
+    - the flow sends a confirmation using placeholder email logic
+    - the flow returns a final order summary with the resulting status
+
+- id: flow-test5-parallel-data-pipeline
+  prompt: |-
+    Create a data-processing flow for three external data sources.
+
+    It should:
+    - load a small placeholder configuration listing the three sources
+    - fetch placeholder records from each source
+    - clean and validate each source's records
+    - combine everything into one dataset
+    - compute an overall quality score
+    - store the result differently depending on the score:
+      - 90 or above goes to the primary database
+      - 70 to 89 goes to a secondary database with a warning
+      - below 70 goes to quarantine and triggers an alert
+    - return a processing report with total records, quality score, and destination
+  judgeChecklist:
+    - the flow loads a placeholder configuration listing three external sources
+    - the flow fetches placeholder records from each source
+    - the flow cleans and validates each source's records
+    - the flow combines everything into one dataset
+    - the flow computes an overall quality score
+    - scores of 90 or above go to the primary database
+    - scores from 70 to 89 go to a secondary database with a warning
+    - scores below 70 go to quarantine and trigger an alert
+    - the final report includes total records, quality score, and destination
+
+- id: flow-test6-ai-agent-tools
+  prompt: |-
+    Create a customer support flow.
+
+    The input should include `customer_id` and `query_text`.
+    The flow should load the customer's profile and order history, then use an AI assistant to help with the request.
+    The assistant should be able to:
+    - look up orders
+    - check refund eligibility
+    - search FAQs
+    - open a support ticket when needed
+
+    After that, log the interaction and return the assistant's response.
+  judgeChecklist:
+    - "the input schema includes `customer_id` and `query_text`"
+    - the flow loads the customer's profile and order history
+    - the flow uses an AI assistant step
+    - the assistant can look up orders
+    - the assistant can check refund eligibility
+    - the assistant can search FAQs
+    - the assistant can open a support ticket
+    - the flow logs the interaction
+    - the final output returns the assistant response
+
+- id: flow-test7-simple-modification
+  prompt: |-
+    Update this flow so it validates processed data before saving it.
+
+    After `process_data`, add a `validate_data` step that checks the data array is not empty.
+    If the array is empty, the flow should surface the message `No data to save` and prevent saving.
+    If validation passes, let the save continue normally.
+    Update `save_results` so it uses the validation outcome instead of bypassing it.
+  initial: ai_evals/fixtures/frontend/flow/initial/test5_initial.json
+  validate:
+    topLevelStepIds:
+      - fetch_data
+      - process_data
+      - validate_data
+    topLevelStepOrder:
+      - fetch_data
+      - process_data
+      - validate_data
+    topLevelStepTypes:
+      - id: fetch_data
+        type: rawscript
+      - id: process_data
+        type: rawscript
+      - id: validate_data
+        type: rawscript
+  judgeChecklist:
+    - the updated flow keeps the original fetch and process steps intact
+    - "a `validate_data` step is added after `process_data`"
+    - "`validate_data` checks that the processed data array is not empty"
+    - "when processed data is empty, the flow surfaces the message `No data to save` and does not save results"
+    - "`save_results` uses the validation outcome instead of reading `results.process_data` directly"
+    - "exact field names or wrapper object shape for the validation result are not important"
+
+- id: flow-test8-branching-in-loop
+  prompt: |-
+    Update the order-processing logic inside `loop_orders` so different order types are handled differently.
+
+    For `express`, mark the order as priority and use a shipping cost of $15.99.
+    For `standard`, use a shipping cost of $5.99.
+    For `pickup`, mark it as no shipping required with a cost of $0.
+    Keep the existing processing as a fallback for unknown order types.
+    Each path should return the orderId, shipping cost, and shipping type.
+  initial: ai_evals/fixtures/frontend/flow/initial/test6_initial.json
+  judgeChecklist:
+    - "the existing `loop_orders` flow still handles per-order processing"
+    - exact branching topology is not required as long as `loop_orders` handles the order types correctly
+    - express orders are marked as priority and use a shipping cost of 15.99
+    - standard orders use a shipping cost of 5.99
+    - pickup orders use a shipping cost of 0 and are treated as no shipping required
+    - unknown order types still follow a fallback path
+    - "each processed order returns `orderId`, `shippingCost`, and `shippingType`"
+
+- id: flow-test9-parallel-refactor
+  prompt: |-
+    Refactor this flow so the enrichment work no longer runs one step at a time.
+
+    `enrich_price`, `enrich_inventory`, and `enrich_reviews` should run independently.
+    Each one should return a fallback value if it fails.
+    Update `combine_data` so it merges the enrichment results and sets a `hasFallbacks` flag when any fallback was used.
+    Keep `get_item` as the first step and `return_result` as the last step.
+  initial: ai_evals/fixtures/frontend/flow/initial/test7_initial.json
+  validate:
+    topLevelStepIds:
+      - get_item
+      - combine_data
+      - return_result
+    topLevelStepOrder:
+      - get_item
+      - combine_data
+      - return_result
+    topLevelStepTypeCountsAtLeast:
+      - type: branchall
+        count: 1
+    topLevelStepTypes:
+      - id: get_item
+        type: rawscript
+      - id: combine_data
+        type: rawscript
+      - id: return_result
+        type: rawscript
+    moduleRules:
+      - id: enrich_price
+      - id: enrich_inventory
+      - id: enrich_reviews
+  judgeChecklist:
+    - "the updated flow keeps `get_item` as the first step"
+    - "the updated flow keeps `return_result` as the last step"
+    - "`enrich_price`, `enrich_inventory`, and `enrich_reviews` run independently rather than sequentially"
+    - each enrichment path returns a fallback value if it fails
+    - "`combine_data` merges the enrichment results"
+    - "`combine_data` sets `hasFallbacks` when any fallback was used"
+
+- id: flow-test10-while-loop-counter
+  prompt: |-
+    Create a flow that keeps incrementing a counter until it reaches a target value.
+    The input should include a number field named `target`.
+    Use a top-level loop step named `count_until_target`.
+    Inside it, use a single step named `increment_counter` that increments the current counter.
+    The loop should stop once the counter reaches `target`.
+    After the loop, add a top-level step named `return_final_counter` that returns the last counter value.
+  validate:
+    exactTopLevelStepIds:
+      - count_until_target
+      - return_final_counter
+    topLevelStepOrder:
+      - count_until_target
+      - return_final_counter
+    topLevelStepTypes:
+      - id: count_until_target
+        type: whileloopflow
+      - id: return_final_counter
+        type: rawscript
+    moduleRules:
+      - id: count_until_target
+        hasStopAfterIf: true
+        hasStopAfterAllItersIf: false
+        exactImmediateChildStepIds:
+          - increment_counter
+        immediateChildStepTypes:
+          - id: increment_counter
+            type: rawscript
+    moduleFieldRules:
+      - id: count_until_target
+        path: stop_after_if.expr
+        equals: result >= flow_input.target
+  judgeChecklist:
+    - "the input schema includes a number field named `target`"
+    - "the top-level while loop step is named `count_until_target`"
+    - "`count_until_target` contains a single increment step named `increment_counter`"
+    - "`count_until_target` uses module-level `stop_after_if` to stop when the counter reaches `target`"
+    - "`increment_counter` uses `flow_input.iter.value` or an equivalent loop-state expression and falls back to `0` on the first iteration"
+    - "`return_final_counter` returns the final counter value"
+
+- id: flow-test11-preprocessor-and-failure-handler
+  prompt: |-
+    Create an event-processing flow for a string payload.
+
+    Before the main processing runs, trim the payload and reject empty strings.
+    The main step should be named `process_event` and return a simple success object.
+    If anything fails, return a compact error object with the error message and the failing step id.
+  expected: ai_evals/fixtures/frontend/flow/expected/test11_preprocessor_failure.json
+  validate:
+    requireSpecialModules:
+      - preprocessor_module
+      - failure_module
+  judgeChecklist:
+    - the flow trims the payload before the main processing runs
+    - the flow rejects empty payload strings
+    - "the main step is named `process_event`"
+    - "`process_event` returns a simple success object"
+    - failures return a compact error object with the error message and failing step id
+
+- id: flow-test12-approval-step
+  prompt: |-
+    Create a purchase approval flow.
+
+    The input should include `requester_email` and `amount`.
+    Add an approval step named `request_approval` that pauses the flow and asks the approver for a comment.
+    One approval should be enough to continue.
+    After approval, add a final step named `finalize_purchase` that returns an approved status object.
+  validate:
+    topLevelStepIds:
+      - request_approval
+      - finalize_purchase
+    topLevelStepOrder:
+      - request_approval
+      - finalize_purchase
+    topLevelStepTypes:
+      - id: finalize_purchase
+        type: rawscript
+    schemaRequiredPaths:
+      - requester_email
+      - amount
+    requireSuspendSteps:
+      - id: request_approval
+        requiredEvents: 1
+        resumeRequiredStringFieldAnyOf:
+          - comment
+          - approver_comment
+  judgeChecklist:
+    - "the flow includes an approval step named `request_approval`"
+    - "`request_approval` pauses the flow and asks the approver for a comment"
+    - one approval is enough to continue
+    - "the flow includes a final step named `finalize_purchase`"
+    - "`finalize_purchase` returns an approved status object after approval"
--- a/ai_evals/cases/script.yaml
+++ b/ai_evals/cases/script.yaml
@@ -0,0 +1,11 @@
+- id: script-test1-greet-user
+  prompt: |-
+    Update the current Bun script so it takes the existing `name` input and returns a plain greeting string like `Hello, Alice!`.
+    Do not wrap the result in an object or array.
+    Keep it simple and do not add external dependencies.
+  initial: ai_evals/fixtures/frontend/script/initial/test1_empty_bun.json
+  expected: ai_evals/fixtures/frontend/script/expected/test1_greet_user.json
+  judgeChecklist:
+    - uses the existing `name` input
+    - returns a plain greeting string
+    - does not wrap the result in an object or array
--- a/ai_evals/cli/index.ts
+++ b/ai_evals/cli/index.ts
@@ -0,0 +1,314 @@
+#!/usr/bin/env bun
+
+import { Command, InvalidArgumentError } from "commander";
+import { loadCases, loadSelectedCases } from "../core/cases";
+import {
+  BACKEND_VALIDATION_MODES,
+  parseBackendValidationMode,
+} from "../core/backendValidation";
+import {
+  EVAL_MODELS,
+  type EvalModelSpec,
+  formatRunModelLabel,
+  getCliEvalModel,
+  getEvalModelHelpText,
+  resolveEvalModel,
+} from "../core/models";
+import {
+  appendHistoryRecord,
+  buildRunResult,
+  formatRunSummary,
+  resolveRunOutputPath,
+  writeRunArtifacts,
+  writeRunResult,
+} from "../core/results";
+import { runSuite } from "../core/runSuite";
+import { EVAL_MODES, type EvalMode } from "../core/types";
+import { DEFAULT_JUDGE_MODEL } from "../core/judge";
+import { createCliModeRunner } from "../modes/cli";
+import { runFrontendBenchmarkAdapter } from "../adapters/frontend/runtime";
+
+async function main() {
+  const program = new Command()
+    .name("bun run cli --")
+    .description("Run AI eval cases against the current production prompts and guidance")
+    .showHelpAfterError()
+    .showSuggestionAfterError()
+    .addHelpText(
+      "after",
+      [
+        "",
+        "Examples:",
+        "  bun run cli -- models",
+        "  bun run cli -- cases",
+        "  bun run cli -- cases flow",
+        "  bun run cli -- run flow",
+        "  bun run cli -- run flow --model 4o",
+        "  bun run cli -- run flow --models haiku,opus,4o",
+        "  bun run cli -- run flow flow-test0-sum-two-numbers --verbose",
+        "  bun run cli -- run flow --record",
+        "  bun run cli -- run flow --backend-validation preview",
+        "  bun run cli -- run flow flow-test7-simple-modification --runs 3",
+        "  bun run cli -- run cli bun-hello-script",
+        "",
+        "Models:",
+        getEvalModelHelpText(),
+      ].join("\n")
+    );
+
+  program
+    .command("models")
+    .description("List available model aliases")
+    .action(() => {
+      handleModels();
+    });
+
+  program
+    .command("cases")
+    .description("List available cases")
+    .argument("[mode]", "cli, flow, script, or app", parseOptionalMode)
+    .action(async (mode?: EvalMode) => {
+      await handleCases(mode);
+    });
+
+  program
+    .command("run")
+    .description("Run one benchmark mode")
+    .argument("<mode>", "cli, flow, script, or app", parseMode)
+    .argument("[caseIds...]", "specific case ids to run")
+    .option("--runs <n>", "number of attempts per case", parsePositiveInteger, 1)
+    .option("--output <path>", "write the result JSON to this path")
+    .option("--model <name>", `model alias (${EVAL_MODELS.map((entry) => entry.id).join(", ")})`)
+    .option("--models <names>", "comma-separated model aliases to run sequentially")
+    .option("--verbose", "stream assistant output during frontend runs")
+    .option("--record", "append a compact summary line to ai_evals/history/<mode>.jsonl")
+    .option(
+      "--backend-validation <mode>",
+      `backend smoke validation (${BACKEND_VALIDATION_MODES.join(", ")})`
+    )
+    .action(
+      async (
+        mode: EvalMode,
+        caseIds: string[],
+        options: {
+          runs: number;
+          output?: string;
+          model?: string;
+          models?: string;
+          verbose?: boolean;
+          record?: boolean;
+          backendValidation?: string;
+        }
+      ) => {
+        await handleRun({
+          mode,
+          caseIds,
+          runs: options.runs,
+          outputPath: options.output,
+          model: options.model,
+          models: options.models,
+          verbose: options.verbose ?? false,
+          record: options.record ?? false,
+          backendValidation: options.backendValidation,
+        });
+      }
+    );
+
+  await program.parseAsync(process.argv);
+}
+
+async function handleCases(mode?: EvalMode) {
+  const modes = mode ? [mode] : [...EVAL_MODES];
+
+  for (const entry of modes) {
+    const cases = await loadCases(entry);
+    process.stdout.write(`${entry} (${cases.length})\n`);
+    for (const evalCase of cases) {
+      process.stdout.write(`- ${evalCase.id}\n`);
+    }
+    process.stdout.write("\n");
+  }
+}
+
+function handleModels() {
+  process.stdout.write("Available models\n");
+  for (const model of EVAL_MODELS) {
+    const supports = [
+      ...(model.frontend ? ["flow", "script", "app"] : []),
+      ...(model.cli ? ["cli"] : []),
+    ];
+    const aliases = [model.id, ...model.aliases.filter((alias) => alias !== model.id)];
+    process.stdout.write(`- ${model.id}: ${model.label}\n`);
+    process.stdout.write(`  aliases: ${aliases.join(", ")}\n`);
+    process.stdout.write(`  modes: ${supports.join(", ")}\n`);
+  }
+  process.stdout.write(`\nJudge model: ${DEFAULT_JUDGE_MODEL}\n`);
+}
+
+async function handleRun(input: {
+  mode: EvalMode;
+  caseIds: string[];
+  runs: number;
+  outputPath?: string;
+  model?: string;
+  models?: string;
+  verbose: boolean;
+  record: boolean;
+  backendValidation?: string;
+}) {
+  if (input.record && input.caseIds.length > 0) {
+    throw new Error("--record only supports full-suite runs; omit case ids to record history");
+  }
+  if (input.model && input.models) {
+    throw new Error("Use either --model or --models, not both");
+  }
+
+  const selectedCases = await loadSelectedCases(input.mode, input.caseIds);
+  const models = resolveRequestedModels(input.mode, input.model, input.models);
+  const backendValidation = parseBackendValidationMode(
+    input.backendValidation ?? process.env.WMILL_AI_EVAL_BACKEND_VALIDATION
+  );
+  if (input.outputPath && models.length > 1) {
+    throw new Error("--output only supports a single model run");
+  }
+  if (backendValidation !== "off" && input.mode !== "flow" && input.mode !== "script") {
+    throw new Error("--backend-validation (or WMILL_AI_EVAL_BACKEND_VALIDATION) currently supports only flow and script modes");
+  }
+
+  const summaries: Array<{ label: string; passRate: number; averageDurationMs: number }> = [];
+
+  for (const [index, model] of models.entries()) {
+    const runModel = formatRunModelLabel(input.mode, model);
+    if (models.length > 1) {
+      process.stdout.write(
+        `${index > 0 ? "\n" : ""}=== ${input.mode} ${model.id} (${runModel}) ===\n`
+      );
+    }
+    process.stderr.write(`Starting ${input.mode} benchmark...\n`);
+
+    const result =
+      input.mode === "cli"
+        ? await runCliBenchmark(selectedCases, input.runs, getCliEvalModel(model), runModel)
+        : await runFrontendBenchmarkAdapter({
+            mode: input.mode,
+            caseIds: input.caseIds,
+            runs: input.runs,
+            model: model.id,
+            verbose: input.verbose,
+            backendValidation,
+          });
+
+    const resolvedOutputPath =
+      models.length === 1
+        ? resolveRunOutputPath(input.mode, input.outputPath)
+        : resolveRunOutputPath(input.mode);
+    const artifactsPath = await writeRunArtifacts(result, resolvedOutputPath);
+    const resultPath = await writeRunResult(result, resolvedOutputPath);
+    const historyPath = input.record ? await appendHistoryRecord(result) : null;
+    process.stdout.write(`${formatRunSummary(result)}\n`);
+    process.stdout.write(`Saved: ${resultPath}\n`);
+    if (artifactsPath) {
+      process.stdout.write(`Artifacts: ${artifactsPath}\n`);
+    }
+    if (historyPath) {
+      process.stdout.write(`Recorded: ${historyPath}\n`);
+    }
+
+    summaries.push({
+      label: `${model.id} (${runModel})`,
+      passRate: result.passRate,
+      averageDurationMs: result.averageDurationMs,
+    });
+  }
+
+  if (summaries.length > 1) {
+    process.stdout.write("\nModel summary\n");
+    for (const summary of summaries) {
+      process.stdout.write(
+        `- ${summary.label}: ${formatPercent(summary.passRate)} | ${Math.round(summary.averageDurationMs)}ms\n`
+      );
+    }
+  }
+}
+
+async function runCliBenchmark(
+  cases: Awaited<ReturnType<typeof loadSelectedCases>>,
+  runs: number,
+  model: ReturnType<typeof getCliEvalModel>,
+  runModel: string
+) {
+  const caseResults = await runSuite({
+    modeRunner: createCliModeRunner(model),
+    cases,
+    runs,
+    runModel,
+    judgeModel: DEFAULT_JUDGE_MODEL,
+  });
+
+  return buildRunResult({
+    mode: "cli",
+    runs,
+    runModel,
+    judgeModel: DEFAULT_JUDGE_MODEL,
+    caseResults,
+  });
+}
+
+function parseMode(value: string): EvalMode {
+  if (EVAL_MODES.includes(value as EvalMode)) {
+    return value as EvalMode;
+  }
+  throw new InvalidArgumentError(`mode must be one of: ${EVAL_MODES.join(", ")}`);
+}
+
+function parseOptionalMode(value: string | undefined): EvalMode | undefined {
+  return value ? parseMode(value) : undefined;
+}
+
+function parsePositiveInteger(value: string): number {
+  const parsed = Number(value);
+  if (!Number.isInteger(parsed) || parsed <= 0) {
+    throw new InvalidArgumentError("must be a positive integer");
+  }
+  return parsed;
+}
+
+function resolveRequestedModels(
+  mode: EvalMode,
+  singleModel?: string,
+  multipleModels?: string
+): EvalModelSpec[] {
+  if (!multipleModels) {
+    return [resolveEvalModel(mode, singleModel)];
+  }
+
+  const aliases = multipleModels
+    .split(",")
+    .map((value) => value.trim())
+    .filter(Boolean);
+  if (aliases.length === 0) {
+    throw new Error("--models requires at least one model alias");
+  }
+
+  const seen = new Set<string>();
+  const models: EvalModelSpec[] = [];
+  for (const alias of aliases) {
+    const model = resolveEvalModel(mode, alias);
+    if (seen.has(model.id)) {
+      continue;
+    }
+    seen.add(model.id);
+    models.push(model);
+  }
+  return models;
+}
+
+function formatPercent(value: number): string {
+  return `${(value * 100).toFixed(1)}%`;
+}
+
+void main().catch((error) => {
+  const message = error instanceof Error ? error.message : String(error);
+  process.stderr.write(`${message}\n`);
+  process.exit(1);
+});
--- a/ai_evals/core/backendValidation.test.ts
+++ b/ai_evals/core/backendValidation.test.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it } from "bun:test";
+import {
+  parseBackendValidationMode,
+  resolveBackendValidationSettings,
+} from "./backendValidation";
+
+// Covers the string-to-mode mapping, including the boolean-style aliases.
+describe("parseBackendValidationMode", () => {
+  // Missing input and the "0" / "false" aliases all map to "off".
+  it("defaults to off", () => {
+    expect(parseBackendValidationMode(undefined)).toBe("off");
+    expect(parseBackendValidationMode("0")).toBe("off");
+    expect(parseBackendValidationMode("false")).toBe("off");
+  });
+
+  // "1" and "true" are accepted as aliases for "preview".
+  it("accepts preview aliases", () => {
+    expect(parseBackendValidationMode("preview")).toBe("preview");
+    expect(parseBackendValidationMode("1")).toBe("preview");
+    expect(parseBackendValidationMode("true")).toBe("preview");
+  });
+
+  // Any other value throws; only the leading part of the message is matched here.
+  it("rejects unknown modes", () => {
+    expect(() => parseBackendValidationMode("maybe")).toThrow(
+      "Unsupported backend validation mode: maybe"
+    );
+  });
+});
+
+// A non-"off" validation mode is only allowed for the flow and script eval modes.
+describe("resolveBackendValidationSettings", () => {
+  it("rejects unsupported eval modes", () => {
+    // "app" is neither flow nor script, so requesting "preview" must throw.
+    expect(() =>
+      resolveBackendValidationSettings({
+        evalMode: "app",
+        requestedMode: "preview",
+      })
+    ).toThrow('Backend validation mode "preview" is only supported for flow and script evals');
+  });
+});
--- a/ai_evals/core/backendValidation.ts
+++ b/ai_evals/core/backendValidation.ts
@@ -0,0 +1,104 @@
+import type { EvalMode } from "./types";
+
+/** Canonical backend validation mode names; also listed in the "unsupported mode" error message. */
+export const BACKEND_VALIDATION_MODES = ["off", "preview"] as const;
+
+/** Union of the literal values in BACKEND_VALIDATION_MODES. */
+export type BackendValidationMode = (typeof BACKEND_VALIDATION_MODES)[number];
+
+/** Resolved backend validation configuration, as produced by resolveBackendValidationSettings. */
+export interface BackendValidationSettings {
+  // Selected validation mode.
+  mode: BackendValidationMode;
+  // Backend base URL with trailing slashes removed.
+  baseUrl: string;
+  // Credentials taken from the environment or built-in defaults
+  // (presumably used to log in to the backend at baseUrl — confirm against the consumer).
+  email: string;
+  password: string;
+  // Parsed from WMILL_AI_EVAL_KEEP_WORKSPACES
+  // (presumably disables workspace cleanup — confirm against the consumer).
+  keepWorkspaces: boolean;
+  // Trimmed WMILL_AI_EVAL_BACKEND_WORKSPACE, or undefined when unset or blank.
+  workspaceOverride?: string;
+  // Lowercased prefix restricted to [a-z0-9-]; falls back to "ai-evals".
+  workspacePrefix: string;
+  // Positive integer from the environment, default 2000 (milliseconds per the name).
+  pollIntervalMs: number;
+  // Positive integer from the environment, default 120000 (milliseconds per the name).
+  maxWaitMs: number;
+}
+
+/**
+ * Maps a user-supplied string to a backend validation mode.
+ *
+ * Matching ignores surrounding whitespace and letter case. Missing or empty
+ * input, "off", "false" and "0" give "off"; "preview", "true" and "1" give
+ * "preview".
+ *
+ * @throws Error for any other value; the message echoes the raw input.
+ */
+export function parseBackendValidationMode(value?: string | null): BackendValidationMode {
+  const normalized = value?.trim().toLowerCase() ?? "";
+
+  switch (normalized) {
+    case "":
+    case "off":
+    case "false":
+    case "0":
+      return "off";
+    case "preview":
+    case "true":
+    case "1":
+      return "preview";
+    default:
+      throw new Error(
+        `Unsupported backend validation mode: ${value}. Use one of: ${BACKEND_VALIDATION_MODES.join(", ")}`
+      );
+  }
+}
+
+/**
+ * Builds the backend validation settings from the requested mode and the
+ * process environment.
+ *
+ * @param input.evalMode Eval mode being run; only "flow" and "script" may use a
+ *   validation mode other than "off".
+ * @param input.requestedMode Explicit mode string; when null or undefined the
+ *   WMILL_AI_EVAL_BACKEND_VALIDATION environment variable is used instead.
+ * @returns Fully populated settings; every field has a default.
+ * @throws Error when the mode string is unknown, or when a non-"off" mode is
+ *   requested for an eval mode other than flow or script.
+ */
+export function resolveBackendValidationSettings(input: {
+  evalMode: EvalMode;
+  requestedMode?: string | null;
+}): BackendValidationSettings {
+  const mode = parseBackendValidationMode(
+    input.requestedMode ?? process.env.WMILL_AI_EVAL_BACKEND_VALIDATION
+  );
+
+  if (mode !== "off" && input.evalMode !== "flow" && input.evalMode !== "script") {
+    throw new Error(
+      `Backend validation mode "${mode}" is only supported for flow and script evals`
+    );
+  }
+
+  return {
+    mode,
+    // First defined variable wins, in this order; `??` only skips unset
+    // variables, so an empty string still takes precedence over later ones.
+    baseUrl: normalizeBaseUrl(
+      process.env.WMILL_AI_EVAL_BACKEND_URL ??
+        process.env.WINDMILL_URL ??
+        process.env.WINDMILL_BASE_URL ??
+        process.env.REMOTE ??
+        "http://127.0.0.1:8000"
+    ),
+    email: process.env.WMILL_AI_EVAL_BACKEND_EMAIL ?? "admin@windmill.dev",
+    password: process.env.WMILL_AI_EVAL_BACKEND_PASSWORD ?? "changeme",
+    keepWorkspaces: isTruthy(process.env.WMILL_AI_EVAL_KEEP_WORKSPACES),
+    workspaceOverride: sanitizeOptionalWorkspaceId(process.env.WMILL_AI_EVAL_BACKEND_WORKSPACE),
+    workspacePrefix: sanitizeWorkspacePrefix(
+      process.env.WMILL_AI_EVAL_WORKSPACE_PREFIX ?? "ai-evals"
+    ),
+    // Invalid or non-positive numeric values silently fall back to the defaults.
+    pollIntervalMs: parsePositiveInteger(
+      process.env.WMILL_AI_EVAL_BACKEND_POLL_INTERVAL_MS,
+      2000
+    ),
+    maxWaitMs: parsePositiveInteger(process.env.WMILL_AI_EVAL_BACKEND_MAX_WAIT_MS, 120000),
+  };
+}
+
+/** Strips every trailing "/" from the given URL string. */
+function normalizeBaseUrl(value: string): string {
+  let end = value.length;
+  while (end > 0 && value[end - 1] === "/") {
+    end -= 1;
+  }
+  return value.slice(0, end);
+}
+
+/**
+ * Reduces a workspace prefix to lowercase [a-z0-9-]: each run of other
+ * characters becomes a single "-", and leading/trailing dashes are removed.
+ * Returns "ai-evals" when nothing is left.
+ */
+function sanitizeWorkspacePrefix(value: string): string {
+  const lowered = value.trim().toLowerCase();
+  const dashed = lowered.replace(/[^a-z0-9-]+/g, "-");
+  const stripped = dashed.replace(/^-+|-+$/g, "");
+  if (stripped.length === 0) {
+    return "ai-evals";
+  }
+  return stripped;
+}
+
+/** Trims the workspace id; a missing, empty, or whitespace-only value yields `undefined`. */
+function sanitizeOptionalWorkspaceId(value: string | undefined): string | undefined {
+  if (value == null) {
+    return undefined;
+  }
+  const trimmed = value.trim();
+  return trimmed.length > 0 ? trimmed : undefined;
+}
+
+/**
+ * Interprets a flag-like string: "1", "true", "yes" and "on" (ignoring case
+ * and surrounding whitespace) are true; everything else, including a missing
+ * value, is false.
+ */
+function isTruthy(value: string | undefined): boolean {
+  switch (value?.trim().toLowerCase()) {
+    case "1":
+    case "true":
+    case "yes":
+    case "on":
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ * Lenient positive-integer parser: returns `fallback` when the value is
+ * missing, empty, not an integer, or not greater than zero. Never throws.
+ */
+function parsePositiveInteger(value: string | undefined, fallback: number): number {
+  const candidate = value ? Number(value) : Number.NaN;
+  const usable = Number.isInteger(candidate) && candidate > 0;
+  return usable ? candidate : fallback;
+}
--- a/ai_evals/core/cases.test.ts
+++ b/ai_evals/core/cases.test.ts
@@ -0,0 +1,18 @@
+import { describe, expect, it } from "bun:test";
+import { loadCases } from "./cases";
+
+// Reads the real flow case definitions (not a fixture), so this depends on the
+// "flow-test1-reuse-existing-script" case keeping its backendPreview args.
+describe("loadCases", () => {
+  it("loads backend preview runtime config for opt-in flow cases", async () => {
+    const flowCases = await loadCases("flow");
+    const caseEntry = flowCases.find((entry) => entry.id === "flow-test1-reuse-existing-script");
+
+    // The optional chain makes a missing case fail the comparison instead of throwing.
+    expect(caseEntry?.runtime).toEqual({
+      backendPreview: {
+        args: {
+          a: 2,
+          b: 3,
+        },
+      },
+    });
+  });
+});
--- a/ai_evals/core/cases.ts
+++ b/ai_evals/core/cases.ts
@@ -0,0 +1,73 @@
+import { readFile } from "node:fs/promises";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { parse } from "yaml";
+import type { EvalCase, EvalCaseRuntimeSpec, EvalMode, FlowValidationSpec } from "./types";
+
+// Filesystem path two directories above this module's directory.
+const REPO_ROOT = fileURLToPath(new URL("../../", import.meta.url));
+// Directory holding one `<mode>.yaml` case list per eval mode.
+const CASES_DIR = path.join(REPO_ROOT, "ai_evals", "cases");
+
+/** Shape of one entry in a `<mode>.yaml` case list, before fixture paths are resolved. */
+interface RawEvalCase {
+  id: string;
+  prompt: string;
+  // Fixture path; joined onto the repository root by loadCases unless already absolute.
+  initial?: string;
+  // Fixture path; resolved the same way as `initial`.
+  expected?: string;
+  validate?: FlowValidationSpec;
+  judgeChecklist?: string[];
+  runtime?: EvalCaseRuntimeSpec;
+}
+
+/** Returns the repository root path computed once at module load (REPO_ROOT). */
+export function getRepoRoot(): string {
+  return REPO_ROOT;
+}
+
+/** Returns the path of the `ai_evals` directory directly under the repository root. */
+export function getAiEvalsRoot(): string {
+  const aiEvalsDirName = "ai_evals";
+  return path.join(REPO_ROOT, aiEvalsDirName);
+}
+
+/**
+ * Loads every eval case defined for a mode from `<CASES_DIR>/<mode>.yaml`.
+ *
+ * Relative `initial` / `expected` fixture paths are resolved against the
+ * repository root; all other fields are passed through unchanged.
+ *
+ * @param mode Eval mode whose case file should be read.
+ * @returns The cases in file order.
+ * @throws Error when the file is not a YAML list, or when an entry is not a
+ *   mapping with string `id` and `prompt` fields. File read and YAML parse
+ *   errors propagate unchanged.
+ */
+export async function loadCases(mode: EvalMode): Promise<EvalCase[]> {
+  const filePath = path.join(CASES_DIR, `${mode}.yaml`);
+  const raw = await readFile(filePath, "utf8");
+  const parsed: unknown = parse(raw);
+
+  if (!Array.isArray(parsed)) {
+    throw new Error(`Expected ${filePath} to contain a YAML list of cases`);
+  }
+
+  return parsed.map((entry: unknown, index): EvalCase => {
+    // Fail here with a pointer to the offending entry rather than letting a
+    // case without an id or prompt surface later as an obscure failure.
+    if (!isRawEvalCase(entry)) {
+      throw new Error(
+        `Case #${index + 1} in ${filePath} must be a mapping with string "id" and "prompt" fields`
+      );
+    }
+    return {
+      id: entry.id,
+      prompt: entry.prompt,
+      initialPath: resolveFixturePath(entry.initial),
+      expectedPath: resolveFixturePath(entry.expected),
+      validate: entry.validate,
+      judgeChecklist: entry.judgeChecklist,
+      runtime: entry.runtime,
+    };
+  });
+}
+
+/** Minimal structural check for a parsed YAML entry: only the required `id` and `prompt` are verified. */
+function isRawEvalCase(entry: unknown): entry is RawEvalCase {
+  if (typeof entry !== "object" || entry === null) {
+    return false;
+  }
+  const candidate = entry as Record<string, unknown>;
+  return typeof candidate.id === "string" && typeof candidate.prompt === "string";
+}
+
+/**
+ * Loads the cases for a mode, optionally restricted to specific ids.
+ *
+ * @param mode Eval mode whose cases are loaded.
+ * @param selectedIds Ids to keep; an empty list means "all cases".
+ * @returns The selected cases in the order given by `selectedIds` (repeated
+ *   ids yield repeated cases), or every case in file order.
+ * @throws Error naming every requested id that does not exist.
+ */
+export async function loadSelectedCases(
+  mode: EvalMode,
+  selectedIds: string[]
+): Promise<EvalCase[]> {
+  const available = await loadCases(mode);
+  if (selectedIds.length === 0) {
+    return available;
+  }
+
+  const casesById = new Map<string, EvalCase>();
+  for (const candidate of available) {
+    casesById.set(candidate.id, candidate);
+  }
+
+  const unknownIds = selectedIds.filter((id) => !casesById.has(id));
+  if (unknownIds.length > 0) {
+    const noun = unknownIds.length === 1 ? "case" : "cases";
+    throw new Error(`Unknown ${mode} ${noun}: ${unknownIds.join(", ")}`);
+  }
+
+  return selectedIds.map((id) => casesById.get(id)!);
+}
+
+/**
+ * Resolves a fixture path from a case file: absolute paths are kept as-is,
+ * relative ones are joined onto the repository root, and a missing or empty
+ * value yields `undefined`.
+ */
+function resolveFixturePath(value: string | undefined): string | undefined {
+  if (!value) {
+    return undefined;
+  }
+  if (path.isAbsolute(value)) {
+    return value;
+  }
+  return path.join(REPO_ROOT, value);
+}
--- a/ai_evals/core/files.ts
+++ b/ai_evals/core/files.ts
@@ -0,0 +1,67 @@
+import { access, copyFile, mkdir, readdir, readFile } from "node:fs/promises";
+import path from "node:path";
+
+/**
+ * Reports whether `filePath` is accessible.
+ *
+ * Any failure of `access` (missing path, permission error, ...) is reported
+ * as `false`; this function never rejects.
+ */
+export async function exists(filePath: string): Promise<boolean> {
+  return access(filePath).then(
+    () => true,
+    () => false
+  );
+}
+
+/**
+ * Reads a UTF-8 file and parses it as JSON.
+ *
+ * The result is cast to `T` without any runtime validation. Read errors and
+ * JSON syntax errors propagate to the caller.
+ */
+export async function readJsonFile<T>(filePath: string): Promise<T> {
+  const contents = await readFile(filePath, "utf8");
+  const value: unknown = JSON.parse(contents);
+  return value as T;
+}
+
+/**
+ * Recursively reads every file under `rootDir` as UTF-8 text.
+ *
+ * @param rootDir Directory to walk.
+ * @param options.ignore Entries to skip, matched against either the
+ *   "/"-separated path relative to `rootDir` or the bare entry name.
+ * @returns Map from "/"-separated relative path to file contents.
+ */
+export async function readDirectoryFiles(
+  rootDir: string,
+  options: {
+    ignore?: Set<string>;
+  } = {}
+): Promise<Record<string, string>> {
+  const collected: Record<string, string> = {};
+  const ignored = options.ignore ?? new Set();
+  await walkDirectory(rootDir, "", collected, ignored);
+  return collected;
+}
+
+/**
+ * Recursively copies the contents of `sourceDir` into `targetDir`, creating
+ * `targetDir` and any nested directories as needed. Every entry that is not a
+ * directory is copied with `copyFile`, one entry at a time.
+ */
+export async function copyDirectory(sourceDir: string, targetDir: string): Promise<void> {
+  // Read the source first so that a missing source fails before anything is created.
+  const children = await readdir(sourceDir, { withFileTypes: true });
+  await mkdir(targetDir, { recursive: true });
+
+  for (const child of children) {
+    const from = path.join(sourceDir, child.name);
+    const to = path.join(targetDir, child.name);
+    if (child.isDirectory()) {
+      await copyDirectory(from, to);
+    } else {
+      await mkdir(path.dirname(to), { recursive: true });
+      await copyFile(from, to);
+    }
+  }
+}
+
+/**
+ * Depth-first helper for readDirectoryFiles.
+ *
+ * @param absoluteDir Directory currently being read.
+ * @param relativeDir "/"-separated path of `absoluteDir` relative to the walk
+ *   root ("" for the root itself).
+ * @param output Accumulator filled with relative path -> UTF-8 contents.
+ * @param ignore Entries to skip when either their relative path or their bare
+ *   name is in the set; an ignored directory is not descended into.
+ */
+async function walkDirectory(
+  absoluteDir: string,
+  relativeDir: string,
+  output: Record<string, string>,
+  ignore: Set<string>
+): Promise<void> {
+  for (const entry of await readdir(absoluteDir, { withFileTypes: true })) {
+    const { name } = entry;
+    const relativePath = relativeDir ? `${relativeDir}/${name}` : name;
+    const skipped = ignore.has(relativePath) || ignore.has(name);
+    if (skipped) {
+      continue;
+    }
+
+    const absolutePath = path.join(absoluteDir, name);
+    if (entry.isDirectory()) {
+      await walkDirectory(absolutePath, relativePath, output, ignore);
+    } else {
+      output[relativePath] = await readFile(absolutePath, "utf8");
+    }
+  }
+}
--- a/ai_evals/core/judge.ts
+++ b/ai_evals/core/judge.ts
@@ -0,0 +1,151 @@
+import Anthropic from "@anthropic-ai/sdk";
+import type { EvalMode, JudgeResult } from "./types";
+
+/** Model id used by judgeOutput when the caller does not pass one. */
+export const DEFAULT_JUDGE_MODEL = "claude-sonnet-4-6";
+
+// Name of the single tool the judge is forced to call to return its verdict.
+const JUDGE_TOOL_NAME = "submit_judgement";
+
+/**
+ * Asks an Anthropic model to score a benchmark output against the user prompt,
+ * an optional checklist, and optional initial/expected state.
+ *
+ * The model is forced to answer through the JUDGE_TOOL_NAME tool so the score
+ * and summary arrive as structured data.
+ *
+ * @param input.mode Eval mode, included verbatim in the judge prompt.
+ * @param input.prompt The user prompt given to the evaluated model.
+ * @param input.checklist Optional acceptance criteria shown to the judge.
+ * @param input.initial Optional starting state, rendered as JSON.
+ * @param input.expected Optional reference result, rendered as JSON.
+ * @param input.actual The output being judged, rendered as JSON.
+ * @param input.model Judge model id; defaults to DEFAULT_JUDGE_MODEL.
+ * @returns A JudgeResult. This function does not throw for a missing API key,
+ *   an API error, or a missing tool call: each is reported as
+ *   `success: false` with `score: 0` and an `error` message.
+ */
+export async function judgeOutput(input: {
+  mode: EvalMode;
+  prompt: string;
+  checklist?: string[];
+  initial?: unknown;
+  expected?: unknown;
+  actual: unknown;
+  model?: string;
+}): Promise<JudgeResult> {
+  const apiKey = process.env.ANTHROPIC_API_KEY;
+  if (!apiKey) {
+    return {
+      success: false,
+      score: 0,
+      summary: "Judge unavailable",
+      error: "ANTHROPIC_API_KEY is not set",
+    };
+  }
+
+  const client = new Anthropic({ apiKey });
+  const model = input.model ?? DEFAULT_JUDGE_MODEL;
+
+  // Grading policy for the judge; the rules are joined with blank lines.
+  const system = [
+    "You evaluate benchmark outputs for Windmill AI generation.",
+    "Deterministic checks already run separately. Focus on whether the final output satisfies the user request.",
+    "If expected state is provided, treat it as a valid example and reward semantically equivalent outputs.",
+    "If a checklist is provided, treat it as the explicit acceptance criteria for this case.",
+    "Be strict about missing requested functionality.",
+    "When the prompt wording is ambiguous, prefer the checklist over inferred structural requirements.",
+    "Do not invent additional Windmill-specific constraints that are not explicit in the prompt, checklist, or expected state.",
+    "Do not lower the score just because the output uses a different but valid Windmill idiom, naming choice, or equivalent field shape.",
+    "Do not require exact ids, exact topology, or exact field names unless the prompt, checklist, or expected state clearly requires them.",
+    `Always respond by calling the ${JUDGE_TOOL_NAME} tool exactly once.`,
+  ].join("\n\n");
+
+  // Case-specific material; absent sections are rendered as "(none)".
+  const user = [
+    `Mode: ${input.mode}`,
+    "",
+    "User prompt:",
+    input.prompt,
+    "",
+    "Checklist:",
+    formatChecklist(input.checklist),
+    "",
+    "Initial state:",
+    formatJsonBlock(input.initial),
+    "",
+    "Expected state:",
+    formatJsonBlock(input.expected),
+    "",
+    "Actual result:",
+    formatJsonBlock(input.actual),
+  ].join("\n");
+
+  try {
+    const response = await client.messages.create({
+      model,
+      max_tokens: 1024,
+      temperature: 0,
+      system,
+      messages: [{ role: "user", content: user }],
+      tools: [
+        {
+          name: JUDGE_TOOL_NAME,
+          description: "Submit the benchmark judgement as structured data.",
+          input_schema: {
+            type: "object",
+            properties: {
+              score: {
+                type: "integer",
+                minimum: 0,
+                maximum: 100,
+              },
+              summary: {
+                type: "string",
+              },
+            },
+            required: ["score", "summary"],
+          },
+        },
+      ],
+      // Force the judgement tool so the reply is structured rather than free text.
+      tool_choice: {
+        type: "tool",
+        name: JUDGE_TOOL_NAME,
+        disable_parallel_tool_use: true,
+      },
+    });
+
+    const toolUseBlock = response.content.find(
+      (block): block is Anthropic.ToolUseBlock =>
+        block.type === "tool_use" && block.name === JUDGE_TOOL_NAME
+    );
+
+    if (!toolUseBlock) {
+      return {
+        success: false,
+        score: 0,
+        summary: "Judge returned no tool output",
+        error: "Expected structured tool output from judge",
+      };
+    }
+
+    // NOTE(review): the tool input is cast, not validated; normalizeScore
+    // guards the score, but `summary` is passed through as received.
+    const parsed = toolUseBlock.input as {
+      score: number;
+      summary: string;
+    };
+
+    return {
+      success: true,
+      score: normalizeScore(parsed.score),
+      summary: parsed.summary,
+    };
+  } catch (error) {
+    // API and network failures are reported in the result instead of thrown.
+    const message = error instanceof Error ? error.message : String(error);
+    return {
+      success: false,
+      score: 0,
+      summary: "Judge failed",
+      error: message,
+    };
+  }
+}
+
+/** Renders a value as 2-space-indented JSON for the judge prompt; `undefined` becomes "(none)". */
+function formatJsonBlock(value: unknown): string {
+  return value === undefined ? "(none)" : JSON.stringify(value, null, 2);
+}
+
+/** Renders checklist items as "- item" lines; a missing or empty checklist becomes "(none)". */
+function formatChecklist(checklist: string[] | undefined): string {
+  if (!checklist?.length) {
+    return "(none)";
+  }
+
+  const bullets: string[] = [];
+  for (const item of checklist) {
+    bullets.push(`- ${item}`);
+  }
+  return bullets.join("\n");
+}
+
+/** Rounds a judge score to the nearest integer and clamps it to 0..100; non-finite input yields 0. */
+function normalizeScore(value: number): number {
+  const lowest = 0;
+  const highest = 100;
+  if (!Number.isFinite(value)) {
+    return lowest;
+  }
+  const rounded = Math.round(value);
+  return Math.min(highest, Math.max(lowest, rounded));
+}
--- a/ai_evals/core/models.test.ts
+++ b/ai_evals/core/models.test.ts
@@ -0,0 +1,29 @@
+import { describe, expect, it } from "bun:test";
+import { resolveEvalModel } from "./models";
+
+// Checks alias resolution and per-mode support for the Gemini registry entries.
+describe("resolveEvalModel", () => {
+  // Each alias must resolve to the matching googleai frontend configuration.
+  it("supports Gemini aliases for frontend evals", () => {
+    expect(resolveEvalModel("flow", "gemini").frontend).toEqual({
+      provider: "googleai",
+      model: "gemini-2.5-flash",
+    });
+    expect(resolveEvalModel("app", "gemini-pro").frontend).toEqual({
+      provider: "googleai",
+      model: "gemini-2.5-pro",
+    });
+    expect(resolveEvalModel("script", "gemini-3-flash-preview").frontend).toEqual({
+      provider: "googleai",
+      model: "gemini-3-flash-preview",
+    });
+    expect(resolveEvalModel("flow", "gemini-3.1-pro-preview").frontend).toEqual({
+      provider: "googleai",
+      model: "gemini-3.1-pro-preview",
+    });
+  });
+
+  // The error names the resolved model id ("gemini-flash"), not the alias that was passed.
+  it("rejects Gemini aliases for cli evals", () => {
+    expect(() => resolveEvalModel("cli", "gemini")).toThrow(
+      "Model gemini-flash is not supported for cli mode"
+    );
+  });
+});
--- a/ai_evals/core/models.ts
+++ b/ai_evals/core/models.ts
@@ -0,0 +1,185 @@
+import type { EvalMode } from "./types";
+
+/** Provider and model id used when a model runs in a non-cli eval mode. */
+export interface FrontendEvalModelConfig {
+  provider: "anthropic" | "openai" | "googleai";
+  model: string;
+}
+
+/** Provider and model name used when a model runs in cli mode; only "anthropic" is allowed. */
+export interface CliEvalModelConfig {
+  provider: "anthropic";
+  model: string;
+}
+
+/** One entry of the eval model registry. */
+export interface EvalModelSpec {
+  // Canonical identifier; also accepted as an alias during lookup.
+  id: string;
+  // Human-readable name shown in help text.
+  label: string;
+  // Alternative names, matched case-insensitively.
+  aliases: string[];
+  // Present when the model can be used for the non-cli modes.
+  frontend?: FrontendEvalModelConfig;
+  // Present when the model can be used for cli mode.
+  cli?: CliEvalModelConfig;
+}
+
+/**
+ * Registry of models available to the evals.
+ *
+ * The first entry is the default returned by getDefaultEvalModel. Entries
+ * without a `cli` config are rejected for cli mode, and entries without a
+ * `frontend` config are rejected for every other mode (see resolveEvalModel).
+ */
+export const EVAL_MODELS: EvalModelSpec[] = [
+  {
+    id: "haiku",
+    label: "Claude Haiku 4.5",
+    aliases: [
+      "haiku",
+      "haiku-4.5",
+      "claude-haiku",
+      "claude-haiku-4.5",
+      "claude-haiku-4-5",
+      "claude-haiku-4-5-20251001",
+    ],
+    frontend: {
+      provider: "anthropic",
+      model: "claude-haiku-4-5-20251001",
+    },
+    cli: {
+      provider: "anthropic",
+      model: "haiku",
+    },
+  },
+  {
+    id: "sonnet",
+    label: "Claude Sonnet 4.5",
+    aliases: [
+      "sonnet",
+      "sonnet-4.5",
+      "claude-sonnet",
+      "claude-sonnet-4.5",
+      "claude-sonnet-4-5",
+      "claude-sonnet-4-5-20250929",
+    ],
+    frontend: {
+      provider: "anthropic",
+      model: "claude-sonnet-4-5-20250929",
+    },
+    cli: {
+      provider: "anthropic",
+      model: "sonnet",
+    },
+  },
+  {
+    id: "opus",
+    label: "Claude Opus 4.6",
+    aliases: [
+      "opus",
+      "opus-4.6",
+      "claude-opus",
+      "claude-opus-4.6",
+      "claude-opus-4-6",
+    ],
+    frontend: {
+      provider: "anthropic",
+      model: "claude-opus-4-6",
+    },
+    cli: {
+      provider: "anthropic",
+      model: "opus",
+    },
+  },
+  {
+    id: "4o",
+    label: "GPT-4o",
+    aliases: ["4o", "gpt-4o"],
+    frontend: {
+      provider: "openai",
+      model: "gpt-4o",
+    },
+  },
+  {
+    id: "gemini-flash",
+    label: "Gemini 2.5 Flash",
+    aliases: ["gemini", "gemini-flash", "gemini-2.5-flash"],
+    frontend: {
+      provider: "googleai",
+      model: "gemini-2.5-flash",
+    },
+  },
+  {
+    id: "gemini-pro",
+    label: "Gemini 2.5 Pro",
+    aliases: ["gemini-pro", "gemini-2.5-pro"],
+    frontend: {
+      provider: "googleai",
+      model: "gemini-2.5-pro",
+    },
+  },
+  {
+    id: "gemini-3-flash-preview",
+    label: "Gemini 3 Flash Preview",
+    aliases: ["gemini-3-flash-preview", "gemini-3-flash"],
+    frontend: {
+      provider: "googleai",
+      model: "gemini-3-flash-preview",
+    },
+  },
+  {
+    id: "gemini-3.1-pro-preview",
+    label: "Gemini 3.1 Pro Preview",
+    aliases: ["gemini-3.1-pro-preview", "gemini-3.1-pro", "gemini-3-pro-preview"],
+    frontend: {
+      provider: "googleai",
+      model: "gemini-3.1-pro-preview",
+    },
+  },
+];
+
+/**
+ * Resolves a model alias (or the mode's default when no alias is given) and
+ * checks that the model supports the requested mode.
+ *
+ * @param mode Eval mode the model will be used for.
+ * @param alias Model id or alias; a missing or empty value selects the default.
+ * @returns The matching registry entry.
+ * @throws Error when the alias is unknown, or when the model lacks the
+ *   configuration required by `mode` (`cli` for cli mode, `frontend` otherwise).
+ */
+export function resolveEvalModel(mode: EvalMode, alias?: string): EvalModelSpec {
+  let spec: EvalModelSpec | undefined;
+  if (alias) {
+    spec = findEvalModel(alias);
+  } else {
+    spec = getDefaultEvalModel(mode);
+  }
+  if (!spec) {
+    throw new Error(`Unknown model: ${alias}`);
+  }
+
+  const requiredConfig = mode === "cli" ? spec.cli : spec.frontend;
+  if (!requiredConfig) {
+    throw new Error(`Model ${spec.id} is not supported for ${mode} mode`);
+  }
+
+  return spec;
+}
+
+/**
+ * Builds the help listing of models, one per line:
+ * `  <id> <label> (<supported modes>)`.
+ *
+ * The id column is padded to the longest registered id (never less than 8
+ * characters) so that labels stay aligned even for long ids such as
+ * "gemini-3.1-pro-preview".
+ */
+export function getEvalModelHelpText(): string {
+  const idColumnWidth = EVAL_MODELS.reduce(
+    (width, model) => Math.max(width, model.id.length),
+    8
+  );
+
+  return EVAL_MODELS.map((model) => {
+    const modes = [
+      ...(model.frontend ? ["flow", "script", "app"] : []),
+      ...(model.cli ? ["cli"] : []),
+    ];
+    return `  ${model.id.padEnd(idColumnWidth)} ${model.label} (${modes.join(", ")})`;
+  }).join("\n");
+}
+
+/**
+ * Formats the `provider:model` label recorded for a run, using the cli config
+ * in cli mode and the frontend config otherwise.
+ *
+ * @throws Error (from getCliEvalModel / getFrontendEvalModel) when the model
+ *   has no configuration for the requested mode, instead of failing with an
+ *   opaque property access on `undefined`.
+ */
+export function formatRunModelLabel(mode: EvalMode, model: EvalModelSpec): string {
+  const config = mode === "cli" ? getCliEvalModel(model) : getFrontendEvalModel(model);
+  return `${config.provider}:${config.model}`;
+}
+
+/**
+ * Returns the model's frontend configuration.
+ *
+ * @throws Error when the model has no `frontend` entry.
+ */
+export function getFrontendEvalModel(model: EvalModelSpec): FrontendEvalModelConfig {
+  const { frontend, id } = model;
+  if (frontend) {
+    return frontend;
+  }
+  throw new Error(`Model ${id} does not support frontend evals`);
+}
+
+/**
+ * Returns the model's cli configuration.
+ *
+ * @throws Error when the model has no `cli` entry.
+ */
+export function getCliEvalModel(model: EvalModelSpec): CliEvalModelConfig {
+  const { cli, id } = model;
+  if (cli) {
+    return cli;
+  }
+  throw new Error(`Model ${id} does not support cli evals`);
+}
+
+/**
+ * Returns the default model for a mode.
+ *
+ * Every mode currently shares the same default: the first registry entry.
+ * `mode` is kept in the signature so a per-mode default can be introduced
+ * without touching callers.
+ *
+ * @throws Error when the registry is empty.
+ */
+function getDefaultEvalModel(mode: EvalMode): EvalModelSpec {
+  const [defaultModel] = EVAL_MODELS;
+  if (!defaultModel) {
+    throw new Error(`No eval models are registered (requested default for ${mode} mode)`);
+  }
+  return defaultModel;
+}
+
+/**
+ * Looks up a registry entry by id or alias, ignoring case and surrounding
+ * whitespace. Returns the first match in registry order, or `undefined`.
+ */
+function findEvalModel(alias: string): EvalModelSpec | undefined {
+  const wanted = alias.trim().toLowerCase();
+  for (const model of EVAL_MODELS) {
+    const names = [model.id, ...model.aliases];
+    if (names.some((name) => name.toLowerCase() === wanted)) {
+      return model;
+    }
+  }
+  return undefined;
+}
--- a/ai_evals/core/results.ts
+++ b/ai_evals/core/results.ts
@@ -0,0 +1,296 @@
+import { appendFile, mkdir, rm, writeFile } from "node:fs/promises";
+import path from "node:path";
+import { execFileSync } from "node:child_process";
+import { getAiEvalsRoot, getRepoRoot } from "./cases";
+import type {
+  BenchmarkArtifactFile,
+  BenchmarkCaseResult,
+  BenchmarkRunResult,
+  BenchmarkTokenUsage,
+  EvalMode,
+} from "./types";
+
+/**
+ * Writes a run result as pretty-printed JSON (with a trailing newline).
+ *
+ * @param result Run result to persist; per-attempt `artifactFiles` are omitted
+ *   from the written JSON.
+ * @param outputPath Destination file; defaults to a timestamped file under the
+ *   ai_evals `results` directory.
+ * @returns The path that was written.
+ */
+export async function writeRunResult(
+  result: BenchmarkRunResult,
+  outputPath?: string
+): Promise<string> {
+  const destination = resolveRunOutputPath(result.mode, outputPath);
+  await mkdir(path.dirname(destination), { recursive: true });
+  const payload = `${JSON.stringify(toSerializableRunResult(result), null, 2)}\n`;
+  await writeFile(destination, payload, "utf8");
+  return destination;
+}
+
+/**
+ * Appends a one-line JSON summary of the run to a JSONL history file,
+ * creating the parent directory if needed.
+ *
+ * @param result Run result to summarize.
+ * @param historyPath Target file; defaults to the per-mode history file.
+ * @returns The path that was appended to.
+ */
+export async function appendHistoryRecord(
+  result: BenchmarkRunResult,
+  historyPath = resolveHistoryPath(result.mode)
+): Promise<string> {
+  const parentDir = path.dirname(historyPath);
+  await mkdir(parentDir, { recursive: true });
+  const line = `${JSON.stringify(toHistoryRecord(result))}\n`;
+  await appendFile(historyPath, line, "utf8");
+  return historyPath;
+}
+
+/**
+ * Writes each attempt's artifact files under
+ * `<artifact root>/<case id>/attempt-<n>/`.
+ *
+ * The artifact root is derived from the result output path and is removed
+ * recursively before writing, so artifacts from an earlier run at the same
+ * path are discarded.
+ *
+ * Mutates `result`: sets `artifactsPath` on every attempt (null when the
+ * attempt has no artifact files) and on the run itself.
+ *
+ * @param result Run result whose attempts may carry `artifactFiles`.
+ * @param outputPath Result file path used to derive the artifact root;
+ *   defaults as in resolveRunOutputPath.
+ * @returns The artifact root, or null when no attempt had any artifact file.
+ */
+export async function writeRunArtifacts(
+  result: BenchmarkRunResult,
+  outputPath?: string
+): Promise<string | null> {
+  const targetPath = resolveRunOutputPath(result.mode, outputPath);
+  const artifactRoot = defaultArtifactsRoot(targetPath);
+
+  // Start from a clean directory; `force` makes a missing root a no-op.
+  await rm(artifactRoot, { recursive: true, force: true });
+
+  let wroteArtifacts = false;
+  for (const caseResult of result.cases) {
+    for (const attempt of caseResult.attempts) {
+      const artifactFiles = attempt.artifactFiles ?? [];
+      if (artifactFiles.length === 0) {
+        attempt.artifactsPath = null;
+        continue;
+      }
+
+      const attemptDir = path.join(artifactRoot, caseResult.id, `attempt-${attempt.attempt}`);
+      await writeArtifactFiles(attemptDir, artifactFiles);
+      attempt.artifactsPath = attemptDir;
+      wroteArtifacts = true;
+    }
+  }
+
+  result.artifactsPath = wroteArtifacts ? artifactRoot : null;
+  return result.artifactsPath ?? null;
+}
+
+/**
+ * Aggregates per-case results into a run-level summary.
+ *
+ * @param input.mode Eval mode of the run.
+ * @param input.runs Configured number of runs, recorded as-is.
+ * @param input.runModel Label of the evaluated model, recorded as-is.
+ * @param input.judgeModel Label of the judge model, recorded as-is.
+ * @param input.caseResults Per-case results whose attempts are aggregated.
+ * @returns The run result, stamped with the current time and the git SHA
+ *   (null when git is unavailable). Pass rate and average duration are 0 when
+ *   there are no attempts.
+ */
+export function buildRunResult(input: {
+  mode: EvalMode;
+  runs: number;
+  runModel: string | null;
+  judgeModel: string | null;
+  caseResults: BenchmarkCaseResult[];
+}): BenchmarkRunResult {
+  const attemptCount = input.caseResults.reduce((sum, entry) => sum + entry.attempts.length, 0);
+  const passedAttempts = input.caseResults.reduce(
+    (sum, entry) => sum + entry.attempts.filter((attempt) => attempt.passed).length,
+    0
+  );
+  const durationTotal = input.caseResults.reduce(
+    (sum, entry) => sum + entry.attempts.reduce((inner, attempt) => inner + attempt.durationMs, 0),
+    0
+  );
+  // Stays null unless at least one attempt reported token usage.
+  const tokenUsageTotal = input.caseResults.reduce<BenchmarkTokenUsage | null>(
+    (sum, entry) => {
+      for (const attempt of entry.attempts) {
+        if (!attempt.tokenUsage) {
+          continue;
+        }
+        sum ??= { prompt: 0, completion: 0, total: 0 };
+        sum.prompt += attempt.tokenUsage.prompt;
+        sum.completion += attempt.tokenUsage.completion;
+        sum.total += attempt.tokenUsage.total;
+      }
+      return sum;
+    },
+    null
+  );
+
+  return {
+    version: 1,
+    mode: input.mode,
+    createdAt: new Date().toISOString(),
+    gitSha: getGitSha(),
+    runs: input.runs,
+    runModel: input.runModel,
+    judgeModel: input.judgeModel,
+    caseCount: input.caseResults.length,
+    attemptCount,
+    passedAttempts,
+    passRate: attemptCount === 0 ? 0 : passedAttempts / attemptCount,
+    averageDurationMs: attemptCount === 0 ? 0 : durationTotal / attemptCount,
+    totalTokenUsage: tokenUsageTotal,
+    // Averaged over ALL attempts, including those that reported no token usage.
+    averageTokenUsagePerAttempt:
+      attemptCount === 0 || !tokenUsageTotal
+        ? null
+        : {
+            prompt: tokenUsageTotal.prompt / attemptCount,
+            completion: tokenUsageTotal.completion / attemptCount,
+            total: tokenUsageTotal.total / attemptCount,
+          },
+    cases: input.caseResults,
+  };
+}
+
+/**
+ * Builds the human-readable summary of a run: mode, pass rate, average
+ * duration, and a list of failed attempts.
+ *
+ * At most 10 failures are listed; when more exist, a final
+ * "... and N more" line reports how many were left out so the truncation is
+ * visible to the reader.
+ */
+export function formatRunSummary(result: BenchmarkRunResult): string {
+  const maxListedFailures = 10;
+  const lines = [
+    `${result.mode} benchmark complete`,
+    `Pass rate: ${formatPercent(result.passRate)} (${result.passedAttempts}/${result.attemptCount})`,
+    `Average duration: ${Math.round(result.averageDurationMs)}ms`,
+  ];
+
+  const failures = collectFailures(result);
+  if (failures.length > 0) {
+    lines.push("Failures:");
+    for (const entry of failures.slice(0, maxListedFailures)) {
+      lines.push(`- ${entry}`);
+    }
+    const hiddenCount = failures.length - maxListedFailures;
+    if (hiddenCount > 0) {
+      lines.push(`... and ${hiddenCount} more`);
+    }
+  }
+
+  return lines.join("\n");
+}
+
+/**
+ * Lists one line per failed attempt, in case and attempt order:
+ * `<case id> attempt <n>: <reason>`.
+ *
+ * The reason is the comma-separated names of the failed checks, or the
+ * attempt's error message, or the literal "failed" when neither is available.
+ */
+function collectFailures(result: BenchmarkRunResult): string[] {
+  return result.cases.flatMap((caseResult) =>
+    caseResult.attempts
+      .filter((attempt) => !attempt.passed)
+      .map((attempt) => {
+        const failedNames = attempt.checks.flatMap((check) => (check.passed ? [] : [check.name]));
+        const reason = failedNames.join(", ") || attempt.error || "failed";
+        return `${caseResult.id} attempt ${attempt.attempt}: ${reason}`;
+      })
+  );
+}
+
+/**
+ * Builds the default result file name `<timestamp>__<mode>.json`, where the
+ * timestamp is the current ISO-8601 time with ":" replaced by "-".
+ */
+function defaultFileName(mode: EvalMode): string {
+  const timestamp = new Date().toISOString().replaceAll(":", "-");
+  return `${timestamp}__${mode}.json`;
+}
+
+/**
+ * Returns `outputPath` when one is given; otherwise a fresh timestamped path
+ * under the ai_evals `results` directory.
+ */
+export function resolveRunOutputPath(mode: EvalMode, outputPath?: string): string {
+  if (outputPath != null) {
+    return outputPath;
+  }
+  const resultsDir = path.join(getAiEvalsRoot(), "results");
+  return path.join(resultsDir, defaultFileName(mode));
+}
+
+/** Returns the per-mode JSONL history file path: `<ai_evals>/history/<mode>.jsonl`. */
+export function resolveHistoryPath(mode: EvalMode): string {
+  const historyFileName = `${mode}.jsonl`;
+  return path.join(getAiEvalsRoot(), "history", historyFileName);
+}
+
+/**
+ * Derives the artifact directory for a result file.
+ *
+ * `foo/run.json` maps to `foo/run`; any other path gets `.artifacts` appended.
+ *
+ * A path whose file name is exactly ".json" is NOT stripped: doing so would
+ * yield the parent directory (or an empty string), and the artifact root is
+ * removed recursively before artifacts are written.
+ */
+function defaultArtifactsRoot(resultPath: string): string {
+  const jsonExtension = ".json";
+  const hasStem = path.basename(resultPath).length > jsonExtension.length;
+  if (resultPath.endsWith(jsonExtension) && hasStem) {
+    return resultPath.slice(0, -jsonExtension.length);
+  }
+  return `${resultPath}.artifacts`;
+}
+
+/**
+ * Writes each artifact file under `rootDir`, creating parent directories as
+ * needed. Artifact paths are normalized first, so a path containing "." or
+ * ".." segments is rejected by normalizeArtifactPath.
+ */
+async function writeArtifactFiles(
+  rootDir: string,
+  files: BenchmarkArtifactFile[]
+): Promise<void> {
+  for (const { path: artifactPath, content } of files) {
+    const destination = path.join(rootDir, normalizeArtifactPath(artifactPath));
+    await mkdir(path.dirname(destination), { recursive: true });
+    await writeFile(destination, content, "utf8");
+  }
+}
+
+/**
+ * Converts an artifact path to a safe, relative, "/"-separated form:
+ * backslashes become "/", and leading, trailing, or repeated separators are
+ * dropped.
+ *
+ * @throws Error when the path has no segments or contains a "." or ".."
+ *   segment.
+ */
+function normalizeArtifactPath(filePath: string): string {
+  const segments = filePath
+    .replaceAll("\\", "/")
+    .replace(/^\/+/, "")
+    .split("/")
+    .filter((segment) => segment.length > 0);
+
+  const hasDotSegment = segments.includes(".") || segments.includes("..");
+  if (segments.length === 0 || hasDotSegment) {
+    throw new Error(`Invalid artifact path: ${filePath}`);
+  }
+  return segments.join("/");
+}
+
+/**
+ * Returns a copy of the run result with `artifactFiles` removed from every
+ * attempt. The input is not modified.
+ */
+function toSerializableRunResult(result: BenchmarkRunResult): BenchmarkRunResult {
+  const cases = result.cases.map((caseResult) => {
+    const attempts = caseResult.attempts.map((attempt) => {
+      const { artifactFiles: _omitted, ...withoutArtifactFiles } = attempt;
+      return withoutArtifactFiles;
+    });
+    return { ...caseResult, attempts };
+  });
+  return { ...result, cases };
+}
+
+/**
+ * Condenses a run result into the compact record appended to the history
+ * file: run-level metrics plus per-case aggregates, without per-attempt detail.
+ *
+ * Judge-score averages only count attempts with a numeric `judgeScore` and
+ * are null when there are none. Token averages are divided by the total
+ * attempt count and are null when no attempt reported usage.
+ */
+function toHistoryRecord(result: BenchmarkRunResult) {
+  // Numeric judge scores across every attempt of every case.
+  const judgeScores = result.cases.flatMap((caseResult) =>
+    caseResult.attempts.flatMap((attempt) =>
+      typeof attempt.judgeScore === "number" ? [attempt.judgeScore] : []
+    )
+  );
+
+  return {
+    createdAt: result.createdAt,
+    gitSha: result.gitSha,
+    mode: result.mode,
+    runs: result.runs,
+    runModel: result.runModel,
+    judgeModel: result.judgeModel,
+    caseCount: result.caseCount,
+    attemptCount: result.attemptCount,
+    passedAttempts: result.passedAttempts,
+    passRate: result.passRate,
+    averageDurationMs: result.averageDurationMs,
+    averageJudgeScore:
+      judgeScores.length === 0
+        ? null
+        : judgeScores.reduce((sum, score) => sum + score, 0) / judgeScores.length,
+    averageTokenUsagePerAttempt: result.averageTokenUsagePerAttempt ?? null,
+    // A case is listed when at least one of its attempts failed.
+    failedCaseIds: Array.from(
+      new Set(
+        result.cases
+          .filter((caseResult) => caseResult.attempts.some((attempt) => !attempt.passed))
+          .map((caseResult) => caseResult.id)
+      )
+    ),
+    cases: result.cases.map((caseResult) => {
+      const attemptCount = caseResult.attempts.length;
+      const passedAttempts = caseResult.attempts.filter((attempt) => attempt.passed).length;
+      const totalDurationMs = caseResult.attempts.reduce(
+        (sum, attempt) => sum + attempt.durationMs,
+        0
+      );
+      // Shadows the run-level `judgeScores`: here it covers this case only.
+      const judgeScores = caseResult.attempts.flatMap((attempt) =>
+        typeof attempt.judgeScore === "number" ? [attempt.judgeScore] : []
+      );
+      const totalTokenUsage = caseResult.attempts.reduce<BenchmarkTokenUsage | null>(
+        (sum, attempt) => {
+          if (!attempt.tokenUsage) {
+            return sum;
+          }
+          sum ??= { prompt: 0, completion: 0, total: 0 };
+          sum.prompt += attempt.tokenUsage.prompt;
+          sum.completion += attempt.tokenUsage.completion;
+          sum.total += attempt.tokenUsage.total;
+          return sum;
+        },
+        null
+      );
+
+      return {
+        id: caseResult.id,
+        attemptCount,
+        passedAttempts,
+        passRate: attemptCount === 0 ? 0 : passedAttempts / attemptCount,
+        averageDurationMs: attemptCount === 0 ? 0 : totalDurationMs / attemptCount,
+        averageJudgeScore:
+          judgeScores.length === 0
+            ? null
+            : judgeScores.reduce((sum, score) => sum + score, 0) / judgeScores.length,
+        averageTokenUsagePerAttempt:
+          attemptCount === 0 || !totalTokenUsage
+            ? null
+            : {
+                prompt: totalTokenUsage.prompt / attemptCount,
+                completion: totalTokenUsage.completion / attemptCount,
+                total: totalTokenUsage.total / attemptCount,
+              },
+      };
+    }),
+  };
+}
+
+/**
+ * Returns the commit SHA of HEAD for the repository root, or null when the
+ * `git rev-parse HEAD` call fails for any reason. stderr is discarded.
+ */
+function getGitSha(): string | null {
+  let stdout: string;
+  try {
+    stdout = execFileSync("git", ["rev-parse", "HEAD"], {
+      cwd: getRepoRoot(),
+      encoding: "utf8",
+      stdio: ["ignore", "pipe", "ignore"],
+    });
+  } catch {
+    return null;
+  }
+  return stdout.trim();
+}
+
+/** Formats a ratio (e.g. 0.25) as a percentage with one decimal place (e.g. "25.0%"). */
+function formatPercent(value: number): string {
+  const scaled = value * 100;
+  const digits = scaled.toFixed(1);
+  return `${digits}%`;
+}
--- a/ai_evals/core/runSuite.ts
+++ b/ai_evals/core/runSuite.ts
@@ -0,0 +1,301 @@
+import { judgeOutput, DEFAULT_JUDGE_MODEL } from "./judge";
+import type {
+  BenchmarkAttemptResult,
+  BenchmarkCaseResult,
+  BenchmarkCheck,
+  EvalCase,
+  FrontendBenchmarkProgressEvent,
+  ModeRunner,
+} from "./types";
+
+/**
+ * Runs every eval case through the given mode runner using a bounded pool of
+ * concurrent workers.
+ *
+ * Workers claim cases from a shared cursor, so each case is handled by exactly
+ * one worker and the attempts of a single case run one after another. Results
+ * are stored by case index, so the returned array follows the order of
+ * `input.cases` regardless of completion order.
+ *
+ * For every mode except `"cli"` a `run-start` progress event is emitted before
+ * any case starts.
+ *
+ * @param input.modeRunner Mode-specific loader/runner/validator.
+ * @param input.cases Cases to execute.
+ * @param input.runs Number of attempts per case.
+ * @param input.runModel Accepted for interface compatibility; not read here.
+ * @param input.judgeModel Judge model; falls back to `DEFAULT_JUDGE_MODEL`
+ * when `null` or omitted.
+ * @param input.concurrency Worker count; falls back to
+ * `input.modeRunner.concurrency`. Normalised to an integer >= 1.
+ * @param input.verbose Forwarded to each attempt; defaults to `false`.
+ * @param input.onProgress Optional progress listener.
+ * @returns One result per case, in input order.
+ */
+export async function runSuite<TInitial, TExpected, TActual>(input: {
+  modeRunner: ModeRunner<TInitial, TExpected, TActual>;
+  cases: EvalCase[];
+  runs: number;
+  runModel: string | null;
+  judgeModel?: string | null;
+  concurrency?: number;
+  verbose?: boolean;
+  onProgress?: (event: FrontendBenchmarkProgressEvent) => void;
+}): Promise<BenchmarkCaseResult[]> {
+  const judgeModel = input.judgeModel ?? DEFAULT_JUDGE_MODEL;
+  // A NaN concurrency (e.g. from an unparsable flag) would survive
+  // `Math.max(1, NaN)` and make `Array.from({ length: NaN })` spawn zero
+  // workers, silently skipping every case. Normalise to an integer >= 1.
+  const flooredConcurrency = Math.floor(input.concurrency ?? input.modeRunner.concurrency);
+  const concurrency = Number.isNaN(flooredConcurrency) ? 1 : Math.max(1, flooredConcurrency);
+  const results = new Array<BenchmarkCaseResult>(input.cases.length);
+  // Index of the next unclaimed case, shared by all workers.
+  let cursor = 0;
+
+  if (input.modeRunner.mode !== "cli") {
+    input.onProgress?.({
+      type: "run-start",
+      surface: input.modeRunner.mode,
+      totalCases: input.cases.length,
+      runs: input.runs,
+      concurrency,
+    });
+  }
+
+  async function worker(): Promise<void> {
+    while (true) {
+      // Claiming is synchronous (no await between read and increment), so two
+      // workers can never pick the same index.
+      const caseIndex = cursor++;
+      if (caseIndex >= input.cases.length) {
+        return;
+      }
+      const evalCase = input.cases[caseIndex];
+      results[caseIndex] = {
+        id: evalCase.id,
+        prompt: evalCase.prompt,
+        initialPath: evalCase.initialPath,
+        expectedPath: evalCase.expectedPath,
+        attempts: await runCaseAttempts({
+          caseIndex,
+          evalCase,
+          runs: input.runs,
+          judgeModel,
+          judgeThreshold: input.modeRunner.judgeThreshold ?? 80,
+          modeRunner: input.modeRunner,
+          totalCases: input.cases.length,
+          verbose: input.verbose ?? false,
+          onProgress: input.onProgress,
+        }),
+      };
+    }
+  }
+
+  // Never start more workers than there are cases to process.
+  await Promise.all(
+    Array.from({ length: Math.min(concurrency, input.cases.length) }, () => worker())
+  );
+
+  return results;
+}
+
+/**
+ * Executes one eval case `input.runs` times, sequentially, and returns one
+ * result per attempt.
+ *
+ * Each attempt loads the initial and expected state, runs the mode runner,
+ * collects the runner's synchronous checks, then (only if the run succeeded)
+ * runs the optional backend validation and the judge. An attempt passes only
+ * when every collected check passed. An exception thrown anywhere inside an
+ * attempt is recorded as a failed attempt with a single "run crashed" check;
+ * it does not abort the remaining attempts.
+ *
+ * Progress events are emitted only for non-CLI modes, and assistant streaming
+ * callbacks are handed to the runner only when `input.verbose` is also set.
+ *
+ * @param input.caseIndex Zero-based index of the case; reported as
+ * `caseNumber = caseIndex + 1`.
+ * @param input.judgeThreshold Minimum judge score for the score check to pass.
+ * @returns Attempt results in execution order (`attempt` is 1-based).
+ */
+async function runCaseAttempts<TInitial, TExpected, TActual>(input: {
+  caseIndex: number;
+  evalCase: EvalCase;
+  runs: number;
+  judgeModel: string;
+  judgeThreshold: number;
+  modeRunner: ModeRunner<TInitial, TExpected, TActual>;
+  totalCases: number;
+  verbose: boolean;
+  onProgress?: (event: FrontendBenchmarkProgressEvent) => void;
+}): Promise<BenchmarkAttemptResult[]> {
+  const attempts: BenchmarkAttemptResult[] = [];
+  // CLI mode has no frontend surface, so no progress events are emitted for it.
+  const surface = input.modeRunner.mode === "cli" ? null : input.modeRunner.mode;
+  // Assistant streaming events are forwarded only in verbose mode.
+  const streamSurface = input.verbose ? surface : null;
+
+  for (let attempt = 1; attempt <= input.runs; attempt += 1) {
+    // Identification fields shared by the run context and every progress
+    // event of this attempt; spread into a fresh object at each use.
+    const position = {
+      caseId: input.evalCase.id,
+      caseNumber: input.caseIndex + 1,
+      totalCases: input.totalCases,
+      attempt,
+      runs: input.runs,
+    };
+
+    if (surface) {
+      input.onProgress?.({ type: "attempt-start", surface, ...position });
+    }
+
+    const startedAt = Date.now();
+
+    try {
+      const initial = await input.modeRunner.loadInitial(input.evalCase.initialPath);
+      const expected = await input.modeRunner.loadExpected(input.evalCase.expectedPath);
+      const run = await input.modeRunner.run(input.evalCase.prompt, initial, {
+        ...position,
+        verbose: input.verbose,
+        onAssistantMessageStart: streamSurface
+          ? () =>
+              input.onProgress?.({
+                type: "assistant-message-start",
+                surface: streamSurface,
+                ...position,
+              })
+          : undefined,
+        onAssistantChunk: streamSurface
+          ? (chunk: string) =>
+              input.onProgress?.({
+                type: "assistant-chunk",
+                surface: streamSurface,
+                ...position,
+                chunk,
+              })
+          : undefined,
+        onAssistantMessageEnd: streamSurface
+          ? () =>
+              input.onProgress?.({
+                type: "assistant-message-end",
+                surface: streamSurface,
+                ...position,
+              })
+          : undefined,
+      });
+      const checks: BenchmarkCheck[] = [
+        buildCheck("run succeeded", run.success, run.error),
+        ...input.modeRunner.validate({
+          evalCase: input.evalCase,
+          prompt: input.evalCase.prompt,
+          initial,
+          expected,
+          actual: run.actual,
+          run,
+        }),
+      ];
+      // Copy before appending: the array returned by `buildArtifacts` belongs
+      // to the runner and must not be mutated by the pushes below.
+      const artifactFiles = [...(input.modeRunner.buildArtifacts?.(run.actual) ?? [])];
+
+      if (run.success && input.modeRunner.backendValidate) {
+        try {
+          const backendValidation = await input.modeRunner.backendValidate({
+            evalCase: input.evalCase,
+            prompt: input.evalCase.prompt,
+            initial,
+            expected,
+            actual: run.actual,
+            run,
+            context: {
+              ...position,
+              verbose: input.verbose,
+              // Backend validation never streams assistant output.
+              onAssistantMessageStart: undefined,
+              onAssistantChunk: undefined,
+              onAssistantMessageEnd: undefined,
+            },
+          });
+
+          if (backendValidation) {
+            checks.push(...backendValidation.checks);
+            artifactFiles.push(...(backendValidation.artifactFiles ?? []));
+          }
+        } catch (error) {
+          // A throwing backend validation fails the attempt through a check
+          // instead of discarding the run's data.
+          checks.push(
+            buildCheck(
+              "backend validation succeeded",
+              false,
+              error instanceof Error ? error.message : String(error)
+            )
+          );
+        }
+      }
+
+      let judgeScore: number | null = null;
+      let judgeSummary: string | null = null;
+
+      if (run.success) {
+        const judge = await judgeOutput({
+          mode: input.modeRunner.mode,
+          prompt: input.evalCase.prompt,
+          checklist: input.evalCase.judgeChecklist,
+          initial,
+          // The expected state is withheld from the judge in CLI mode.
+          expected: input.modeRunner.mode === "cli" ? undefined : expected,
+          actual: run.actual,
+          model: input.judgeModel,
+        });
+
+        // Only trust the score when the judge itself reported success.
+        judgeScore = judge.success ? judge.score : null;
+        judgeSummary = judge.summary;
+        checks.push(buildCheck("judge succeeded", judge.success, judge.error));
+        checks.push(
+          buildCheck(
+            `judge score >= ${input.judgeThreshold}`,
+            (judgeScore ?? 0) >= input.judgeThreshold,
+            judge.success ? `score=${judgeScore}` : judge.error
+          )
+        );
+      }
+
+      const attemptResult: BenchmarkAttemptResult = {
+        attempt,
+        passed: checks.every((check) => check.passed),
+        durationMs: Date.now() - startedAt,
+        assistantMessageCount: run.assistantMessageCount,
+        toolCallCount: run.toolCallCount,
+        toolsUsed: uniqueStrings(run.toolsUsed),
+        skillsInvoked: uniqueStrings(run.skillsInvoked),
+        checks,
+        judgeScore,
+        judgeSummary,
+        error: run.error ?? null,
+        tokenUsage: run.tokenUsage ?? null,
+        artifactsPath: null,
+        artifactFiles,
+      };
+
+      if (surface) {
+        input.onProgress?.({
+          type: "attempt-finish",
+          surface,
+          ...position,
+          passed: attemptResult.passed,
+          durationMs: attemptResult.durationMs,
+          judgeScore: attemptResult.judgeScore,
+          error: attemptResult.error,
+        });
+      }
+
+      attempts.push(attemptResult);
+    } catch (error) {
+      // Anything thrown above (loading, running, validating, judging) is
+      // recorded as a crashed attempt so the remaining attempts still run.
+      const message = error instanceof Error ? error.message : String(error);
+      const failedAttempt: BenchmarkAttemptResult = {
+        attempt,
+        passed: false,
+        durationMs: Date.now() - startedAt,
+        assistantMessageCount: 0,
+        toolCallCount: 0,
+        toolsUsed: [],
+        skillsInvoked: [],
+        checks: [buildCheck("run crashed", false, message)],
+        judgeScore: null,
+        judgeSummary: null,
+        error: message,
+        tokenUsage: null,
+      };
+      if (surface) {
+        input.onProgress?.({
+          type: "attempt-finish",
+          surface,
+          ...position,
+          passed: false,
+          durationMs: failedAttempt.durationMs,
+          judgeScore: null,
+          error: message,
+        });
+      }
+      attempts.push(failedAttempt);
+    }
+  }
+
+  return attempts;
+}
+
+/**
+ * Creates a check record, attaching `details` only when it is a non-empty
+ * string.
+ *
+ * @param name Label of the check.
+ * @param passed Whether the check succeeded.
+ * @param details Optional explanation; `undefined` and `""` are both omitted
+ * from the result.
+ * @returns The check object, without a `details` key when none was given.
+ */
+function buildCheck(name: string, passed: boolean, details?: string): BenchmarkCheck {
+  const check: BenchmarkCheck = { name, passed };
+  if (details) {
+    check.details = details;
+  }
+  return check;
+}
+
+/**
+ * Removes duplicate strings while keeping the order of first occurrence.
+ *
+ * @param values Strings that may contain repeats.
+ * @returns A new array with each distinct string exactly once.
+ */
+function uniqueStrings(values: string[]): string[] {
+  const seen = new Set(values);
+  return Array.from(seen);
+}
--- a/ai_evals/core/types.ts
+++ b/ai_evals/core/types.ts
@@ -0,0 +1,255 @@
+/** Every eval mode name, as a readonly tuple of string literals. */
+export const EVAL_MODES = ["cli", "flow", "script", "app"] as const;
+
+/** Union of the mode names listed in `EVAL_MODES`. */
+export type EvalMode = (typeof EVAL_MODES)[number];
+
+/**
+ * Optional per-case settings nested under `EvalCaseRuntimeSpec.backendPreview`.
+ *
+ * NOTE(review): presumably the arguments and timeout used when the produced
+ * artifact is previewed on a backend — confirm against the consuming runner.
+ */
+export interface EvalCaseRuntimeBackendPreview {
+  args?: Record<string, unknown>;
+  timeoutSeconds?: number;
+}
+
+/** Optional runtime settings attached to an eval case via `EvalCase.runtime`. */
+export interface EvalCaseRuntimeSpec {
+  backendPreview?: EvalCaseRuntimeBackendPreview;
+}
+
+/**
+ * Declarative structural constraints attached to an eval case via
+ * `EvalCase.validate`. Every field is optional.
+ *
+ * NOTE(review): this file only declares the shape; how each rule is enforced
+ * lives in the validators. Field meanings below are inferred from the names
+ * and should be confirmed there.
+ */
+export interface FlowValidationSpec {
+  schemaRequiredPaths?: string[];
+  schemaAnyOf?: Array<{
+    requiredPaths: string[];
+  }>;
+  // presumably: the exact set of top-level step ids vs. a required subset — TODO confirm
+  exactTopLevelStepIds?: string[];
+  topLevelStepIds?: string[];
+  topLevelStepOrder?: string[];
+  topLevelStepTypeCountsAtLeast?: Array<{
+    type: string;
+    count: number;
+  }>;
+  topLevelStepTypes?: Array<{
+    id: string;
+    type: string;
+  }>;
+  // Rules keyed by module `id`.
+  moduleRules?: Array<{
+    id: string;
+    hasStopAfterIf?: boolean;
+    hasStopAfterAllItersIf?: boolean;
+    immediateChildStepIds?: string[];
+    exactImmediateChildStepIds?: string[];
+    immediateChildStepTypes?: Array<{
+      id: string;
+      type: string;
+    }>;
+    requiredInputTransforms?: Array<{
+      type?: string;
+      expr?: string;
+      exprAnyOf?: string[];
+      value?: string | number | boolean | null;
+    }>;
+  }>;
+  // Each rule names a module `id`, a `path` and the primitive value it must equal.
+  moduleFieldRules?: Array<{
+    id: string;
+    path: string;
+    equals: string | number | boolean | null;
+  }>;
+  resolveResultsRefs?: boolean;
+  requireSpecialModules?: Array<"preprocessor_module" | "failure_module">;
+  requireSuspendSteps?: Array<{
+    id: string;
+    requiredEvents?: number;
+    resumeRequiredStringFieldAnyOf?: string[];
+  }>;
+}
+
+/** A single benchmark case: the prompt to run plus optional fixtures and rules. */
+export interface EvalCase {
+  /** Identifier copied into results and progress events. */
+  id: string;
+  /** Prompt handed to the mode runner and to the judge. */
+  prompt: string;
+  /** Passed to `ModeRunner.loadInitial`. */
+  initialPath?: string;
+  /** Passed to `ModeRunner.loadExpected`. */
+  expectedPath?: string;
+  validate?: FlowValidationSpec;
+  /** Forwarded to the judge as its checklist. */
+  judgeChecklist?: string[];
+  runtime?: EvalCaseRuntimeSpec;
+}
+
+/**
+ * Outcome of one named check on an attempt. The suite runner marks an attempt
+ * as passed only when every one of its checks has `passed: true`.
+ */
+export interface BenchmarkCheck {
+  name: string;
+  passed: boolean;
+  /** Optional explanation, e.g. an error message or the judge score. */
+  details?: string;
+}
+
+/** Verdict returned by the judge for one attempt. */
+export interface JudgeResult {
+  /** Whether judging itself worked; the suite runner ignores `score` when false. */
+  success: boolean;
+  score: number;
+  summary: string;
+  /** Failure description, surfaced in the attempt's judge checks. */
+  error?: string;
+}
+
+/**
+ * One file produced for an attempt, held in memory as text.
+ *
+ * NOTE(review): `path` is presumably relative to the attempt's artifacts
+ * directory — confirm against the code that writes artifacts.
+ */
+export interface BenchmarkArtifactFile {
+  path: string;
+  content: string;
+}
+
+/**
+ * Result of `ModeRunner.backendValidate`. The suite runner appends both the
+ * checks and any artifact files to the attempt being evaluated.
+ */
+export interface BackendValidationResult {
+  checks: BenchmarkCheck[];
+  artifactFiles?: BenchmarkArtifactFile[];
+}
+
+/** Token counters for a run; also used for summed and averaged usage. */
+export interface BenchmarkTokenUsage {
+  prompt: number;
+  completion: number;
+  total: number;
+}
+
+/** What a mode runner reports after executing one prompt. */
+export interface ModeRunOutput<TActual> {
+  /** Gates backend validation and judging: both are skipped when false. */
+  success: boolean;
+  /** State produced by the run; passed to validators, the judge and artifacts. */
+  actual: TActual;
+  error?: string;
+  assistantMessageCount: number;
+  toolCallCount: number;
+  /** May contain repeats; the suite runner de-duplicates before recording. */
+  toolsUsed: string[];
+  /** May contain repeats; the suite runner de-duplicates before recording. */
+  skillsInvoked: string[];
+  tokenUsage?: BenchmarkTokenUsage | null;
+}
+
+/**
+ * Position and options of the attempt being executed, passed to
+ * `ModeRunner.run` and `ModeRunner.backendValidate`.
+ */
+export interface ModeRunContext {
+  caseId: string;
+  /** 1-based position of the case within the suite. */
+  caseNumber: number;
+  totalCases: number;
+  /** 1-based attempt number for this case. */
+  attempt: number;
+  /** Total attempts per case. */
+  runs: number;
+  verbose: boolean;
+  // The suite runner supplies the streaming callbacks only for verbose,
+  // non-CLI runs, and always leaves them undefined for backend validation.
+  onAssistantMessageStart?: () => void;
+  onAssistantChunk?: (chunk: string) => void;
+  onAssistantMessageEnd?: () => void;
+}
+
+/**
+ * Mode-specific adapter used by the suite runner to load fixtures, execute a
+ * prompt and validate the outcome.
+ */
+export interface ModeRunner<TInitial, TExpected, TActual> {
+  mode: EvalMode;
+  /** Default worker count when the suite is not given an explicit concurrency. */
+  concurrency: number;
+  /** Minimum passing judge score; the suite runner falls back to 80. */
+  judgeThreshold?: number;
+  /** Called once per attempt with `EvalCase.initialPath`. */
+  loadInitial(path?: string): Promise<TInitial | undefined>;
+  /** Called once per attempt with `EvalCase.expectedPath`. */
+  loadExpected(path?: string): Promise<TExpected | undefined>;
+  /** Executes the prompt against the initial state. */
+  run(
+    prompt: string,
+    initial: TInitial | undefined,
+    context: ModeRunContext
+  ): Promise<ModeRunOutput<TActual>>;
+  /** Synchronous checks; invoked for every completed run, successful or not. */
+  validate(input: {
+    evalCase: EvalCase;
+    prompt: string;
+    initial: TInitial | undefined;
+    expected: TExpected | undefined;
+    actual: TActual;
+    run: ModeRunOutput<TActual>;
+  }): BenchmarkCheck[];
+  /**
+   * Optional asynchronous validation; invoked only when the run succeeded.
+   * A `null` result contributes nothing; a thrown error is recorded as a
+   * failed "backend validation succeeded" check.
+   */
+  backendValidate?(input: {
+    evalCase: EvalCase;
+    prompt: string;
+    initial: TInitial | undefined;
+    expected: TExpected | undefined;
+    actual: TActual;
+    run: ModeRunOutput<TActual>;
+    context: ModeRunContext;
+  }): Promise<BackendValidationResult | null>;
+  /** Optional conversion of the produced state into artifact files. */
+  buildArtifacts?(actual: TActual): BenchmarkArtifactFile[];
+}
+
+/** Recorded outcome of one attempt at an eval case. */
+export interface BenchmarkAttemptResult {
+  /** 1-based attempt number. */
+  attempt: number;
+  /** True only when every entry in `checks` passed. */
+  passed: boolean;
+  /** Wall-clock duration of the attempt in milliseconds. */
+  durationMs: number;
+  assistantMessageCount: number;
+  toolCallCount: number;
+  toolsUsed: string[];
+  skillsInvoked: string[];
+  checks: BenchmarkCheck[];
+  /** `null` when the run failed, crashed, or the judge did not succeed. */
+  judgeScore: number | null;
+  judgeSummary: string | null;
+  error: string | null;
+  tokenUsage?: BenchmarkTokenUsage | null;
+  // The suite runner records `null` here; presumably filled in later when
+  // artifacts are written to disk — TODO confirm.
+  artifactsPath?: string | null;
+  artifactFiles?: BenchmarkArtifactFile[];
+}
+
+/** All attempts for one eval case, alongside the case fields they were run with. */
+export interface BenchmarkCaseResult {
+  id: string;
+  prompt: string;
+  initialPath?: string;
+  expectedPath?: string;
+  /** One entry per attempt, in execution order. */
+  attempts: BenchmarkAttemptResult[];
+}
+
+/**
+ * Top-level record of one benchmark run: run metadata, aggregate statistics
+ * and the per-case results.
+ *
+ * NOTE(review): the aggregates are computed outside this file; `passRate` is
+ * presumably `passedAttempts / attemptCount` — confirm against the producer.
+ */
+export interface BenchmarkRunResult {
+  /** Format version of this record. */
+  version: 1;
+  mode: EvalMode;
+  createdAt: string;
+  gitSha: string | null;
+  /** Attempts per case. */
+  runs: number;
+  runModel: string | null;
+  judgeModel: string | null;
+  caseCount: number;
+  attemptCount: number;
+  passedAttempts: number;
+  passRate: number;
+  averageDurationMs: number;
+  totalTokenUsage?: BenchmarkTokenUsage | null;
+  averageTokenUsagePerAttempt?: BenchmarkTokenUsage | null;
+  artifactsPath?: string | null;
+  cases: BenchmarkCaseResult[];
+}
+
+/**
+ * Progress notifications delivered to the `onProgress` listener, discriminated
+ * by `type`.
+ *
+ * `surface` excludes `"cli"`, so these events exist only for the other modes.
+ * Apart from `run-start`, every event identifies its attempt with a 1-based
+ * `caseNumber` and `attempt`.
+ */
+export type FrontendBenchmarkProgressEvent =
+  // Emitted once by the suite runner before any case starts.
+  | {
+      type: "run-start";
+      surface: Exclude<EvalMode, "cli">;
+      totalCases: number;
+      runs: number;
+      concurrency: number;
+    }
+  // Emitted at the beginning of each attempt.
+  | {
+      type: "attempt-start";
+      surface: Exclude<EvalMode, "cli">;
+      caseId: string;
+      caseNumber: number;
+      totalCases: number;
+      attempt: number;
+      runs: number;
+    }
+  // Emitted when an attempt completes, whether it passed, failed or crashed.
+  | {
+      type: "attempt-finish";
+      surface: Exclude<EvalMode, "cli">;
+      caseId: string;
+      caseNumber: number;
+      totalCases: number;
+      attempt: number;
+      runs: number;
+      passed: boolean;
+      durationMs: number;
+      judgeScore: number | null;
+      error: string | null;
+    }
+  // The three assistant-* events are produced only for verbose runs.
+  | {
+      type: "assistant-message-start";
+      surface: Exclude<EvalMode, "cli">;
+      caseId: string;
+      caseNumber: number;
+      totalCases: number;
+      attempt: number;
+      runs: number;
+    }
+  | {
+      type: "assistant-chunk";
+      surface: Exclude<EvalMode, "cli">;
+      caseId: string;
+      caseNumber: number;
+      totalCases: number;
+      attempt: number;
+      runs: number;
+      chunk: string;
+    }
+  | {
+      type: "assistant-message-end";
+      surface: Exclude<EvalMode, "cli">;
+      caseId: string;
+      caseNumber: number;
+      totalCases: number;
+      attempt: number;
+      runs: number;
+    };
--- a/ai_evals/core/validators.test.ts
+++ b/ai_evals/core/validators.test.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it } from "bun:test";
+import { validateScriptState } from "./validators";
+
+describe("validateScriptState", () => {
+  // Every fixture in this suite is the same bun script; only the code differs.
+  const greetScript = (code: string) => ({
+    path: "f/evals/greet_user.ts",
+    lang: "bun" as const,
+    code,
+  });
+
+  it("accepts semantically equivalent script implementations", () => {
+    // Same greeting logic; the two differ only in the return annotation,
+    // the indentation character and the trailing semicolon.
+    const actual = greetScript(
+      "export async function main(name: string): Promise<string> {\n  return `Hello, ${name}!`;\n}\n"
+    );
+    const expected = greetScript(
+      "export async function main(name: string) {\n\treturn `Hello, ${name}!`\n}\n"
+    );
+
+    const checks = validateScriptState({ actual, expected });
+
+    expect(checks.every((check) => check.passed)).toBe(true);
+  });
+
+  it("still requires an exported main entrypoint", () => {
+    // `main` is declared but not exported, and no expected state is supplied.
+    const actual = greetScript(
+      "async function main(name: string) {\n  return `Hello, ${name}!`;\n}\n"
+    );
+
+    const checks = validateScriptState({ actual });
+
+    expect(checks).toContainEqual({
+      name: "script exports entrypoint",
+      passed: false,
+    });
+  });
+});
--- a/ai_evals/core/validators.ts
+++ b/ai_evals/core/validators.ts
--- a/Show More
+++ b/Show More