fix: improve flow chat and benchmark coverage (#8825)

* fix: support special flow modules in evals

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* refactor: extract shared flow helper logic

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: make special flow tools openai-compatible

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: improve flow eval prompts and validation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* test: relax flow benchmark overfits

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* test: record updated flow benchmark history

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: address flow review findings

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* refactor: source flow chat special module prompt

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: narrow rawscript helper return type

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* refactor: dedupe flow chat prompt guidance

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: relax flow test10 validation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
centdix
2026-04-15 18:22:39 +02:00
committed by GitHub
parent a3f24aeff8
commit d3cb0c6220
20 changed files with 1961 additions and 207 deletions

View File

@@ -4,10 +4,15 @@ import type { FlowModule, InputTransform } from '../../../../../frontend/src/lib
import type { ExtendedOpenFlow } from '../../../../../frontend/src/lib/components/flows/types'
import type { FlowAIChatHelpers } from '../../../../../frontend/src/lib/components/copilot/chat/flow/core'
import type { ScriptLintResult } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
import { findModuleById } from '../../../../../frontend/src/lib/components/copilot/chat/shared'
import { getSubModules } from '../../../../../frontend/src/lib/components/flows/flowExplorer'
import {
createInlineScriptSession
} from '../../../../../frontend/src/lib/components/copilot/chat/flow/inlineScriptsUtils'
import {
applyFlowJsonUpdate,
getFlowModuleById,
updateRawScriptModuleContent
} from '../../../../../frontend/src/lib/components/copilot/chat/flow/helperUtils'
import {
registerBenchmarkWorkspace,
registerBenchmarkWorkspaceRunnables,
@@ -32,6 +37,8 @@ export interface FlowWorkspaceFixtures {
export async function createFlowFileHelpers(
initialModules: FlowModule[] = [],
initialSchema?: Record<string, any>,
initialPreprocessorModule?: FlowModule,
initialFailureModule?: FlowModule,
workspaceRoot?: string,
workspaceFixtures?: FlowWorkspaceFixtures
): Promise<{
@@ -42,7 +49,11 @@ export async function createFlowFileHelpers(
workspaceDir: string | null
}> {
let flow: ExtendedOpenFlow = {
value: { modules: structuredClone(initialModules) },
value: {
modules: structuredClone(initialModules),
preprocessor_module: structuredClone(initialPreprocessorModule),
failure_module: structuredClone(initialFailureModule)
},
summary: '',
schema: initialSchema ?? {
$schema: 'https://json-schema.org/draft/2020-12/schema',
@@ -76,36 +87,29 @@ export async function createFlowFileHelpers(
getFlowAndSelectedId: () => ({ flow, selectedId: '' }),
getModules: (id?: string) => {
if (!id) return flow.value.modules
const module = findModuleById(flow.value.modules, id)
return module ? [module] : []
const module = getFlowModuleById(flow, id)
return module ? getSubModules(module).flat() : []
},
inlineScriptSession,
setSnapshot: () => {},
revertToSnapshot: () => {},
setCode: async (id: string, code: string) => {
const module = findModuleById(flow.value.modules, id)
if (module && module.value.type === 'rawscript') {
module.value.content = code
}
updateRawScriptModuleContent(flow, id, code)
inlineScriptSession.set(id, code)
await persistFlow()
},
setFlowJson: async (
modules: FlowModule[] | undefined,
schema: Record<string, any> | undefined
schema: Record<string, any> | undefined,
preprocessorModule: FlowModule | null | undefined,
failureModule: FlowModule | null | undefined
) => {
if (modules) {
flow.value.modules = inlineScriptSession.restoreInlineScriptReferences(modules)
const unresolvedRefs = inlineScriptSession.findUnresolvedInlineScriptRefs(flow.value.modules)
if (unresolvedRefs.length > 0) {
throw new Error(
`Unresolved inline script references: ${unresolvedRefs.join(', ')}`
)
}
}
if (schema !== undefined) {
flow.schema = schema
}
applyFlowJsonUpdate(flow, inlineScriptSession, {
modules,
schema,
preprocessorModule,
failureModule
})
await persistFlow()
},
getFlowInputsSchema: async () => flow.schema ?? {},
@@ -122,7 +126,9 @@ export async function createFlowFileHelpers(
JSON.stringify(
{
requestedArgs: args ?? {},
modules: flow.value.modules.map((module) => module.id)
modules: flow.value.modules.map((module) => module.id),
preprocessor_module: flow.value.preprocessor_module?.id ?? null,
failure_module: flow.value.failure_module?.id ?? null
},
null,
2
@@ -136,6 +142,8 @@ export async function createFlowFileHelpers(
result: {
requestedArgs: args ?? {},
modules: flow.value.modules.map((module) => module.id),
preprocessor_module: flow.value.preprocessor_module?.id ?? null,
failure_module: flow.value.failure_module?.id ?? null,
mocked: true
},
logs: 'Mock benchmark flow test run completed successfully.'

View File

@@ -19,6 +19,8 @@ import type { TokenUsage } from '../shared/types'
/**
 * Shape of a flow fixture. Every field is optional, so an empty object is a
 * valid fixture.
 */
export interface FlowFixture {
// Flow body: the ordered module list plus the two optional special modules.
value?: {
modules?: FlowModule[]
// Single optional module (not a list) stored under `preprocessor_module`.
preprocessor_module?: FlowModule
// Single optional module (not a list) stored under `failure_module`.
failure_module?: FlowModule
}
// NOTE(review): presumably the flow's input schema — confirm against callers.
schema?: Record<string, unknown>
}
@@ -54,6 +56,8 @@ export async function runFlowEval(
const { helpers, getFlow, cleanup } = await createFlowFileHelpers(
options?.initialFlow?.value?.modules ?? [],
options?.initialFlow?.schema,
options?.initialFlow?.value?.preprocessor_module,
options?.initialFlow?.value?.failure_module,
workspaceRoot,
options?.workspaceFixtures
)

View File

@@ -136,7 +136,7 @@
- search FAQs
- open a support ticket when needed
After that, log the interaction and return the assistant's response along with any actions it took.
After that, log the interaction and return the assistant's response.
judgeChecklist:
- "the input schema includes `customer_id` and `query_text`"
- the flow loads the customer's profile and order history
@@ -146,24 +146,40 @@
- the assistant can search FAQs
- the assistant can open a support ticket
- the flow logs the interaction
- the final output returns the assistant response along with any actions taken or resulting support action details
- the final output returns the assistant response
- id: flow-test7-simple-modification
prompt: |-
Update this flow so it validates processed data before saving it.
After `process_data`, add a `validate_data` step that checks the data array is not empty.
If the array is empty, it should return an error object with the message `No data to save`.
If the array is empty, the flow should surface the message `No data to save` and prevent saving.
If validation passes, let the save continue normally.
Update `save_results` so it handles the validation result correctly.
Update `save_results` so it uses the validation outcome instead of bypassing it.
initial: ai_evals/fixtures/frontend/flow/initial/test5_initial.json
expected: ai_evals/fixtures/frontend/flow/expected/test5_modify_simple.json
validate:
topLevelStepIds:
- fetch_data
- process_data
- validate_data
topLevelStepOrder:
- fetch_data
- process_data
- validate_data
topLevelStepTypes:
- id: fetch_data
type: rawscript
- id: process_data
type: rawscript
- id: validate_data
type: rawscript
judgeChecklist:
- the updated flow keeps the original fetch and process steps intact
- "a `validate_data` step is added after `process_data`"
- "`validate_data` checks that the processed data array is not empty"
- "empty data returns an error object with the message `No data to save`"
- "`save_results` handles the validation result correctly"
- "when processed data is empty, the flow surfaces the message `No data to save` and does not save results"
- "`save_results` uses the validation outcome instead of reading `results.process_data` directly"
- "exact field names or wrapper object shape for the validation result are not important"
- id: flow-test8-branching-in-loop
prompt: |-
@@ -193,7 +209,29 @@
Update `combine_data` so it merges the enrichment results and sets a `hasFallbacks` flag when any fallback was used.
Keep `get_item` as the first step and `return_result` as the last step.
initial: ai_evals/fixtures/frontend/flow/initial/test7_initial.json
expected: ai_evals/fixtures/frontend/flow/expected/test7_modify_complex.json
validate:
topLevelStepIds:
- get_item
- combine_data
- return_result
topLevelStepOrder:
- get_item
- combine_data
- return_result
topLevelStepTypeCountsAtLeast:
- type: branchall
count: 1
topLevelStepTypes:
- id: get_item
type: rawscript
- id: combine_data
type: rawscript
- id: return_result
type: rawscript
moduleRules:
- id: enrich_price
- id: enrich_inventory
- id: enrich_reviews
judgeChecklist:
- "the updated flow keeps `get_item` as the first step"
- "the updated flow keeps `return_result` as the last step"
@@ -206,14 +244,42 @@
prompt: |-
Create a flow that keeps incrementing a counter until it reaches a target value.
The input should include a number field named `target`.
Name the looping step `count_until_target`.
Once the target is reached, return the final counter value.
expected: ai_evals/fixtures/frontend/flow/expected/test10_while_loop_counter.json
Use a top-level loop step named `count_until_target`.
Inside it, use a single step named `increment_counter` that increments the current counter.
The loop should stop once the counter reaches `target`.
After the loop, add a top-level step named `return_final_counter` that returns the last counter value.
validate:
exactTopLevelStepIds:
- count_until_target
- return_final_counter
topLevelStepOrder:
- count_until_target
- return_final_counter
topLevelStepTypes:
- id: count_until_target
type: whileloopflow
- id: return_final_counter
type: rawscript
moduleRules:
- id: count_until_target
hasStopAfterIf: true
hasStopAfterAllItersIf: false
exactImmediateChildStepIds:
- increment_counter
immediateChildStepTypes:
- id: increment_counter
type: rawscript
moduleFieldRules:
- id: count_until_target
path: stop_after_if.expr
equals: result >= flow_input.target
judgeChecklist:
- "the input schema includes a number field named `target`"
- "the looping step is named `count_until_target`"
- the flow keeps incrementing a counter until the target is reached
- the final output returns the final counter value
- "the top-level while loop step is named `count_until_target`"
- "`count_until_target` contains a single increment step named `increment_counter`"
- "`count_until_target` uses module-level `stop_after_if` to stop when the counter reaches `target`"
- "`increment_counter` uses `flow_input.iter.value` or an equivalent loop-state expression and falls back to `0` on the first iteration"
- "`return_final_counter` returns the final counter value"
- id: flow-test11-preprocessor-and-failure-handler
prompt: |-
@@ -242,8 +308,16 @@
Add an approval step named `request_approval` that pauses the flow and asks the approver for a comment.
One approval should be enough to continue.
After approval, add a final step named `finalize_purchase` that returns an approved status object.
expected: ai_evals/fixtures/frontend/flow/expected/test12_approval_step.json
validate:
topLevelStepIds:
- request_approval
- finalize_purchase
topLevelStepOrder:
- request_approval
- finalize_purchase
topLevelStepTypes:
- id: finalize_purchase
type: rawscript
schemaRequiredPaths:
- requester_email
- amount

View File

@@ -34,6 +34,8 @@ export async function judgeOutput(input: {
"If a checklist is provided, treat it as the explicit acceptance criteria for this case.",
"Be strict about missing requested functionality.",
"When the prompt wording is ambiguous, prefer the checklist over inferred structural requirements.",
"Do not invent additional Windmill-specific constraints that are not explicit in the prompt, checklist, or expected state.",
"Do not lower the score just because the output uses a different but valid Windmill idiom, naming choice, or equivalent field shape.",
"Do not require exact ids, exact topology, or exact field names unless the prompt, checklist, or expected state clearly requires them.",
`Always respond by calling the ${JUDGE_TOOL_NAME} tool exactly once.`,
].join("\n\n");

View File

@@ -16,6 +16,39 @@ export interface FlowValidationSpec {
schemaAnyOf?: Array<{
requiredPaths: string[];
}>;
exactTopLevelStepIds?: string[];
topLevelStepIds?: string[];
topLevelStepOrder?: string[];
topLevelStepTypeCountsAtLeast?: Array<{
type: string;
count: number;
}>;
topLevelStepTypes?: Array<{
id: string;
type: string;
}>;
moduleRules?: Array<{
id: string;
hasStopAfterIf?: boolean;
hasStopAfterAllItersIf?: boolean;
immediateChildStepIds?: string[];
exactImmediateChildStepIds?: string[];
immediateChildStepTypes?: Array<{
id: string;
type: string;
}>;
requiredInputTransforms?: Array<{
type?: string;
expr?: string;
exprAnyOf?: string[];
value?: string | number | boolean | null;
}>;
}>;
moduleFieldRules?: Array<{
id: string;
path: string;
equals: string | number | boolean | null;
}>;
resolveResultsRefs?: boolean;
requireSpecialModules?: Array<"preprocessor_module" | "failure_module">;
requireSuspendSteps?: Array<{

View File

@@ -490,6 +490,181 @@ function validateFlowRequirements(
validate: FlowValidationSpec
): BenchmarkCheck[] {
const checks: BenchmarkCheck[] = [];
const actualTopLevelModules = getFlowModules(flow);
const actualIds = actualTopLevelModules
.map((module) => (typeof module.id === "string" ? module.id : null))
.filter((id): id is string => Boolean(id));
if (validate.exactTopLevelStepIds && validate.exactTopLevelStepIds.length > 0) {
checks.push(
check(
"flow top-level step ids match exactly",
stringArraysEqual(actualIds, validate.exactTopLevelStepIds),
`expected ids: ${validate.exactTopLevelStepIds.join(", ")}; actual ids: ${actualIds.join(", ")}`
)
);
}
if (validate.topLevelStepIds && validate.topLevelStepIds.length > 0) {
checks.push(
check(
"flow includes required top-level step ids",
validate.topLevelStepIds.every((id) => actualIds.includes(id)),
`required ids: ${validate.topLevelStepIds.join(", ")}; actual ids: ${actualIds.join(", ")}`
)
);
}
if (validate.topLevelStepOrder && validate.topLevelStepOrder.length > 0) {
checks.push(
check(
"flow preserves required top-level step order",
preservesRelativeOrder(actualIds, validate.topLevelStepOrder),
`required order: ${validate.topLevelStepOrder.join(" -> ")}; actual ids: ${actualIds.join(" -> ")}`
)
);
}
for (const typeRequirement of validate.topLevelStepTypeCountsAtLeast ?? []) {
const actualCount = actualTopLevelModules.filter(
(module) => getModuleType(module) === typeRequirement.type
).length;
checks.push(
check(
`flow includes at least ${typeRequirement.count} top-level ${typeRequirement.type} step${typeRequirement.count === 1 ? "" : "s"}`,
actualCount >= typeRequirement.count,
`expected at least ${typeRequirement.count}, got ${actualCount}`
)
);
}
for (const requiredStep of validate.topLevelStepTypes ?? []) {
const module = actualTopLevelModules.find((candidate) => candidate.id === requiredStep.id);
checks.push(check(`${requiredStep.id} step exists`, Boolean(module)));
if (!module) {
continue;
}
checks.push(
check(
`${requiredStep.id} type matches required`,
getModuleType(module) === requiredStep.type,
`expected ${requiredStep.type}, got ${getModuleType(module) ?? "(missing)"}`
)
);
}
for (const moduleRule of validate.moduleRules ?? []) {
const module = findFlowModuleById(flow, moduleRule.id);
checks.push(check(`${moduleRule.id} module exists for rule validation`, Boolean(module)));
if (!module) {
continue;
}
if (moduleRule.hasStopAfterIf !== undefined) {
checks.push(
check(
`${moduleRule.id} stop_after_if presence matches required shape`,
hasStopAfterIf(module) === moduleRule.hasStopAfterIf,
`expected stop_after_if=${moduleRule.hasStopAfterIf}, got ${hasStopAfterIf(module)}`
)
);
}
if (moduleRule.hasStopAfterAllItersIf !== undefined) {
checks.push(
check(
`${moduleRule.id} stop_after_all_iters_if presence matches required shape`,
hasStopAfterAllItersIf(module) === moduleRule.hasStopAfterAllItersIf,
`expected stop_after_all_iters_if=${moduleRule.hasStopAfterAllItersIf}, got ${hasStopAfterAllItersIf(module)}`
)
);
}
const immediateChildren = getImmediateNestedModules(module);
const childIds = immediateChildren
.map((child) => (typeof child.id === "string" ? child.id : null))
.filter((id): id is string => Boolean(id));
if (moduleRule.immediateChildStepIds && moduleRule.immediateChildStepIds.length > 0) {
checks.push(
check(
`${moduleRule.id} includes required immediate child steps`,
moduleRule.immediateChildStepIds.every((id) => childIds.includes(id)),
`required child ids: ${moduleRule.immediateChildStepIds.join(", ")}; actual child ids: ${childIds.join(", ")}`
)
);
}
if (moduleRule.exactImmediateChildStepIds && moduleRule.exactImmediateChildStepIds.length > 0) {
checks.push(
check(
`${moduleRule.id} immediate child steps match exactly`,
stringArraysEqual(childIds, moduleRule.exactImmediateChildStepIds),
`expected child ids: ${moduleRule.exactImmediateChildStepIds.join(", ")}; actual child ids: ${childIds.join(", ")}`
)
);
}
for (const requiredChild of moduleRule.immediateChildStepTypes ?? []) {
const child = immediateChildren.find((candidate) => candidate.id === requiredChild.id);
checks.push(check(`${moduleRule.id}.${requiredChild.id} child step exists`, Boolean(child)));
if (!child) {
continue;
}
checks.push(
check(
`${moduleRule.id}.${requiredChild.id} child type matches required`,
getModuleType(child) === requiredChild.type,
`expected ${requiredChild.type}, got ${getModuleType(child) ?? "(missing)"}`
)
);
}
const inputTransforms = getInputTransformRecords(module);
for (const requiredTransform of moduleRule.requiredInputTransforms ?? []) {
const matchedTransform = inputTransforms.find((transform) =>
matchesRequiredInputTransform(transform, requiredTransform)
);
const expectedParts = [
requiredTransform.type ? `type=${JSON.stringify(requiredTransform.type)}` : null,
requiredTransform.expr ? `expr=${JSON.stringify(requiredTransform.expr)}` : null,
requiredTransform.exprAnyOf && requiredTransform.exprAnyOf.length > 0
? `exprAnyOf=${JSON.stringify(requiredTransform.exprAnyOf)}`
: null,
requiredTransform.value !== undefined
? `value=${JSON.stringify(requiredTransform.value)}`
: null,
].filter(Boolean);
checks.push(
check(
`${moduleRule.id} includes required input transform (${expectedParts.join(", ")})`,
Boolean(matchedTransform),
`available transforms: ${summarizeInputTransforms(inputTransforms)}`
)
);
}
}
for (const fieldRule of validate.moduleFieldRules ?? []) {
const module = findFlowModuleById(flow, fieldRule.id);
checks.push(check(`${fieldRule.id} module exists for field validation`, Boolean(module)));
if (!module) {
continue;
}
const actualValue = getValueAtPath(module, fieldRule.path);
checks.push(
check(
`${fieldRule.id}.${fieldRule.path} matches required value`,
valuesEqualForValidation(actualValue, fieldRule.equals),
`expected ${JSON.stringify(fieldRule.equals)}, got ${JSON.stringify(actualValue)}`
)
);
}
for (const requiredPath of validate.schemaRequiredPaths ?? []) {
checks.push(
@@ -639,6 +814,30 @@ function preservesRelativeOrder(actualIds: string[], expectedIds: string[]): boo
return false;
}
/**
 * Reports whether two string arrays hold the same values at the same positions.
 *
 * The comparison is order-sensitive: `["a", "b"]` and `["b", "a"]` are not
 * equal. Use a different helper for order-insensitive checks.
 *
 * @param left - First array to compare.
 * @param right - Second array to compare.
 * @returns `true` when both arrays have the same length and identical entries
 *   at every index.
 */
function stringArraysEqual(left: string[], right: string[]): boolean {
  const sameLength = left.length === right.length;
  // A single positional mismatch is enough to declare the arrays different.
  return sameLength && !left.some((entry, position) => entry !== right[position]);
}
/**
 * Compares an observed value with the value a validation rule expects.
 *
 * When both sides are strings they are compared after being passed through
 * `normalizeInlineExpression`; every other combination uses strict equality,
 * so `undefined` never equals `null` and no type coercion takes place.
 *
 * @param actual - Value read from the flow under validation.
 * @param expected - Value required by the validation rule.
 * @returns `true` when the two values are considered equal.
 */
function valuesEqualForValidation(
  actual: unknown,
  expected: string | number | boolean | null
): boolean {
  // Anything that is not a string/string pair falls back to identity comparison.
  if (typeof actual !== "string" || typeof expected !== "string") {
    return actual === expected;
  }

  const normalizedActual = normalizeInlineExpression(actual);
  const normalizedExpected = normalizeInlineExpression(expected);
  return normalizedActual === normalizedExpected;
}
/**
 * Canonicalizes whitespace in an expression string: every run of whitespace
 * becomes a single space and leading/trailing whitespace is removed.
 *
 * Whitespace is only collapsed, never inserted or removed between tokens, so
 * `a>=b` and `a >= b` still normalize to different strings.
 *
 * @param value - Expression text to normalize.
 * @returns The whitespace-normalized expression.
 */
function normalizeInlineExpression(value: string): string {
  // Splitting on whitespace runs and re-joining with one space collapses each run.
  const collapsed = value.split(/\s+/).join(" ");
  return collapsed.trim();
}
function collectUnresolvedResultsRefs(flow: FlowState): string[] {
const unresolved = new Set<string>();
validateModuleSequence(getFlowModules(flow), new Map<string, Record<string, unknown>>(), unresolved);
@@ -873,18 +1072,24 @@ function getInlineScriptPlaceholderModuleIds(flow: FlowState): string[] {
}
/**
 * Lists the ids of the modules returned by `getImmediateNestedModules` for the
 * given module.
 *
 * Children whose `id` is not a string are left out of the result.
 *
 * @param module - Flow module record to inspect.
 * @returns String ids of the nested modules, in the order the helper yields them.
 */
function getImmediateNestedModuleIds(module: Record<string, unknown>): string[] {
  // flatMap with a zero-or-one element array filters and maps in a single pass.
  return getImmediateNestedModules(module).flatMap((child) =>
    typeof child.id === "string" ? [child.id] : []
  );
}
function getImmediateNestedModules(module: Record<string, unknown>): Array<Record<string, unknown>> {
const nested: Array<Record<string, unknown>> = [];
const value = isObjectRecord(module.value) ? module.value : null;
if (!value) {
return ids;
return nested;
}
if (Array.isArray(value.modules)) {
ids.push(...asModuleArray(value.modules).flatMap((child) => (typeof child.id === "string" ? [child.id] : [])));
nested.push(...asModuleArray(value.modules));
}
if (Array.isArray(value.default)) {
ids.push(...asModuleArray(value.default).flatMap((child) => (typeof child.id === "string" ? [child.id] : [])));
nested.push(...asModuleArray(value.default));
}
if (Array.isArray(value.branches)) {
@@ -892,13 +1097,11 @@ function getImmediateNestedModuleIds(module: Record<string, unknown>): string[]
if (!isObjectRecord(branch) || !Array.isArray(branch.modules)) {
continue;
}
ids.push(
...asModuleArray(branch.modules).flatMap((child) => (typeof child.id === "string" ? [child.id] : []))
);
nested.push(...asModuleArray(branch.modules));
}
}
return ids;
return nested;
}
function getModuleCode(module: Record<string, unknown>): string | null {
@@ -906,6 +1109,79 @@ function getModuleCode(module: Record<string, unknown>): string | null {
return typeof value?.content === "string" ? value.content : null;
}
/**
 * Reads a nested value from `record` by following a dot-separated path.
 *
 * Empty path segments (for example from a leading, trailing, or doubled dot)
 * are ignored, so an empty path returns `record` itself.
 *
 * @param record - Root object to read from.
 * @param dottedPath - Dot-separated property path, e.g. `stop_after_if.expr`.
 * @returns The value at the path, or `undefined` as soon as an intermediate
 *   value is not accepted by `isObjectRecord`.
 */
function getValueAtPath(record: Record<string, unknown>, dottedPath: string): unknown {
  const keys = dottedPath.split(".").filter((part) => part.length > 0);
  // Once the cursor stops being an object record, every later step stays undefined.
  return keys.reduce<unknown>(
    (cursor, key) => (isObjectRecord(cursor) ? cursor[key] : undefined),
    record
  );
}
/**
 * Collects the entries stored under `module.value.input_transforms`.
 *
 * @param module - Flow module record to inspect.
 * @returns The values of `input_transforms` that `isObjectRecord` accepts, or an
 *   empty array when `module.value` or its `input_transforms` is not an object
 *   record.
 */
function getInputTransformRecords(module: Record<string, unknown>): Array<Record<string, unknown>> {
  const moduleValue = module.value;
  if (!isObjectRecord(moduleValue)) {
    return [];
  }

  const transforms = moduleValue.input_transforms;
  if (!isObjectRecord(transforms)) {
    return [];
  }

  // Only the transform definitions matter here; the input names (keys) are dropped.
  return Object.values(transforms).filter(isObjectRecord);
}
/**
 * Checks whether one input transform satisfies every constraint of a required
 * transform description.
 *
 * Constraints left `undefined` are ignored. Each present constraint is compared
 * with `valuesEqualForValidation`. `exprAnyOf` requires `actual.expr` to be a
 * string equal to at least one candidate, so an empty `exprAnyOf` array can
 * never match. A `value` of `null` is a real constraint, not an omitted one.
 *
 * @param actual - Input transform record taken from the flow.
 * @param required - Constraints the transform must satisfy.
 * @returns `true` when all present constraints hold.
 */
function matchesRequiredInputTransform(
  actual: Record<string, unknown>,
  required: {
    type?: string;
    expr?: string;
    exprAnyOf?: string[];
    value?: string | number | boolean | null;
  }
): boolean {
  const { type, expr, exprAnyOf, value } = required;
  const actualExpr = actual.expr;

  const typeMatches = type === undefined || valuesEqualForValidation(actual.type, type);
  const exprMatches = expr === undefined || valuesEqualForValidation(actualExpr, expr);
  const anyExprMatches =
    exprAnyOf === undefined ||
    (typeof actualExpr === "string" &&
      exprAnyOf.some((candidate) => valuesEqualForValidation(actualExpr, candidate)));
  const valueMatches = value === undefined || valuesEqualForValidation(actual.value, value);

  return typeMatches && exprMatches && anyExprMatches && valueMatches;
}
/**
 * Renders input transforms as a compact, human-readable string for check
 * messages.
 *
 * Each transform is serialized as JSON containing only its `type`, `expr`, and
 * `value` fields (fields that are `undefined` are omitted by `JSON.stringify`),
 * and the entries are joined with `"; "`.
 *
 * @param transforms - Input transform records to describe.
 * @returns The joined summary, or `"(none)"` when there are no transforms.
 */
function summarizeInputTransforms(transforms: Array<Record<string, unknown>>): string {
  const summaries: string[] = [];
  for (const { type, expr, value } of transforms) {
    summaries.push(JSON.stringify({ type, expr, value }));
  }
  return summaries.length > 0 ? summaries.join("; ") : "(none)";
}
/**
 * Narrows an untyped array to the entries that `isObjectRecord` accepts.
 *
 * @param value - Array of unknown entries.
 * @returns A new array holding only the accepted entries, in original order.
 */
function asModuleArray(value: unknown[]): Array<Record<string, unknown>> {
  const recordEntries = value.filter(isObjectRecord);
  return recordEntries;
}
@@ -950,6 +1226,14 @@ function hasSuspendConfig(module: Record<string, unknown>): boolean {
return typeof module.suspend === "object" && module.suspend !== null;
}
/**
 * Reports whether the module carries a module-level `stop_after_if` entry.
 *
 * @param module - Flow module record to inspect.
 * @returns `true` when `module.stop_after_if` is accepted by `isObjectRecord`.
 */
function hasStopAfterIf(module: Record<string, unknown>): boolean {
  const { stop_after_if: stopCondition } = module;
  return isObjectRecord(stopCondition);
}
/**
 * Reports whether the module carries a module-level `stop_after_all_iters_if`
 * entry.
 *
 * @param module - Flow module record to inspect.
 * @returns `true` when `module.stop_after_all_iters_if` is accepted by
 *   `isObjectRecord`.
 */
function hasStopAfterAllItersIf(module: Record<string, unknown>): boolean {
  const { stop_after_all_iters_if: stopCondition } = module;
  return isObjectRecord(stopCondition);
}
function getSuspendRequiredEvents(module: Record<string, unknown>): number | null {
const suspend = isObjectRecord(module.suspend) ? module.suspend : null;
return typeof suspend?.required_events === "number" ? suspend.required_events : null;

View File

@@ -4,11 +4,25 @@
{
"id": "count_until_target",
"value": {
"type": "whileloopflow"
"type": "whileloopflow",
"skip_failures": false,
"modules": [
{
"id": "increment_counter",
"value": {
"type": "rawscript",
"language": "bun"
}
}
]
},
"stop_after_if": {
"expr": "result >= flow_input.target",
"skip_if_stopped": false
}
},
{
"id": "return_final_count",
"id": "return_final_counter",
"value": {
"type": "rawscript"
}
@@ -25,6 +39,9 @@
},
"required": [
"target"
],
"order": [
"target"
]
}
}

View File

@@ -0,0 +1,3 @@
Recorded history rows are anchored to the benchmark-definition commit used for the run.
That means `gitSha` points to the commit whose prompts, evaluators, and fixtures produced the recorded result. A later commit may do nothing more than add the new JSONL row to git history, without changing the benchmark itself.

View File

@@ -1,3 +1,15 @@
{"createdAt":"2026-04-10T14:25:16.664Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"flow","runs":1,"runModel":"anthropic:claude-haiku-4-5-20251001","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":8,"passRate":0.6153846153846154,"averageDurationMs":33424.692307692305,"averageJudgeScore":82.61538461538461,"averageTokenUsagePerAttempt":{"prompt":131901,"completion":3121.230769230769,"total":135022.23076923078},"failedCaseIds":["flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16943,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":126615,"completion":839,"total":127454}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15220,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":75614,"completion":805,"total":76419}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15699,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":76182,"completion":887,"total":77069}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21605,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":62230,"completion":1509,"total":63739}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":47228,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":143511,"completion":5443,"total":148954}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":81870,"averageJudgeScore":92,"averageTokenUsagePerAttempt":{"prompt":194542,"completion":12409,"total":206951}},{"id":"flow-test6-ai-agent-t
ools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":51878,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":142071,"completion":5720,"total":147791}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":49113,"averageJudgeScore":42,"averageTokenUsagePerAttempt":{"prompt":318525,"completion":2702,"total":321227}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":18244,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":78441,"completion":979,"total":79420}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":49485,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":135237,"completion":5467,"total":140704}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":21210,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":127844,"completion":1179,"total":129023}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":25142,"averageJudgeScore":42,"averageTokenUsagePerAttempt":{"prompt":128648,"completion":1337,"total":129985}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":20884,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":105253,"completion":1300,"total":106553}}]}
{"createdAt":"2026-04-10T14:57:17.513Z","gitSha":"2a58402cfc5c320748839e92b51a1291b937bf26","mode":"flow","runs":1,"runModel":"anthropic:claude-opus-4-6","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":8,"passRate":0.6153846153846154,"averageDurationMs":58074.53846153846,"averageJudgeScore":87.53846153846153,"averageTokenUsagePerAttempt":{"prompt":125452.76923076923,"completion":2957.769230769231,"total":128410.53846153847},"failedCaseIds":["flow-test4-order-processing-loop","flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":26967,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":103796,"completion":634,"total":104430}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29009,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":75507,"completion":743,"total":76250}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":26828,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":76172,"completion":807,"total":76979}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":44418,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":130440,"completion":1787,"total":132227}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":82185,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":116133,"completion":4905,"total":121038}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":110344,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":122092,"completion":6980,"total":129072}},{"id":"flow-test6-ai
-agent-tools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":119901,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":154916,"completion":8908,"total":163824}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":44333,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":109935,"completion":1536,"total":111471}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":54247,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":136872,"completion":2638,"total":139510}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":63274,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":137794,"completion":3686,"total":141480}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":38813,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":105075,"completion":1157,"total":106232}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":77267,"averageJudgeScore":52,"averageTokenUsagePerAttempt":{"prompt":256547,"completion":3398,"total":259945}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":37383,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":105607,"completion":1272,"total":106879}}]}
{"createdAt":"2026-04-10T14:29:52.249Z","gitSha":"8f8b487be517a0bdd318c36857c1d46d5ab0723a","mode":"flow","runs":1,"runModel":"openai:gpt-4o","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":6,"passRate":0.46153846153846156,"averageDurationMs":29841.53846153846,"averageJudgeScore":68.46153846153847,"averageTokenUsagePerAttempt":{"prompt":72815.92307692308,"completion":770.7692307692307,"total":73586.69230769231},"failedCaseIds":["flow-test5-parallel-data-pipeline","flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler","flow-test12-approval-step"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":20059,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":64091,"completion":265,"total":64356}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":20728,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":46594,"completion":270,"total":46864}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21533,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":46859,"completion":232,"total":47091}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29004,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":64593,"completion":568,"total":65161}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":36250,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":66346,"completion":1259,"total":67605}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":46151,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":104676,"completion":1698
,"total":106374}},{"id":"flow-test6-ai-agent-tools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":72403,"averageJudgeScore":62,"averageTokenUsagePerAttempt":{"prompt":105280,"completion":2216,"total":107496}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":41599,"averageJudgeScore":20,"averageTokenUsagePerAttempt":{"prompt":103053,"completion":707,"total":103760}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":23352,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":97955,"completion":468,"total":98423}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":19341,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":12254,"completion":1057,"total":13311}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":16143,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":64480,"completion":445,"total":64925}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":24231,"averageJudgeScore":52,"averageTokenUsagePerAttempt":{"prompt":106068,"completion":472,"total":106540}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":17146,"averageJudgeScore":30,"averageTokenUsagePerAttempt":{"prompt":64358,"completion":363,"total":64721}}]}
{"createdAt":"2026-04-13T16:38:05.547Z","gitSha":"3f5841f84d878cd3f43c435fa237d3f0c2265fb9","mode":"flow","runs":1,"runModel":"anthropic:claude-haiku-4-5-20251001","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":8,"passRate":0.6153846153846154,"averageDurationMs":28942.846153846152,"averageJudgeScore":83.46153846153847,"averageTokenUsagePerAttempt":{"prompt":110218.15384615384,"completion":2819,"total":113037.15384615384},"failedCaseIds":["flow-test4-order-processing-loop","flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15019,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":103955,"completion":771,"total":104726}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15667,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":75649,"completion":803,"total":76452}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":13990,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":76215,"completion":877,"total":77092}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":17999,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":46494,"completion":1476,"total":47970}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":44637,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":142164,"completion":4784,"total":146948}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":66613,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":158640,"completion":10231,"total":168871}},{"id":"flow-test6-ai-agen
t-tools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":59129,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":149720,"completion":7633,"total":157353}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":23655,"averageJudgeScore":62,"averageTokenUsagePerAttempt":{"prompt":124117,"completion":1380,"total":125497}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":17782,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":78450,"completion":958,"total":79408}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":30100,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":90009,"completion":3124,"total":93133}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":24845,"averageJudgeScore":85,"averageTokenUsagePerAttempt":{"prompt":153396,"completion":1967,"total":155363}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":24102,"averageJudgeScore":35,"averageTokenUsagePerAttempt":{"prompt":128760,"completion":1351,"total":130111}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":22719,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":105267,"completion":1292,"total":106559}}]}
{"createdAt":"2026-04-13T16:41:07.631Z","gitSha":"3f5841f84d878cd3f43c435fa237d3f0c2265fb9","mode":"flow","runs":1,"runModel":"anthropic:claude-opus-4-6","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":9,"passRate":0.6923076923076923,"averageDurationMs":51699.38461538462,"averageJudgeScore":84.3076923076923,"averageTokenUsagePerAttempt":{"prompt":126038.92307692308,"completion":2519.6923076923076,"total":128558.61538461539},"failedCaseIds":["flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":25781,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":103871,"completion":637,"total":104508}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":21895,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":75587,"completion":716,"total":76303}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":24773,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":76207,"completion":790,"total":76997}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":41700,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":130588,"completion":1785,"total":132373}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":79107,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":143173,"completion":4977,"total":148150}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":89071,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":118418,"completion":5658,"total":124076}},{"id":"flow-test6-ai-agent-tools","attemptCount":1,"pass
edAttempts":0,"passRate":0,"averageDurationMs":83867,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":138732,"completion":4745,"total":143477}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":44256,"averageJudgeScore":30,"averageTokenUsagePerAttempt":{"prompt":111016,"completion":1873,"total":112889}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":50962,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":137240,"completion":2722,"total":139962}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":58847,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":137437,"completion":3521,"total":140958}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":38971,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":105189,"completion":1161,"total":106350}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":79582,"averageJudgeScore":52,"averageTokenUsagePerAttempt":{"prompt":256128,"completion":3124,"total":259252}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":33280,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":104920,"completion":1047,"total":105967}}]}
{"createdAt":"2026-04-13T16:42:33.076Z","gitSha":"3f5841f84d878cd3f43c435fa237d3f0c2265fb9","mode":"flow","runs":1,"runModel":"openai:gpt-4o","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":7,"passRate":0.5384615384615384,"averageDurationMs":25127.30769230769,"averageJudgeScore":71.07692307692308,"averageTokenUsagePerAttempt":{"prompt":75554.46153846153,"completion":772.8461538461538,"total":76327.30769230769},"failedCaseIds":["flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler","flow-test12-approval-step"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16276,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":64149,"completion":312,"total":64461}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":13918,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":46634,"completion":270,"total":46904}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":15559,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":46899,"completion":229,"total":47128}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":18332,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":64651,"completion":528,"total":65179}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":35969,"averageJudgeScore":92,"averageTokenUsagePerAttempt":{"prompt":85106,"completion":1226,"total":86332}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":44250,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":120119,"completion":1514,"total":121633}},{"id":"flow-test6-a
i-agent-tools","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":39138,"averageJudgeScore":62,"averageTokenUsagePerAttempt":{"prompt":104858,"completion":2010,"total":106868}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":30801,"averageJudgeScore":20,"averageTokenUsagePerAttempt":{"prompt":140601,"completion":837,"total":141438}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29650,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":84676,"completion":434,"total":85110}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":15278,"averageJudgeScore":0,"averageTokenUsagePerAttempt":{"prompt":12264,"completion":1037,"total":13301}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":18609,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":64538,"completion":447,"total":64985}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":24459,"averageJudgeScore":30,"averageTokenUsagePerAttempt":{"prompt":64752,"completion":522,"total":65274}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":24416,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":82961,"completion":681,"total":83642}}]}
{"createdAt":"2026-04-13T16:44:35.781Z","gitSha":"3f5841f84d878cd3f43c435fa237d3f0c2265fb9","mode":"flow","runs":1,"runModel":"googleai:gemini-3-flash-preview","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":13,"passedAttempts":8,"passRate":0.6153846153846154,"averageDurationMs":37479.307692307695,"averageJudgeScore":85,"averageTokenUsagePerAttempt":{"prompt":186704.3076923077,"completion":1286.076923076923,"total":189682.92307692306},"failedCaseIds":["flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor","flow-test10-while-loop-counter","flow-test11-preprocessor-and-failure-handler"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":17390,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":91200,"completion":368,"total":92084}},{"id":"flow-test1-reuse-existing-script","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":16881,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":65540,"completion":414,"total":66412}},{"id":"flow-test2-call-existing-subflow","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":17296,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":66397,"completion":482,"total":67455}},{"id":"flow-test3-branchone-routing","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29437,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":174842,"completion":1107,"total":176621}},{"id":"flow-test4-order-processing-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":46387,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":233010,"completion":1931,"total":236992}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":47883,"averageJudgeScore":88,"averageTokenUsagePerAttempt":{"prompt":300741,"completion":2353,"total":304779}},{"id":"flow-test6-ai-agent-tools","a
ttemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":51830,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":255392,"completion":2178,"total":259675}},{"id":"flow-test7-simple-modification","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":43691,"averageJudgeScore":62,"averageTokenUsagePerAttempt":{"prompt":167159,"completion":1056,"total":171042}},{"id":"flow-test8-branching-in-loop","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":38113,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":222138,"completion":1578,"total":225135}},{"id":"flow-test9-parallel-refactor","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":59161,"averageJudgeScore":78,"averageTokenUsagePerAttempt":{"prompt":342540,"completion":2071,"total":347200}},{"id":"flow-test10-while-loop-counter","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":41602,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":146820,"completion":755,"total":151064}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":1,"passedAttempts":0,"passRate":0,"averageDurationMs":48067,"averageJudgeScore":52,"averageTokenUsagePerAttempt":{"prompt":245838,"completion":1399,"total":249623}},{"id":"flow-test12-approval-step","attemptCount":1,"passedAttempts":1,"passRate":1,"averageDurationMs":29493,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":115539,"completion":1027,"total":117796}}]}
{"createdAt":"2026-04-15T12:47:42.333Z","gitSha":"fada91cb74cbb0d8c4191e88c9c782661fa79e0c","mode":"flow","runs":2,"runModel":"anthropic:claude-haiku-4-5-20251001","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":26,"passedAttempts":22,"passRate":0.8461538461538461,"averageDurationMs":30184.96153846154,"averageJudgeScore":90.23076923076923,"averageTokenUsagePerAttempt":{"prompt":131953,"completion":3005.4615384615386,"total":134958.46153846153},"failedCaseIds":["flow-test6-ai-agent-tools","flow-test9-parallel-refactor"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":17632.5,"averageJudgeScore":99,"averageTokenUsagePerAttempt":{"prompt":119410.5,"completion":785,"total":120195.5}},{"id":"flow-test1-reuse-existing-script","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":15469,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":91090,"completion":796,"total":91886}},{"id":"flow-test2-call-existing-subflow","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":14306.5,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":101415.5,"completion":1010,"total":102425.5}},{"id":"flow-test3-branchone-routing","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":23193,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":76384,"completion":2375.5,"total":78759.5}},{"id":"flow-test4-order-processing-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":44973,"averageJudgeScore":92.5,"averageTokenUsagePerAttempt":{"prompt":189119,"completion":4639.5,"total":193758.5}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":63343.5,"averageJudgeScore":94.5,"averageTokenUsagePerAttempt":{"prompt":171440.5,"completion":8551,"total":179991.5}},{"id":"flow-test6-ai-agent-tools","attemptCount":2,"passedAttempts":0,"passRate":0,"averageDurationMs":64051
,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":200807,"completion":8626,"total":209433}},{"id":"flow-test7-simple-modification","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":20897,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":124223,"completion":1363,"total":125586}},{"id":"flow-test8-branching-in-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":26266.5,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":99486,"completion":3338.5,"total":102824.5}},{"id":"flow-test9-parallel-refactor","attemptCount":2,"passedAttempts":0,"passRate":0,"averageDurationMs":34616.5,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":139827,"completion":3639.5,"total":143466.5}},{"id":"flow-test10-while-loop-counter","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":25068,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":134504.5,"completion":1472,"total":135976.5}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":22762,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":147320,"completion":1372,"total":148692}},{"id":"flow-test12-approval-step","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":19826,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":120362,"completion":1103,"total":121465}}]}
{"createdAt":"2026-04-15T12:59:23.430Z","gitSha":"fada91cb74cbb0d8c4191e88c9c782661fa79e0c","mode":"flow","runs":2,"runModel":"openai:gpt-4o","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":26,"passedAttempts":17,"passRate":0.6538461538461539,"averageDurationMs":22773.73076923077,"averageJudgeScore":74.96153846153847,"averageTokenUsagePerAttempt":{"prompt":80958.57692307692,"completion":794,"total":81752.57692307692},"failedCaseIds":["flow-test4-order-processing-loop","flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor","flow-test10-while-loop-counter"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":21414.5,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":74020,"completion":278,"total":74298}},{"id":"flow-test1-reuse-existing-script","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":11469,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":56486,"completion":264,"total":56750}},{"id":"flow-test2-call-existing-subflow","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":11158,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":56791,"completion":271.5,"total":57062.5}},{"id":"flow-test3-branchone-routing","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":15699.5,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":74511,"completion":517,"total":75028}},{"id":"flow-test4-order-processing-loop","attemptCount":2,"passedAttempts":1,"passRate":0.5,"averageDurationMs":22957.5,"averageJudgeScore":67,"averageTokenUsagePerAttempt":{"prompt":65343,"completion":1127.5,"total":66470.5}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":33018.5,"averageJudgeScore":87,"averageTokenUsagePerAttempt":{"prompt":76464,"completion":1572,"total":78036}},{"id":"flow-test6-ai-agent-tools","attemptCount":2,"pass
edAttempts":0,"passRate":0,"averageDurationMs":37364,"averageJudgeScore":67,"averageTokenUsagePerAttempt":{"prompt":130732,"completion":2106,"total":132838}},{"id":"flow-test7-simple-modification","attemptCount":2,"passedAttempts":0,"passRate":0,"averageDurationMs":24472.5,"averageJudgeScore":36,"averageTokenUsagePerAttempt":{"prompt":123649,"completion":896,"total":124545}},{"id":"flow-test8-branching-in-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":23635.5,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":104919,"completion":460.5,"total":105379.5}},{"id":"flow-test9-parallel-refactor","attemptCount":2,"passedAttempts":0,"passRate":0,"averageDurationMs":28727,"averageJudgeScore":15,"averageTokenUsagePerAttempt":{"prompt":48189.5,"completion":1501.5,"total":49691}},{"id":"flow-test10-while-loop-counter","attemptCount":2,"passedAttempts":0,"passRate":0,"averageDurationMs":22109,"averageJudgeScore":56,"averageTokenUsagePerAttempt":{"prompt":84576.5,"completion":403,"total":84979.5}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":25620,"averageJudgeScore":88.5,"averageTokenUsagePerAttempt":{"prompt":105479.5,"completion":500.5,"total":105980}},{"id":"flow-test12-approval-step","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":18413.5,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":51301,"completion":424.5,"total":51725.5}}]}
{"createdAt":"2026-04-15T13:04:53.138Z","gitSha":"fada91cb74cbb0d8c4191e88c9c782661fa79e0c","mode":"flow","runs":2,"runModel":"anthropic:claude-opus-4-6","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":26,"passedAttempts":18,"passRate":0.6923076923076923,"averageDurationMs":53728.153846153844,"averageJudgeScore":90.46153846153847,"averageTokenUsagePerAttempt":{"prompt":136217.65384615384,"completion":2690.576923076923,"total":138908.23076923078},"failedCaseIds":["flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor","flow-test10-while-loop-counter","flow-test12-approval-step"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":26766.5,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":119291.5,"completion":619.5,"total":119911}},{"id":"flow-test1-reuse-existing-script","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":25131.5,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":90983.5,"completion":746.5,"total":91730}},{"id":"flow-test2-call-existing-subflow","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":25598.5,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":91533,"completion":718.5,"total":92251.5}},{"id":"flow-test3-branchone-routing","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":42976.5,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":149081,"completion":1746,"total":150827}},{"id":"flow-test4-order-processing-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":82068,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":160765,"completion":4723,"total":165488}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":107520.5,"averageJudgeScore":96,"averageTokenUsagePerAttempt":{"prompt":137528,"completion":6918,"total":144446}},{"id":"flow-test6-ai
-agent-tools","attemptCount":2,"passedAttempts":1,"passRate":0.5,"averageDurationMs":117563,"averageJudgeScore":77,"averageTokenUsagePerAttempt":{"prompt":172375,"completion":8691.5,"total":181066.5}},{"id":"flow-test7-simple-modification","attemptCount":2,"passedAttempts":1,"passRate":0.5,"averageDurationMs":40348,"averageJudgeScore":77,"averageTokenUsagePerAttempt":{"prompt":125491.5,"completion":1557,"total":127048.5}},{"id":"flow-test8-branching-in-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":52332.5,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":155749,"completion":2693,"total":158442}},{"id":"flow-test9-parallel-refactor","attemptCount":2,"passedAttempts":0,"passRate":0,"averageDurationMs":58810,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":154580,"completion":3080,"total":157660}},{"id":"flow-test10-while-loop-counter","attemptCount":2,"passedAttempts":0,"passRate":0,"averageDurationMs":39319.5,"averageJudgeScore":72,"averageTokenUsagePerAttempt":{"prompt":120779,"completion":1131.5,"total":121910.5}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":43657.5,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":172242,"completion":1277,"total":173519}},{"id":"flow-test12-approval-step","attemptCount":2,"passedAttempts":0,"passRate":0,"averageDurationMs":36374,"averageJudgeScore":75,"averageTokenUsagePerAttempt":{"prompt":120431,"completion":1076,"total":121507}}]}
{"createdAt":"2026-04-15T13:09:23.557Z","gitSha":"fada91cb74cbb0d8c4191e88c9c782661fa79e0c","mode":"flow","runs":2,"runModel":"googleai:gemini-3-flash-preview","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":26,"passedAttempts":23,"passRate":0.8846153846153846,"averageDurationMs":38015.153846153844,"averageJudgeScore":92.61538461538461,"averageTokenUsagePerAttempt":{"prompt":213122.73076923078,"completion":1306.6923076923076,"total":216288.61538461538},"failedCaseIds":["flow-test7-simple-modification","flow-test9-parallel-refactor"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":17852,"averageJudgeScore":97.5,"averageTokenUsagePerAttempt":{"prompt":106013.5,"completion":461,"total":106898.5}},{"id":"flow-test1-reuse-existing-script","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":17556,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":80428.5,"completion":521,"total":81375.5}},{"id":"flow-test2-call-existing-subflow","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":16211,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":80653,"completion":538,"total":81544.5}},{"id":"flow-test3-branchone-routing","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":28206.5,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":195088,"completion":1003.5,"total":196934.5}},{"id":"flow-test4-order-processing-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":49612,"averageJudgeScore":89.5,"averageTokenUsagePerAttempt":{"prompt":285979.5,"completion":2140.5,"total":289883}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":52635,"averageJudgeScore":94.5,"averageTokenUsagePerAttempt":{"prompt":315058,"completion":2118,"total":319111}},{"id":"flow-test6-ai-agent-tools","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDuratio
nMs":55039,"averageJudgeScore":89,"averageTokenUsagePerAttempt":{"prompt":298999.5,"completion":2563,"total":304299}},{"id":"flow-test7-simple-modification","attemptCount":2,"passedAttempts":1,"passRate":0.5,"averageDurationMs":45571,"averageJudgeScore":77,"averageTokenUsagePerAttempt":{"prompt":177988,"completion":963,"total":182547}},{"id":"flow-test8-branching-in-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":53957,"averageJudgeScore":96,"averageTokenUsagePerAttempt":{"prompt":326580,"completion":1650,"total":331999}},{"id":"flow-test9-parallel-refactor","attemptCount":2,"passedAttempts":0,"passRate":0,"averageDurationMs":78361.5,"averageJudgeScore":93.5,"averageTokenUsagePerAttempt":{"prompt":495491,"completion":2535,"total":503137}},{"id":"flow-test10-while-loop-counter","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":27481,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":132736,"completion":820,"total":134766.5}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":29757.5,"averageJudgeScore":92.5,"averageTokenUsagePerAttempt":{"prompt":168158.5,"completion":1022.5,"total":170345.5}},{"id":"flow-test12-approval-step","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":21957.5,"averageJudgeScore":92.5,"averageTokenUsagePerAttempt":{"prompt":107422,"completion":651.5,"total":108911}}]}
{"createdAt":"2026-04-15T13:56:16.609Z","gitSha":"cc3e17dbc1c204b5d4e30ad449d59e9e7cd0bb89","mode":"flow","runs":2,"runModel":"anthropic:claude-haiku-4-5-20251001","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":26,"passedAttempts":26,"passRate":1,"averageDurationMs":35150.57692307692,"averageJudgeScore":92.07692307692308,"averageTokenUsagePerAttempt":{"prompt":139081.07692307694,"completion":3570.3076923076924,"total":142651.38461538462},"failedCaseIds":[],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":16746.5,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":119410.5,"completion":786.5,"total":120197}},{"id":"flow-test1-reuse-existing-script","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":16781.5,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":91090,"completion":796,"total":91886}},{"id":"flow-test2-call-existing-subflow","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":20842,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":101415.5,"completion":1065.5,"total":102481}},{"id":"flow-test3-branchone-routing","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":28184,"averageJudgeScore":98.5,"averageTokenUsagePerAttempt":{"prompt":76383,"completion":2365.5,"total":78748.5}},{"id":"flow-test4-order-processing-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":48227,"averageJudgeScore":91,"averageTokenUsagePerAttempt":{"prompt":187421,"completion":4314.5,"total":191735.5}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":117878.5,"averageJudgeScore":94.5,"averageTokenUsagePerAttempt":{"prompt":308754.5,"completion":19364.5,"total":328119}},{"id":"flow-test6-ai-agent-tools","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":44483.5,"averageJudgeScore":89,"averageTokenUsagePerAttempt":{"prom
pt":158473.5,"completion":5044,"total":163517.5}},{"id":"flow-test7-simple-modification","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":21374,"averageJudgeScore":92,"averageTokenUsagePerAttempt":{"prompt":124028,"completion":1309,"total":125337}},{"id":"flow-test8-branching-in-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":30584.5,"averageJudgeScore":95,"averageTokenUsagePerAttempt":{"prompt":99486,"completion":3344,"total":102830}},{"id":"flow-test9-parallel-refactor","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":43953,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":153129,"completion":4306,"total":157435}},{"id":"flow-test10-while-loop-counter","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":20196.5,"averageJudgeScore":96,"averageTokenUsagePerAttempt":{"prompt":120701,"completion":1159,"total":121860}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":25325.5,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":147320,"completion":1369,"total":148689}},{"id":"flow-test12-approval-step","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":22381,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":120442,"completion":1190.5,"total":121632.5}}]}
{"createdAt":"2026-04-15T13:59:07.056Z","gitSha":"cc3e17dbc1c204b5d4e30ad449d59e9e7cd0bb89","mode":"flow","runs":2,"runModel":"openai:gpt-4o","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":26,"passedAttempts":21,"passRate":0.8076923076923077,"averageDurationMs":28529.346153846152,"averageJudgeScore":82.65384615384616,"averageTokenUsagePerAttempt":{"prompt":87358.15384615384,"completion":964.4615384615385,"total":88322.61538461539},"failedCaseIds":["flow-test4-order-processing-loop","flow-test6-ai-agent-tools","flow-test7-simple-modification","flow-test9-parallel-refactor"],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":16221,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":74020,"completion":280,"total":74300}},{"id":"flow-test1-reuse-existing-script","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":17431.5,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":56484.5,"completion":257,"total":56741.5}},{"id":"flow-test2-call-existing-subflow","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":14980.5,"averageJudgeScore":97.5,"averageTokenUsagePerAttempt":{"prompt":56751,"completion":230,"total":56981}},{"id":"flow-test3-branchone-routing","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":20897,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":64328,"completion":521,"total":64849}},{"id":"flow-test4-order-processing-loop","attemptCount":2,"passedAttempts":1,"passRate":0.5,"averageDurationMs":61242,"averageJudgeScore":70,"averageTokenUsagePerAttempt":{"prompt":158766.5,"completion":3520,"total":162286.5}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":47899.5,"averageJudgeScore":86.5,"averageTokenUsagePerAttempt":{"prompt":87984.5,"completion":1582.5,"total":89567}},{"id":"flow-test6-ai-agent-tools","attemptCount":2,"passedAttempts"
:1,"passRate":0.5,"averageDurationMs":42154.5,"averageJudgeScore":77,"averageTokenUsagePerAttempt":{"prompt":130936,"completion":2206.5,"total":133142.5}},{"id":"flow-test7-simple-modification","attemptCount":2,"passedAttempts":1,"passRate":0.5,"averageDurationMs":38449.5,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":150313.5,"completion":948,"total":151261.5}},{"id":"flow-test8-branching-in-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":35552.5,"averageJudgeScore":90,"averageTokenUsagePerAttempt":{"prompt":112832,"completion":470.5,"total":113302.5}},{"id":"flow-test9-parallel-refactor","attemptCount":2,"passedAttempts":0,"passRate":0,"averageDurationMs":22728.5,"averageJudgeScore":3.5,"averageTokenUsagePerAttempt":{"prompt":14727,"completion":1063,"total":15790}},{"id":"flow-test10-while-loop-counter","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":19612.5,"averageJudgeScore":93.5,"averageTokenUsagePerAttempt":{"prompt":84800.5,"completion":526.5,"total":85327}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":18568.5,"averageJudgeScore":92,"averageTokenUsagePerAttempt":{"prompt":92412,"completion":507.5,"total":92919.5}},{"id":"flow-test12-approval-step","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":15144,"averageJudgeScore":88.5,"averageTokenUsagePerAttempt":{"prompt":51300.5,"completion":425.5,"total":51726}}]}
{"createdAt":"2026-04-15T14:04:19.086Z","gitSha":"cc3e17dbc1c204b5d4e30ad449d59e9e7cd0bb89","mode":"flow","runs":2,"runModel":"anthropic:claude-opus-4-6","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":26,"passedAttempts":26,"passRate":1,"averageDurationMs":53226.5,"averageJudgeScore":95.8076923076923,"averageTokenUsagePerAttempt":{"prompt":136106.3076923077,"completion":2673.5,"total":138779.8076923077},"failedCaseIds":[],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":27188.5,"averageJudgeScore":98,"averageTokenUsagePerAttempt":{"prompt":119289,"completion":630.5,"total":119919.5}},{"id":"flow-test1-reuse-existing-script","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":26495.5,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":90983.5,"completion":746.5,"total":91730}},{"id":"flow-test2-call-existing-subflow","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":26312.5,"averageJudgeScore":97.5,"averageTokenUsagePerAttempt":{"prompt":91534,"completion":769.5,"total":92303.5}},{"id":"flow-test3-branchone-routing","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":42606,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":149110.5,"completion":1761.5,"total":150872}},{"id":"flow-test4-order-processing-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":77153.5,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":159363,"completion":4355,"total":163718}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":107545,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":138505.5,"completion":7243.5,"total":145749}},{"id":"flow-test6-ai-agent-tools","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":112611,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":171742,"completion":8499.5,"total
":180241.5}},{"id":"flow-test7-simple-modification","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":44779,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":125571.5,"completion":1625.5,"total":127197}},{"id":"flow-test8-branching-in-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":50868,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":155604.5,"completion":2681,"total":158285.5}},{"id":"flow-test9-parallel-refactor","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":59752,"averageJudgeScore":92.5,"averageTokenUsagePerAttempt":{"prompt":154274.5,"completion":2961,"total":157235.5}},{"id":"flow-test10-while-loop-counter","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":36922.5,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":120778,"completion":1121,"total":121899}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":44307.5,"averageJudgeScore":93.5,"averageTokenUsagePerAttempt":{"prompt":172195,"completion":1285,"total":173480}},{"id":"flow-test12-approval-step","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":35403.5,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":120431,"completion":1076,"total":121507}}]}
{"createdAt":"2026-04-15T14:09:26.896Z","gitSha":"cc3e17dbc1c204b5d4e30ad449d59e9e7cd0bb89","mode":"flow","runs":2,"runModel":"googleai:gemini-3-flash-preview","judgeModel":"claude-sonnet-4-6","caseCount":13,"attemptCount":26,"passedAttempts":26,"passRate":1,"averageDurationMs":43444.88461538462,"averageJudgeScore":93.73076923076923,"averageTokenUsagePerAttempt":{"prompt":209953.38461538462,"completion":1267.2307692307693,"total":213042.65384615384},"failedCaseIds":[],"cases":[{"id":"flow-test0-sum-two-numbers","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":18405.5,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":106013.5,"completion":466,"total":106954.5}},{"id":"flow-test1-reuse-existing-script","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":18034.5,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":80428.5,"completion":524.5,"total":81372.5}},{"id":"flow-test2-call-existing-subflow","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":17393,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":80653,"completion":538,"total":81544.5}},{"id":"flow-test3-branchone-routing","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":28979,"averageJudgeScore":100,"averageTokenUsagePerAttempt":{"prompt":195088,"completion":1003.5,"total":196934.5}},{"id":"flow-test4-order-processing-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":47315,"averageJudgeScore":87,"averageTokenUsagePerAttempt":{"prompt":264983,"completion":1909.5,"total":268753}},{"id":"flow-test5-parallel-data-pipeline","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":55034.5,"averageJudgeScore":96,"averageTokenUsagePerAttempt":{"prompt":315058,"completion":2118,"total":319111}},{"id":"flow-test6-ai-agent-tools","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":53794.5,"averageJudgeScore":88.5,"averageTokenUsagePerAttempt":{"prompt":27879
4,"completion":2275,"total":283175}},{"id":"flow-test7-simple-modification","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":43680,"averageJudgeScore":91,"averageTokenUsagePerAttempt":{"prompt":177988,"completion":963,"total":182547}},{"id":"flow-test8-branching-in-loop","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":65355.5,"averageJudgeScore":97,"averageTokenUsagePerAttempt":{"prompt":326580,"completion":1650,"total":331999}},{"id":"flow-test9-parallel-refactor","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":99143,"averageJudgeScore":82,"averageTokenUsagePerAttempt":{"prompt":495491,"completion":2535,"total":503137}},{"id":"flow-test10-while-loop-counter","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":33126.5,"averageJudgeScore":94.5,"averageTokenUsagePerAttempt":{"prompt":132736,"completion":820,"total":134766.5}},{"id":"flow-test11-preprocessor-and-failure-handler","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":52696.5,"averageJudgeScore":87,"averageTokenUsagePerAttempt":{"prompt":168158.5,"completion":1022.5,"total":170345.5}},{"id":"flow-test12-approval-step","attemptCount":2,"passedAttempts":2,"passRate":1,"averageDurationMs":31826,"averageJudgeScore":98.5,"averageTokenUsagePerAttempt":{"prompt":107422.5,"completion":649,"total":108914.5}}]}

View File

@@ -4374,6 +4374,53 @@ The OpenFlow schema (openflow.openapi.yaml) is the source of truth for flow stru
- \`preprocessor\` - Reserved for preprocessor module
- \`Input\` - Reserved for flow input reference
## Hard Structural Rules
These are strict Windmill schema rules. Follow them exactly.
- \`value.modules\` is only for normal sequential steps
- \`value.preprocessor_module\` and \`value.failure_module\` are special top-level fields inside \`value\`, not entries in \`value.modules\`
- If a flow needs a preprocessor, create \`value.preprocessor_module\` with \`id: preprocessor\`
- If a flow needs a failure handler, create \`value.failure_module\` with \`id: failure\`
- Do NOT create regular modules inside \`value.modules\` named \`preprocessor\` or \`failure\`
- \`preprocessor_module\` and \`failure_module\` only support \`script\` or \`rawscript\`
- \`preprocessor_module\` runs before normal modules and cannot reference \`results.*\`
- \`failure_module\` can use the \`error\` object with \`error.message\`, \`error.step_id\`, \`error.name\`, and \`error.stack\`
Correct shape:
\`\`\`yaml
value:
preprocessor_module:
id: preprocessor
value:
type: rawscript
...
failure_module:
id: failure
value:
type: rawscript
...
modules:
- id: process_event
value:
type: rawscript
...
\`\`\`
Incorrect shape:
\`\`\`yaml
value:
modules:
- id: preprocessor
...
- id: process_event
...
- id: failure
...
\`\`\`
## Module ID Rules
- Must be unique across the entire flow
@@ -4389,10 +4436,148 @@ The OpenFlow schema (openflow.openapi.yaml) is the source of truth for flow stru
## Data Flow Between Steps
- \`flow_input.property\` - Access flow input parameters
- \`results.step_id\` - Access output from a previous step
- \`results.step_id.property\` - Access specific property from previous step output
- \`flow_input.iter.value\` - Current item when inside a for-loop
- \`flow_input.iter.index\` - Current index when inside a for-loop
- \`results.step_id\` - Access output from a previous step only when that step result is in scope
- \`results.step_id.property\` - Access specific property from a previous step output only when that step result is in scope
- \`flow_input.iter.value\` - Current iteration value when inside a loop (\`forloopflow\` or \`whileloopflow\`)
- \`flow_input.iter.index\` - Current loop index when inside a loop (\`forloopflow\` or \`whileloopflow\`)
## Loop Structure Rules
- For \`whileloopflow\`, use module-level \`stop_after_if\` on the loop module itself when the loop should stop based on the result of an iteration
- Do NOT put \`stop_after_if\` inside \`value\` of a \`whileloopflow\`
- \`stop_after_all_iters_if\` is for checks after the whole loop finishes, not the normal per-iteration break condition
- When a \`whileloopflow\` carries state forward between iterations, use \`flow_input.iter.value\` as the current loop value and provide an explicit first-iteration fallback when needed
- Use \`flow_input.iter.index\` only when the loop logic is truly based on the iteration index, not as a replacement for the current loop value
- If the user asks for a final scalar/object after a loop, add a normal step after the loop that extracts the final value from the loop result instead of returning the whole loop result array
Correct \`whileloopflow\` shape:
\`\`\`yaml
- id: loop_until_done
stop_after_if:
expr: result.done === true
skip_if_stopped: false
value:
type: whileloopflow
skip_failures: false
modules:
- id: advance_state
value:
type: rawscript
input_transforms:
state:
type: javascript
expr: flow_input.iter && flow_input.iter.value !== undefined ? flow_input.iter.value : flow_input.initial_state
- id: return_final_state
value:
type: rawscript
input_transforms:
final_state:
type: javascript
expr: results.loop_until_done[results.loop_until_done.length - 1]
\`\`\`
Incorrect \`whileloopflow\` patterns:
\`\`\`yaml
- id: loop_until_done
value:
type: whileloopflow
stop_after_if:
expr: result.done === true
\`\`\`
\`\`\`yaml
input_transforms:
state:
type: javascript
expr: flow_input.iter.index
\`\`\`
\`\`\`yaml
input_transforms:
final_state:
type: javascript
expr: results.loop_until_done
\`\`\`
## Approval / Suspend Structure
- \`suspend\` belongs on the flow module object itself, as a sibling of \`id\` and \`value\`
- Never put \`suspend\` inside \`value\`
Correct shape:
\`\`\`yaml
- id: request_approval
suspend:
required_events: 1
resume_form:
schema:
type: object
properties:
comment:
type: string
required: [comment]
value:
type: identity
\`\`\`
Incorrect shape:
\`\`\`yaml
- id: request_approval
value:
type: rawscript
suspend:
required_events: 1
\`\`\`
## Branch Result Scope Rules
- Inside a branch, you may reference earlier outer steps and earlier steps in the same branch
- Outside a \`branchone\`, do NOT reference ids of steps that only exist inside its branches or default branch. Use \`results.<branchone_module_id>\` instead
- Outside a \`branchall\`, do NOT reference ids of steps inside its branches. Use \`results.<branchall_module_id>\` instead
- If downstream steps need a stable shape after a branch, make each branch return the same fields
- When needed, add a normalization step immediately after the branch and consume \`results.<branch_module_id>\` there
Correct after \`branchone\`:
\`\`\`yaml
- id: route_order
value:
type: branchone
...
- id: send_confirmation
value:
input_transforms:
routed:
type: javascript
expr: results.route_order
\`\`\`
Incorrect after \`branchone\`:
\`\`\`yaml
expr: results.create_shipment
expr: results.create_backorder
\`\`\`
Correct after \`branchall\`:
\`\`\`yaml
- id: enrich_parallel
value:
type: branchall
parallel: true
...
- id: combine_data
value:
input_transforms:
enrichments:
type: javascript
expr: results.enrich_parallel
\`\`\`
## Input Transforms
@@ -4409,14 +4594,14 @@ JavaScript transform (dynamic expression):
- For flow inputs: Use type \`"object"\` with format \`"resource-{type}"\` (e.g., \`"resource-postgresql"\`)
- For step inputs: Use static value \`"$res:path/to/resource"\`
## Failure Handler
## Final Structural Self-Check
Executes when any step fails. Has access to error details:
Before finalizing a flow, verify:
- \`error.message\` - Error message
- \`error.step_id\` - ID of failed step
- \`error.name\` - Error name
- \`error.stack\` - Stack trace
- any preprocessor is in \`value.preprocessor_module\`
- any failure handler is in \`value.failure_module\`
- any approval step has module-level \`suspend\`
- no downstream step references inner branch step ids from outside the branch
## S3 Object Operations

View File

@@ -2,17 +2,20 @@
import FlowModuleSchemaMap from '$lib/components/flows/map/FlowModuleSchemaMap.svelte'
import { getContext, untrack } from 'svelte'
import type { ExtendedOpenFlow, FlowEditorContext } from '$lib/components/flows/types'
import { dfs } from '$lib/components/flows/previousResults'
import type { FlowModule, InputTransform, OpenFlow } from '$lib/gen'
import type { FlowModule, InputTransform } from '$lib/gen'
import type { FlowAIChatHelpers } from './core'
import { createInlineScriptSession } from './inlineScriptsUtils'
import { loadSchemaFromModule } from '$lib/components/flows/flowInfers'
import { aiChatManager } from '../AIChatManager.svelte'
import { refreshStateStore } from '$lib/svelte5Utils.svelte'
import { getSubModules } from '$lib/components/flows/flowExplorer'
import { SPECIAL_MODULE_IDS } from '../shared'
import type { FlowCopilotContext } from '../../flow'
import type { ScriptLintResult } from '../shared'
import {
applyFlowJsonUpdate,
getFlowModuleById,
getRawScriptModuleById
} from './helperUtils'
let {
flowModuleSchemaMap,
@@ -32,16 +35,6 @@
// Get diffManager from the graph
const diffManager = $derived(flowModuleSchemaMap?.getDiffManager())
function getModule(id: string, flow: OpenFlow = flowStore.val) {
if (id === SPECIAL_MODULE_IDS.PREPROCESSOR) {
return flow.value.preprocessor_module
} else if (id === SPECIAL_MODULE_IDS.FAILURE) {
return flow.value.failure_module
} else {
return dfs(id, flow, false)[0]
}
}
const flowHelpers: FlowAIChatHelpers = {
// flow context
getFlowAndSelectedId: () => {
@@ -53,7 +46,7 @@
},
getModules: (id?: string) => {
if (id) {
const module = getModule(id)
const module = getFlowModuleById(flowStore.val, id)
if (!module) {
throw new Error('Module not found')
@@ -76,7 +69,7 @@
// Update current editor if needed
const targetSnapshot = snapshot ?? diffManager.beforeFlow
if ($currentEditor && targetSnapshot) {
const module = getModule($currentEditor.stepId, targetSnapshot)
const module = getFlowModuleById(targetSnapshot, $currentEditor.stepId)
if (module) {
if ($currentEditor.type === 'script' && module.value.type === 'rawscript') {
$currentEditor.editor.setCode(module.value.content)
@@ -91,51 +84,48 @@
// ai chat tools
setCode: async (id: string, code: string) => {
const module = getModule(id)
const module = getRawScriptModuleById(flowStore.val, id)
if (!module) {
throw new Error('Module not found')
throw new Error('Module not found or is not a rawscript')
}
if (module.value.type === 'rawscript') {
// 1. Take snapshot only if none exists (preserves baseline for cumulative changes)
if (!diffManager?.beforeFlow) {
const snapshot = $state.snapshot(flowStore).val
diffManager?.setBeforeFlow(snapshot)
diffManager?.setEditMode(true)
}
// 2. Apply the code change
module.value.content = code
inlineScriptSession.set(id, code)
const { input_transforms, schema } = await loadSchemaFromModule(module)
module.value.input_transforms = input_transforms
refreshStateStore(flowStore)
// 1. Take snapshot only if none exists (preserves baseline for cumulative changes)
if (!diffManager?.beforeFlow) {
const snapshot = $state.snapshot(flowStore).val
diffManager?.setBeforeFlow(snapshot)
diffManager?.setEditMode(true)
}
// Update exprsToSet if this module is currently selected
if (id === selectedId && exprsToSet) {
exprsToSet.set(input_transforms)
}
// 2. Apply the code change
module.value.content = code
inlineScriptSession.set(id, code)
const { input_transforms, schema } = await loadSchemaFromModule(module)
module.value.input_transforms = input_transforms
refreshStateStore(flowStore)
if (flowStateStore.val[id]) {
flowStateStore.val[id].schema = schema
} else {
flowStateStore.val[id] = {
schema
}
}
// Update exprsToSet if this module is currently selected
if (id === selectedId && exprsToSet) {
exprsToSet.set(input_transforms)
}
// 3. Manually add to moduleActions, preserving existing action types
// Note: currentFlow is auto-synced by FlowGraphV2's effect after refreshStateStore
const currentAction = diffManager?.moduleActions[id]
if (!currentAction) {
diffManager?.setModuleActions({
...diffManager?.moduleActions,
[id]: { action: 'modified', pending: true }
})
}
// If already tracked (e.g., 'added' from setFlowJson), keep that status
if (flowStateStore.val[id]) {
flowStateStore.val[id].schema = schema
} else {
throw new Error('Module is not a rawscript or script')
flowStateStore.val[id] = {
schema
}
}
// 3. Manually add to moduleActions, preserving existing action types
// Note: currentFlow is auto-synced by FlowGraphV2's effect after refreshStateStore
const currentAction = diffManager?.moduleActions[id]
if (!currentAction) {
diffManager?.setModuleActions({
...diffManager?.moduleActions,
[id]: { action: 'modified', pending: true }
})
}
// If already tracked (e.g., 'added' from setFlowJson), keep that status
if ($currentEditor && $currentEditor.type === 'script' && $currentEditor.stepId === id) {
$currentEditor.editor.setCode(code)
}
@@ -176,7 +166,7 @@
getLintErrors: async (moduleId: string): Promise<ScriptLintResult> => {
const module = getModule(moduleId)
const module = getFlowModuleById(flowStore.val, moduleId)
if (!module || module.value.type !== 'rawscript') {
return { errorCount: 0, warningCount: 0, errors: [], warnings: [] }
}
@@ -205,10 +195,17 @@
setFlowJson: async (
modules: FlowModule[] | undefined,
schema: Record<string, any> | undefined
schema: Record<string, any> | undefined,
preprocessorModule: FlowModule | null | undefined,
failureModule: FlowModule | null | undefined
) => {
try {
if (modules || schema) {
if (
modules !== undefined ||
schema !== undefined ||
preprocessorModule !== undefined ||
failureModule !== undefined
) {
// Take snapshot of current flowStore and set as beforeFlow
if (!diffManager?.hasPendingChanges) {
const snapshot = $state.snapshot(flowStore).val
@@ -217,23 +214,12 @@
}
}
if (modules) {
// Restore inline script references back to full content
const restoredModules = inlineScriptSession.restoreInlineScriptReferences(modules)
const unresolvedRefs = inlineScriptSession.findUnresolvedInlineScriptRefs(restoredModules)
if (unresolvedRefs.length > 0) {
throw new Error(
`Unresolved inline script references: ${unresolvedRefs.join(', ')}`
)
}
// Directly modify flowStore (immediate effect)
flowStore.val.value.modules = restoredModules
}
// Update schema if provided
if (schema !== undefined) {
flowStore.val.schema = schema
}
applyFlowJsonUpdate(flowStore.val, inlineScriptSession, {
modules,
schema,
preprocessorModule,
failureModule
})
// Refresh the state store to update UI
refreshStateStore(flowStore)
@@ -253,7 +239,7 @@
diffManager?.moduleActions[selectedId]?.pending &&
$currentEditor.editor.getAiChatEditorHandler()
) {
const moduleLastSnapshot = getModule(selectedId, diffManager.beforeFlow)
const moduleLastSnapshot = getFlowModuleById(diffManager.beforeFlow, selectedId)
const content =
moduleLastSnapshot?.value.type === 'rawscript' ? moduleLastSnapshot.value.content : ''
if (content.length > 0) {

View File

@@ -35,9 +35,9 @@ import {
import type { ContextElement } from '../context'
import type { ExtendedOpenFlow } from '$lib/components/flows/types'
import type { InlineScriptSession } from './inlineScriptsUtils'
import { flowModulesSchema } from './openFlowZod'
import { flowModuleSchema, flowModulesSchema } from './openFlowZod'
import { collectAllModuleIdsFromArray } from './utils'
import { getFlowPrompt } from '$system_prompts'
import { FLOW_CHAT_SPECIAL_MODULES, getFlowPrompt } from '$system_prompts'
/**
* Navigate to a schema at a given path, handling arrays, objects, unions, and wrappers.
@@ -259,7 +259,9 @@ export interface FlowAIChatHelpers {
setCode: (id: string, code: string) => Promise<void>
setFlowJson: (
modules: FlowModule[] | undefined,
schema: Record<string, any> | undefined
schema: Record<string, any> | undefined,
preprocessorModule: FlowModule | null | undefined,
failureModule: FlowModule | null | undefined
) => Promise<void>
getFlowInputsSchema: () => Promise<Record<string, any>>
/** Update exprsToSet store for InputTransformForm components (only if module is selected) */
@@ -309,19 +311,106 @@ const getInstructionsForCodeGenerationToolDef = createToolDef(
'Get instructions for code generation for a raw script step'
)
/**
 * Argument schema shared by the `set_preprocessor_module` and `set_failure_module` tools.
 * The value is either a JSON-encoded module object (string) or `null`, which requests removal.
 * There is no `.optional()` here, so the argument must always be present in the tool call.
 */
const specialModuleToolArgSchema = z
.string()
.nullable()
.describe(
'JSON string containing the special module object. Use null to remove the special module.'
)
// Using string for modules and schema because Gemini-2.5-flash performs better with strings (MALFORMED_FUNCTION_CALL errors happen more often with objects)
const setFlowJsonToolSchema = z.object({
modules: z.string().optional().nullable().describe('JSON string containing the flow modules'),
schema: z.string().optional().nullable().describe('JSON string containing the flow input schema')
schema: z.string().optional().nullable().describe('JSON string containing the flow input schema'),
preprocessor_module: z
.string()
.optional()
.nullable()
.describe('JSON string containing the optional preprocessor module'),
failure_module: z
.string()
.optional()
.nullable()
.describe('JSON string containing the optional failure module')
})
const setFlowJsonToolDef = createToolDef(
setFlowJsonToolSchema,
'set_flow_json',
'Set the entire flow by providing the complete flow object. This replaces all existing modules and schema.',
'Set the entire flow by providing the complete flow object. This replaces any provided modules, schema, preprocessor_module, and failure_module.',
{ strict: false }
)
// Arguments of the `set_preprocessor_module` tool: a single `module` field holding
// a JSON-encoded module string, or null to remove the preprocessor.
const setPreprocessorModuleToolSchema = z.object({
module: specialModuleToolArgSchema
})
// Tool definition exposed to the model for updating only the preprocessor module.
const setPreprocessorModuleToolDef = createToolDef(
setPreprocessorModuleToolSchema,
'set_preprocessor_module',
'Set or replace the flow preprocessor module. Use this when the flow needs logic that runs before the main modules.'
)
// Arguments of the `set_failure_module` tool: a single `module` field holding
// a JSON-encoded module string, or null to remove the failure handler.
const setFailureModuleToolSchema = z.object({
module: specialModuleToolArgSchema
})
// Tool definition exposed to the model for updating only the failure module.
const setFailureModuleToolDef = createToolDef(
setFailureModuleToolSchema,
'set_failure_module',
'Set or replace the flow failure module. Use this when the flow needs a dedicated error handler.'
)
// Maps each special-module field name to the reserved module id that a module
// supplied through that field must carry (enforced in validateSpecialFlowModule).
const specialFlowModuleFields = {
preprocessor_module: SPECIAL_MODULE_IDS.PREPROCESSOR,
failure_module: SPECIAL_MODULE_IDS.FAILURE
} as const
// Union of the keys above: 'preprocessor_module' | 'failure_module'.
type SpecialFlowModuleField = keyof typeof specialFlowModuleFields
/**
 * Decodes a tool argument that may arrive as a JSON string.
 *
 * - `undefined` and `null` are returned unchanged, so callers can tell "absent" from "remove".
 * - Strings are parsed with `JSON.parse`.
 * - Any other value (already-decoded objects, numbers, ...) is returned as-is.
 *
 * @param value Raw argument value received from the tool call.
 * @param field Argument name, used only to build the error message.
 * @returns The decoded value; it is NOT validated against any schema here.
 * @throws Error with message `Invalid JSON for <field>: <reason>` when a string fails to parse.
 */
function parseOptionalJsonArg(value: unknown, field: string): unknown {
	// Loose equality matches exactly null and undefined; the original value is handed back.
	if (value == null) {
		return value
	}

	// Non-string inputs were already decoded by the caller or provider.
	if (typeof value !== 'string') {
		return value
	}

	try {
		return JSON.parse(value)
	} catch (e) {
		const reason = e instanceof Error ? e.message : String(e)
		throw new Error(`Invalid JSON for ${field}: ${reason}`)
	}
}
/**
 * Validates a decoded preprocessor/failure module.
 *
 * `undefined` and `null` are passed through untouched. Anything else must:
 * 1. parse against `flowModuleSchema`,
 * 2. carry the reserved id mapped to `field` in `specialFlowModuleFields`,
 * 3. have a value of type `rawscript` or `script`.
 *
 * @param module Decoded (already JSON-parsed) module candidate.
 * @param field Which special slot the module is destined for; also used in error messages.
 * @returns The schema-parsed module, or the original `null`/`undefined`.
 * @throws Error describing up to five schema issues, a wrong id, or an unsupported module type.
 */
function validateSpecialFlowModule(
	module: unknown,
	field: SpecialFlowModuleField
): FlowModule | null | undefined {
	if (module === undefined || module === null) {
		return module
	}

	const parsed = flowModuleSchema.safeParse(module)
	if (!parsed.success) {
		// Report at most five issues to keep the message short.
		const details: string[] = []
		for (const issue of parsed.error.issues.slice(0, 5)) {
			// Issues on the root object have an empty path; label them with the field name.
			const location = issue.path.length > 0 ? issue.path.join('.') : field
			details.push(`${location}: ${issue.message}`)
		}
		throw new Error(`Invalid ${field}:\n${details.join('\n')}`)
	}

	const candidate = parsed.data
	const requiredId = specialFlowModuleFields[field]
	if (candidate.id !== requiredId) {
		throw new Error(`Invalid ${field}: id must be "${requiredId}"`)
	}

	const moduleType = candidate.value.type
	if (moduleType !== 'rawscript' && moduleType !== 'script') {
		throw new Error(`Invalid ${field}: only "rawscript" and "script" modules are supported`)
	}

	return candidate
}
// Will be overridden by setSchema
const testRunFlowSchema = z.object({
args: z
@@ -634,36 +723,111 @@ export const flowTools: Tool<FlowAIChatHelpers>[] = [
return `Code for module '${moduleId}' has been updated successfully.`
}
},
{
	// Tool: set, replace, or (with module = null) remove the flow's preprocessor module.
	def: setPreprocessorModuleToolDef,
	streamArguments: true,
	showDetails: true,
	showFade: true,
	fn: async ({ args, helpers, toolId, toolCallbacks }) => {
		// Validate the raw arguments, then decode and validate the module payload.
		const { module: rawModule } = setPreprocessorModuleToolSchema.parse(args)
		const preprocessor = validateSpecialFlowModule(
			parseOptionalJsonArg(rawModule, 'module'),
			'preprocessor_module'
		)
		const isRemoval = preprocessor === null

		toolCallbacks.setToolStatus(toolId, {
			content: isRemoval ? 'Removing preprocessor module...' : 'Setting preprocessor module...'
		})

		// Only the preprocessor slot is passed; the other three arguments stay undefined.
		await helpers.setFlowJson(undefined, undefined, preprocessor, undefined)

		// If the preprocessor is the currently selected module, push its input transforms as well.
		if (preprocessor) {
			const { selectedId } = helpers.getFlowAndSelectedId()
			if (
				selectedId === SPECIAL_MODULE_IDS.PREPROCESSOR &&
				'input_transforms' in preprocessor.value &&
				preprocessor.value.input_transforms
			) {
				helpers.updateExprsToSet(preprocessor.id, preprocessor.value.input_transforms)
			}
		}

		toolCallbacks.setToolStatus(toolId, {
			content: isRemoval ? 'Preprocessor module removed' : 'Preprocessor module updated',
			result: 'Success'
		})

		if (isRemoval) {
			return 'Preprocessor module removed'
		}
		return 'Preprocessor module updated successfully.'
	}
},
{
	// Tool: set, replace, or (with module = null) remove the flow's failure module.
	def: setFailureModuleToolDef,
	streamArguments: true,
	showDetails: true,
	showFade: true,
	fn: async ({ args, helpers, toolId, toolCallbacks }) => {
		// Validate the raw arguments, then decode and validate the module payload.
		const { module: rawModule } = setFailureModuleToolSchema.parse(args)
		const failureHandler = validateSpecialFlowModule(
			parseOptionalJsonArg(rawModule, 'module'),
			'failure_module'
		)
		const isRemoval = failureHandler === null

		toolCallbacks.setToolStatus(toolId, {
			content: isRemoval ? 'Removing failure module...' : 'Setting failure module...'
		})

		// Only the failure slot is passed; the other three arguments stay undefined.
		await helpers.setFlowJson(undefined, undefined, undefined, failureHandler)

		// If the failure module is the currently selected module, push its input transforms as well.
		if (failureHandler) {
			const { selectedId } = helpers.getFlowAndSelectedId()
			if (
				selectedId === SPECIAL_MODULE_IDS.FAILURE &&
				'input_transforms' in failureHandler.value &&
				failureHandler.value.input_transforms
			) {
				helpers.updateExprsToSet(failureHandler.id, failureHandler.value.input_transforms)
			}
		}

		toolCallbacks.setToolStatus(toolId, {
			content: isRemoval ? 'Failure module removed' : 'Failure module updated',
			result: 'Success'
		})

		if (isRemoval) {
			return 'Failure module removed'
		}
		return 'Failure module updated successfully.'
	}
},
{
def: setFlowJsonToolDef,
streamArguments: true,
showDetails: true,
showFade: true,
fn: async ({ args, helpers, toolId, toolCallbacks }) => {
const { modules, schema } = args
const { modules, schema, preprocessor_module, failure_module } = args
let parsedModules: FlowModule[] | undefined
let parsedSchema: Record<string, any> | undefined
let parsedPreprocessorModule: FlowModule | null | undefined
let parsedFailureModule: FlowModule | null | undefined
// Parse JSON strings
try {
parsedModules = modules
? typeof modules === 'string'
? JSON.parse(modules)
: modules
: undefined
parsedSchema = schema
? typeof schema === 'string'
? JSON.parse(schema)
: schema
: undefined
} catch (e) {
const errorMessage = e instanceof Error ? e.message : String(e)
throw new Error(`Invalid JSON: ${errorMessage}`)
parsedModules = parseOptionalJsonArg(modules, 'modules') as FlowModule[] | undefined
parsedSchema = parseOptionalJsonArg(schema, 'schema') as Record<string, any> | undefined
parsedPreprocessorModule = parseOptionalJsonArg(
preprocessor_module,
'preprocessor_module'
) as FlowModule | null | undefined
parsedFailureModule = parseOptionalJsonArg(failure_module, 'failure_module') as
| FlowModule
| null
| undefined
if (parsedModules === null) {
parsedModules = undefined
}
if (parsedSchema === null) {
parsedSchema = undefined
}
// Validate modules against OpenFlow schema
if (parsedModules) {
if (parsedModules !== undefined) {
const result = flowModulesSchema.safeParse(parsedModules)
if (!result.success) {
const errors = result.error.issues.slice(0, 5).map((e) => {
@@ -699,23 +863,61 @@ export const flowTools: Tool<FlowAIChatHelpers>[] = [
throw new Error(`Invalid flow modules:\n${errors.join('\n')}`)
} else {
// check for duplicate ids
const ids = collectAllModuleIdsFromArray(parsedModules)
if (ids.length !== new Set(ids).size) {
throw new Error('Duplicate module IDs found in flow')
const reservedIds = ids.filter(
(id) =>
id === SPECIAL_MODULE_IDS.PREPROCESSOR || id === SPECIAL_MODULE_IDS.FAILURE
)
if (reservedIds.length > 0) {
throw new Error(
'Special modules must be provided via preprocessor_module and failure_module, not inside modules'
)
}
}
}
parsedPreprocessorModule = validateSpecialFlowModule(
parsedPreprocessorModule,
'preprocessor_module'
)
parsedFailureModule = validateSpecialFlowModule(parsedFailureModule, 'failure_module')
const ids = [
...(parsedModules ? collectAllModuleIdsFromArray(parsedModules) : []),
...([parsedPreprocessorModule, parsedFailureModule].filter(
(module): module is FlowModule => module !== undefined && module !== null
)
.map((module) => module.id))
]
if (ids.length !== new Set(ids).size) {
throw new Error('Duplicate module IDs found in flow')
}
toolCallbacks.setToolStatus(toolId, {
content: `Setting flow...`
})
await helpers.setFlowJson(parsedModules, parsedSchema)
await helpers.setFlowJson(
parsedModules,
parsedSchema,
parsedPreprocessorModule,
parsedFailureModule
)
// Update exprsToSet if the selected module has input_transforms
if (parsedModules) {
if (
parsedModules !== undefined ||
parsedPreprocessorModule !== undefined ||
parsedFailureModule !== undefined
) {
const { selectedId } = helpers.getFlowAndSelectedId()
const selectedModule = findModuleById(parsedModules, selectedId)
const selectedModule =
selectedId === SPECIAL_MODULE_IDS.PREPROCESSOR
? parsedPreprocessorModule ?? undefined
: selectedId === SPECIAL_MODULE_IDS.FAILURE
? parsedFailureModule ?? undefined
: parsedModules
? findModuleById(parsedModules, selectedId)
: undefined
if (
selectedModule &&
'input_transforms' in selectedModule.value &&
@@ -767,7 +969,9 @@ export function prepareFlowSystemMessage(customPrompt?: string): ChatCompletionS
## Tool Selection Guide
**Flow Modification:**
- **Create or modify the entire flow** → \`set_flow_json\` (provide complete modules array and optional schema)
- **Update only the preprocessor** → \`set_preprocessor_module\`
- **Update only the failure handler** → \`set_failure_module\`
- **Create or replace the full flow** → \`set_flow_json\`
**Code & Scripts:**
- **View existing inline script code** → \`inspect_inline_script\`
@@ -789,13 +993,17 @@ export function prepareFlowSystemMessage(customPrompt?: string): ChatCompletionS
- **Search resource types** → \`resource_type\`
- **Get database schema** → \`get_db_schema\`
${FLOW_CHAT_SPECIAL_MODULES}
## Flow Modification with set_flow_json
Use the \`set_flow_json\` tool to set the entire flow structure at once. Provide the complete modules array and optionally the flow input schema.
Use the \`set_flow_json\` tool to set the entire flow structure at once. Provide the complete modules array and optionally the flow input schema, \`preprocessor_module\`, and \`failure_module\`.
**Parameters:**
- \`modules\`: Array of flow modules (required)
- \`schema\`: Flow input schema in JSON Schema format (optional)
- \`preprocessor_module\`: Special module that runs before \`modules\` (optional, separate from \`modules\`)
- \`failure_module\`: Special module that runs on failure (optional, separate from \`modules\`)
**Example - Simple flow:**
\`\`\`javascript

View File

@@ -0,0 +1,111 @@
import type { FlowModule, OpenFlow, RawScript } from '$lib/gen'
import { dfs } from '$lib/components/flows/previousResults'
import { SPECIAL_MODULE_IDS } from '../shared'
import type { InlineScriptSession } from './inlineScriptsUtils'
// Minimal structural view of a flow used by the helpers in this file:
// the `value` of an OpenFlow plus an optional input `schema`.
type FlowLike = Pick<OpenFlow, 'value'> & {
schema?: Record<string, any>
}
/**
 * Partial update consumed by applyFlowJsonUpdate.
 * A field left `undefined` is not touched; for the two special modules,
 * `null` means "remove the module from the flow".
 */
export interface FlowJsonUpdate {
modules?: FlowModule[]
schema?: Record<string, any>
preprocessorModule?: FlowModule | null
failureModule?: FlowModule | null
}
/**
 * Looks up a module by id, including the two special top-level modules.
 *
 * The reserved preprocessor and failure ids resolve to `value.preprocessor_module`
 * and `value.failure_module`. Every other id is searched with `dfs`, and the
 * first entry of its result is returned.
 *
 * @param flow Flow to search; a missing flow yields `undefined`.
 * @param id Module id to look up.
 * @returns The matching module, or `undefined` when there is none.
 */
export function getFlowModuleById(flow: FlowLike | undefined, id: string): FlowModule | undefined {
	if (!flow) {
		return undefined
	}

	switch (id) {
		case SPECIAL_MODULE_IDS.PREPROCESSOR:
			return flow.value.preprocessor_module
		case SPECIAL_MODULE_IDS.FAILURE:
			return flow.value.failure_module
		default: {
			const matches = dfs(id, flow as OpenFlow, false)
			return matches[0]
		}
	}
}
/**
 * Same lookup as getFlowModuleById, restricted to `rawscript` modules.
 *
 * @param flow Flow to search; may be `undefined`.
 * @param id Module id to look up.
 * @returns The module narrowed to a raw-script value, or `undefined` when the
 *          module does not exist or has a different value type.
 */
export function getRawScriptModuleById(
	flow: FlowLike | undefined,
	id: string
): (FlowModule & { value: RawScript }) | undefined {
	const candidate = getFlowModuleById(flow, id)
	return candidate?.value.type === 'rawscript'
		? (candidate as FlowModule & { value: RawScript })
		: undefined
}
/**
 * Overwrites the `content` of a raw-script module in place.
 *
 * @param flow Flow containing the module (mutated when the module is found).
 * @param id Id of the raw-script module to update.
 * @param code New script content.
 * @returns The updated module, or `undefined` when no raw-script module has that id
 *          (in which case nothing is modified).
 */
export function updateRawScriptModuleContent(
	flow: FlowLike,
	id: string,
	code: string
): (FlowModule & { value: RawScript }) | undefined {
	const target = getRawScriptModuleById(flow, id)
	if (target) {
		target.value.content = code
	}
	return target
}
/**
 * Applies a partial flow update to `flow` in place.
 *
 * Fields of the update left `undefined` are not touched. For the special modules,
 * `null` removes the module (the flow field is set to `undefined`).
 *
 * The update is applied in two phases so that it is all-or-nothing:
 * every module is first restored through the inline script session (which throws
 * on unresolved inline script references), and only when all restorations have
 * succeeded is anything written to `flow`. Previously `modules` and `schema` could
 * already be written when restoring a special module threw, leaving the flow
 * half-updated.
 *
 * @param flow Flow to mutate.
 * @param inlineScriptSession Session used to restore inline script references.
 * @param update Parts of the flow to replace.
 * @throws Error when any provided module still has unresolved inline script references;
 *         `flow` is left unmodified in that case.
 */
export function applyFlowJsonUpdate(
	flow: FlowLike,
	inlineScriptSession: InlineScriptSession,
	{ modules, schema, preprocessorModule, failureModule }: FlowJsonUpdate
): void {
	// Phase 1: resolve everything that can throw, in the original order, without writing.
	const restoredModules =
		modules !== undefined ? restoreFlowModules(modules, inlineScriptSession) : undefined
	const restoredPreprocessor =
		preprocessorModule === undefined || preprocessorModule === null
			? undefined
			: restoreFlowModule(preprocessorModule, inlineScriptSession)
	const restoredFailure =
		failureModule === undefined || failureModule === null
			? undefined
			: restoreFlowModule(failureModule, inlineScriptSession)

	// Phase 2: commit. Nothing below can throw because of the update's content.
	if (restoredModules !== undefined) {
		flow.value.modules = restoredModules
	}

	if (schema !== undefined) {
		flow.schema = schema
	}

	// `null` was mapped to `undefined` above, which clears the special module.
	if (preprocessorModule !== undefined) {
		flow.value.preprocessor_module = restoredPreprocessor
	}

	if (failureModule !== undefined) {
		flow.value.failure_module = restoredFailure
	}
}
/**
 * Restores inline script references in a list of modules and verifies that
 * none are left unresolved.
 *
 * @returns The modules as returned by the session's restore step.
 * @throws Error when unresolved inline script references remain.
 */
function restoreFlowModules(
	modules: FlowModule[],
	inlineScriptSession: InlineScriptSession
): FlowModule[] {
	const withInlineScripts = inlineScriptSession.restoreInlineScriptReferences(modules)
	assertResolvedInlineScripts(withInlineScripts, inlineScriptSession)
	return withInlineScripts
}
/**
 * Single-module variant of restoreFlowModules: wraps the module in a one-element
 * array for the session, then checks and returns the first restored entry.
 *
 * @throws Error when the restored module still has unresolved inline script references.
 */
function restoreFlowModule(
	module: FlowModule,
	inlineScriptSession: InlineScriptSession
): FlowModule {
	const restoredList = inlineScriptSession.restoreInlineScriptReferences([module])
	const [firstRestored] = restoredList
	assertResolvedInlineScripts([firstRestored], inlineScriptSession)
	return firstRestored
}
/**
 * Throws when the session reports unresolved inline script references in `modules`.
 *
 * @throws Error listing the unresolved references, joined with ", ".
 */
function assertResolvedInlineScripts(
	modules: FlowModule[],
	inlineScriptSession: InlineScriptSession
): void {
	const dangling = inlineScriptSession.findUnresolvedInlineScriptRefs(modules)
	if (dangling.length === 0) {
		return
	}
	throw new Error(`Unresolved inline script references: ${dangling.join(', ')}`)
}

View File

@@ -546,6 +546,7 @@ export function createToolDef(
let parameters = z.toJSONSchema(zodSchema)
delete parameters.$schema
if (!parameters.required) parameters.required = []
normalizeToolParameterSchema(parameters)
return {
type: 'function',
@@ -605,9 +606,9 @@ export const createSearchHubScriptsTool = (withContent: boolean = false) => ({
})
/**
* Recursively removes format: null or format: '' from a JSON schema object
* Recursively normalizes JSON Schema quirks that specific providers reject.
*/
function removeNullFormats(schema: Record<string, any> | undefined): void {
function normalizeToolParameterSchema(schema: Record<string, any> | undefined): void {
if (!schema || typeof schema !== 'object') {
return
}
@@ -620,25 +621,31 @@ function removeNullFormats(schema: Record<string, any> | undefined): void {
// Recurse into properties
if (schema.properties && typeof schema.properties === 'object') {
for (const key of Object.keys(schema.properties)) {
removeNullFormats(schema.properties[key])
normalizeToolParameterSchema(schema.properties[key])
}
}
// Recurse into items (for arrays)
if (schema.items) {
removeNullFormats(schema.items)
if (Array.isArray(schema.items)) {
for (const item of schema.items) {
normalizeToolParameterSchema(item)
}
} else {
normalizeToolParameterSchema(schema.items)
}
}
// Recurse into additionalProperties if it's an object schema
if (schema.additionalProperties && typeof schema.additionalProperties === 'object') {
removeNullFormats(schema.additionalProperties)
normalizeToolParameterSchema(schema.additionalProperties)
}
// Recurse into allOf, anyOf, oneOf
for (const key of ['allOf', 'anyOf', 'oneOf']) {
if (Array.isArray(schema[key])) {
for (const subSchema of schema[key]) {
removeNullFormats(subSchema)
normalizeToolParameterSchema(subSchema)
}
}
}
@@ -662,8 +669,8 @@ export async function buildSchemaForTool(
toolDef.function.parameters = { ...schema, additionalProperties: false }
// recursively remove any format: null or format: '' (empty string) from schema
removeNullFormats(toolDef.function.parameters)
// recursively normalize provider-incompatible schema fragments
normalizeToolParameterSchema(toolDef.function.parameters)
// OPEN AI models don't support strict mode well with schema with complex properties, so we disable it
const model = getCurrentModel()

View File

@@ -20,6 +20,53 @@ The OpenFlow schema (openflow.openapi.yaml) is the source of truth for flow stru
- `preprocessor` - Reserved for preprocessor module
- `Input` - Reserved for flow input reference
## Hard Structural Rules
These are strict Windmill schema rules. Follow them exactly.
- `value.modules` is only for normal sequential steps
- `value.preprocessor_module` and `value.failure_module` are special top-level fields inside `value`, not entries in `value.modules`
- If a flow needs a preprocessor, create `value.preprocessor_module` with `id: preprocessor`
- If a flow needs a failure handler, create `value.failure_module` with `id: failure`
- Do NOT create regular modules inside `value.modules` named `preprocessor` or `failure`
- `preprocessor_module` and `failure_module` only support `script` or `rawscript`
- `preprocessor_module` runs before normal modules and cannot reference `results.*`
- `failure_module` can use the `error` object with `error.message`, `error.step_id`, `error.name`, and `error.stack`
Correct shape:
```yaml
value:
preprocessor_module:
id: preprocessor
value:
type: rawscript
...
failure_module:
id: failure
value:
type: rawscript
...
modules:
- id: process_event
value:
type: rawscript
...
```
Incorrect shape:
```yaml
value:
modules:
- id: preprocessor
...
- id: process_event
...
- id: failure
...
```
## Module ID Rules
- Must be unique across the entire flow
@@ -35,10 +82,148 @@ The OpenFlow schema (openflow.openapi.yaml) is the source of truth for flow stru
## Data Flow Between Steps
- `flow_input.property` - Access flow input parameters
- `results.step_id` - Access output from a previous step
- `results.step_id.property` - Access specific property from previous step output
- `flow_input.iter.value` - Current item when inside a for-loop
- `flow_input.iter.index` - Current index when inside a for-loop
- `results.step_id` - Access output from a previous step only when that step result is in scope
- `results.step_id.property` - Access specific property from a previous step output only when that step result is in scope
- `flow_input.iter.value` - Current iteration value when inside a loop (`forloopflow` or `whileloopflow`)
- `flow_input.iter.index` - Current loop index when inside a loop (`forloopflow` or `whileloopflow`)
## Loop Structure Rules
- For `whileloopflow`, use module-level `stop_after_if` on the loop module itself when the loop should stop after an iteration result
- Do NOT put `stop_after_if` inside `value` of a `whileloopflow`
- `stop_after_all_iters_if` is for checks after the whole loop finishes, not the normal per-iteration break condition
- When a `whileloopflow` carries state forward between iterations, use `flow_input.iter.value` as the current loop value and provide an explicit first-iteration fallback when needed
- Use `flow_input.iter.index` only when the loop logic is truly based on the iteration index, not as a replacement for the current loop value
- If the user asks for a final scalar/object after a loop, add a normal step after the loop that extracts the final value from the loop result instead of returning the whole loop result array
Correct `whileloopflow` shape:
```yaml
- id: loop_until_done
stop_after_if:
expr: result.done === true
skip_if_stopped: false
value:
type: whileloopflow
skip_failures: false
modules:
- id: advance_state
value:
type: rawscript
input_transforms:
state:
type: javascript
expr: flow_input.iter && flow_input.iter.value !== undefined ? flow_input.iter.value : flow_input.initial_state
- id: return_final_state
value:
type: rawscript
input_transforms:
final_state:
type: javascript
expr: results.loop_until_done[results.loop_until_done.length - 1]
```
Incorrect `whileloopflow` patterns:
```yaml
- id: loop_until_done
value:
type: whileloopflow
stop_after_if:
expr: result.done === true
```
```yaml
input_transforms:
state:
type: javascript
expr: flow_input.iter.index
```
```yaml
input_transforms:
final_state:
type: javascript
expr: results.loop_until_done
```
## Approval / Suspend Structure
- `suspend` belongs on the flow module object itself, as a sibling of `id` and `value`
- Never put `suspend` inside `value`
Correct shape:
```yaml
- id: request_approval
suspend:
required_events: 1
resume_form:
schema:
type: object
properties:
comment:
type: string
required: [comment]
value:
type: identity
```
Incorrect shape:
```yaml
- id: request_approval
value:
type: rawscript
suspend:
required_events: 1
```
## Branch Result Scope Rules
- Inside a branch, you may reference earlier outer steps and earlier steps in the same branch
- Outside a `branchone`, do NOT reference ids of steps that only exist inside its branches or default branch. Use `results.<branchone_module_id>` instead
- Outside a `branchall`, do NOT reference ids of steps inside its branches. Use `results.<branchall_module_id>` instead
- If downstream steps need a stable shape after a branch, make each branch return the same fields
- When needed, add a normalization step immediately after the branch and consume `results.<branch_module_id>` there
Correct after `branchone`:
```yaml
- id: route_order
value:
type: branchone
...
- id: send_confirmation
value:
input_transforms:
routed:
type: javascript
expr: results.route_order
```
Incorrect after `branchone`:
```yaml
expr: results.create_shipment
expr: results.create_backorder
```
Correct after `branchall`:
```yaml
- id: enrich_parallel
value:
type: branchall
parallel: true
...
- id: combine_data
value:
input_transforms:
enrichments:
type: javascript
expr: results.enrich_parallel
```
## Input Transforms
@@ -55,14 +240,14 @@ JavaScript transform (dynamic expression):
- For flow inputs: Use type `"object"` with format `"resource-{type}"` (e.g., `"resource-postgresql"`)
- For step inputs: Use static value `"$res:path/to/resource"`
## Failure Handler
## Final Structural Self-Check
Executes when any step fails. Has access to error details:
Before finalizing a flow, verify:
- `error.message` - Error message
- `error.step_id` - ID of failed step
- `error.name` - Error name
- `error.stack` - Stack trace
- any preprocessor is in `value.preprocessor_module`
- any failure handler is in `value.failure_module`
- any approval step has module-level `suspend`
- no downstream step references inner branch step ids from outside the branch
## S3 Object Operations

View File

@@ -51,6 +51,53 @@ The OpenFlow schema (openflow.openapi.yaml) is the source of truth for flow stru
- \`preprocessor\` - Reserved for preprocessor module
- \`Input\` - Reserved for flow input reference
## Hard Structural Rules
These are strict Windmill schema rules. Follow them exactly.
- \`value.modules\` is only for normal sequential steps
- \`value.preprocessor_module\` and \`value.failure_module\` are special top-level fields inside \`value\`, not entries in \`value.modules\`
- If a flow needs a preprocessor, create \`value.preprocessor_module\` with \`id: preprocessor\`
- If a flow needs a failure handler, create \`value.failure_module\` with \`id: failure\`
- Do NOT create regular modules inside \`value.modules\` named \`preprocessor\` or \`failure\`
- \`preprocessor_module\` and \`failure_module\` only support \`script\` or \`rawscript\`
- \`preprocessor_module\` runs before normal modules and cannot reference \`results.*\`
- \`failure_module\` can use the \`error\` object with \`error.message\`, \`error.step_id\`, \`error.name\`, and \`error.stack\`
Correct shape:
\`\`\`yaml
value:
preprocessor_module:
id: preprocessor
value:
type: rawscript
...
failure_module:
id: failure
value:
type: rawscript
...
modules:
- id: process_event
value:
type: rawscript
...
\`\`\`
Incorrect shape:
\`\`\`yaml
value:
modules:
- id: preprocessor
...
- id: process_event
...
- id: failure
...
\`\`\`
## Module ID Rules
- Must be unique across the entire flow
@@ -66,10 +113,148 @@ The OpenFlow schema (openflow.openapi.yaml) is the source of truth for flow stru
## Data Flow Between Steps
- \`flow_input.property\` - Access flow input parameters
- \`results.step_id\` - Access output from a previous step
- \`results.step_id.property\` - Access specific property from previous step output
- \`flow_input.iter.value\` - Current item when inside a for-loop
- \`flow_input.iter.index\` - Current index when inside a for-loop
- \`results.step_id\` - Access output from a previous step only when that step result is in scope
- \`results.step_id.property\` - Access specific property from a previous step output only when that step result is in scope
- \`flow_input.iter.value\` - Current iteration value when inside a loop (\`forloopflow\` or \`whileloopflow\`)
- \`flow_input.iter.index\` - Current loop index when inside a loop (\`forloopflow\` or \`whileloopflow\`)
## Loop Structure Rules
- For \`whileloopflow\`, use module-level \`stop_after_if\` on the loop module itself when the loop should stop after an iteration result
- Do NOT put \`stop_after_if\` inside \`value\` of a \`whileloopflow\`
- \`stop_after_all_iters_if\` is for checks after the whole loop finishes, not the normal per-iteration break condition
- When a \`whileloopflow\` carries state forward between iterations, use \`flow_input.iter.value\` as the current loop value and provide an explicit first-iteration fallback when needed
- Use \`flow_input.iter.index\` only when the loop logic is truly based on the iteration index, not as a replacement for the current loop value
- If the user asks for a final scalar/object after a loop, add a normal step after the loop that extracts the final value from the loop result instead of returning the whole loop result array
Correct \`whileloopflow\` shape:
\`\`\`yaml
- id: loop_until_done
stop_after_if:
expr: result.done === true
skip_if_stopped: false
value:
type: whileloopflow
skip_failures: false
modules:
- id: advance_state
value:
type: rawscript
input_transforms:
state:
type: javascript
expr: flow_input.iter && flow_input.iter.value !== undefined ? flow_input.iter.value : flow_input.initial_state
- id: return_final_state
value:
type: rawscript
input_transforms:
final_state:
type: javascript
expr: results.loop_until_done[results.loop_until_done.length - 1]
\`\`\`
Incorrect \`whileloopflow\` patterns:
\`\`\`yaml
- id: loop_until_done
value:
type: whileloopflow
stop_after_if:
expr: result.done === true
\`\`\`
\`\`\`yaml
input_transforms:
state:
type: javascript
expr: flow_input.iter.index
\`\`\`
\`\`\`yaml
input_transforms:
final_state:
type: javascript
expr: results.loop_until_done
\`\`\`
## Approval / Suspend Structure
- \`suspend\` belongs on the flow module object itself, as a sibling of \`id\` and \`value\`
- Never put \`suspend\` inside \`value\`
Correct shape:
\`\`\`yaml
- id: request_approval
suspend:
required_events: 1
resume_form:
schema:
type: object
properties:
comment:
type: string
required: [comment]
value:
type: identity
\`\`\`
Incorrect shape:
\`\`\`yaml
- id: request_approval
value:
type: rawscript
suspend:
required_events: 1
\`\`\`
## Branch Result Scope Rules
- Inside a branch, you may reference earlier outer steps and earlier steps in the same branch
- Outside a \`branchone\`, do NOT reference ids of steps that only exist inside its branches or default branch. Use \`results.<branchone_module_id>\` instead
- Outside a \`branchall\`, do NOT reference ids of steps inside its branches. Use \`results.<branchall_module_id>\` instead
- If downstream steps need a stable shape after a branch, make each branch return the same fields
- When needed, add a normalization step immediately after the branch and consume \`results.<branch_module_id>\` there
Correct after \`branchone\`:
\`\`\`yaml
- id: route_order
value:
type: branchone
...
- id: send_confirmation
value:
input_transforms:
routed:
type: javascript
expr: results.route_order
\`\`\`
Incorrect after \`branchone\`:
\`\`\`yaml
expr: results.create_shipment
expr: results.create_backorder
\`\`\`
Correct after \`branchall\`:
\`\`\`yaml
- id: enrich_parallel
value:
type: branchall
parallel: true
...
- id: combine_data
value:
input_transforms:
enrichments:
type: javascript
expr: results.enrich_parallel
\`\`\`
## Input Transforms
@@ -86,14 +271,14 @@ JavaScript transform (dynamic expression):
- For flow inputs: Use type \`"object"\` with format \`"resource-{type}"\` (e.g., \`"resource-postgresql"\`)
- For step inputs: Use static value \`"$res:path/to/resource"\`
## Failure Handler
## Final Structural Self-Check
Executes when any step fails. Has access to error details:
Before finalizing a flow, verify:
- \`error.message\` - Error message
- \`error.step_id\` - ID of failed step
- \`error.name\` - Error name
- \`error.stack\` - Stack trace
- any preprocessor is in \`value.preprocessor_module\`
- any failure handler is in \`value.failure_module\`
- any approval step has module-level \`suspend\`
- no downstream step references inner branch step ids from outside the branch
## S3 Object Operations
@@ -149,6 +334,46 @@ Reference a specific resource using \`$res:\` prefix:
\`\`\`
`;
export const FLOW_CHAT_SPECIAL_MODULES = `## Special Modules
- Use \`set_preprocessor_module\` to add, replace, or remove the top-level \`value.preprocessor_module\`
- Use \`set_failure_module\` to add, replace, or remove the top-level \`value.failure_module\`
- Use \`set_flow_json\` only when you are replacing the whole flow, including normal modules and optional special modules
**Example - Update only the special modules:**
\`\`\`javascript
set_preprocessor_module({
module: JSON.stringify({
id: "preprocessor",
value: {
type: "rawscript",
language: "bun",
content: "export async function preprocessor(payload: string) { const trimmed = payload.trim(); if (!trimmed) { throw new Error('payload must not be empty'); } return { payload: trimmed }; }",
input_transforms: {
payload: { type: "javascript", expr: "flow_input.payload" }
}
}
})
})
set_failure_module({
module: JSON.stringify({
id: "failure",
value: {
type: "rawscript",
language: "bun",
content: "export async function main(message: string, name: string, step_id: string) { return { message, name, step_id }; }",
input_transforms: {
message: { type: "javascript", expr: "error.message" },
name: { type: "javascript", expr: "error.name" },
step_id: { type: "javascript", expr: "error.step_id" }
}
}
})
})
\`\`\`
`;
export const SDK_TYPESCRIPT = `# TypeScript SDK (windmill-client)
Import: import * as wmill from 'windmill-client'

View File

@@ -25,6 +25,53 @@ The OpenFlow schema (openflow.openapi.yaml) is the source of truth for flow stru
- `preprocessor` - Reserved for preprocessor module
- `Input` - Reserved for flow input reference
## Hard Structural Rules
These are strict Windmill schema rules. Follow them exactly.
- `value.modules` is only for normal sequential steps
- `value.preprocessor_module` and `value.failure_module` are special top-level fields inside `value`, not entries in `value.modules`
- If a flow needs a preprocessor, create `value.preprocessor_module` with `id: preprocessor`
- If a flow needs a failure handler, create `value.failure_module` with `id: failure`
- Do NOT create regular modules inside `value.modules` named `preprocessor` or `failure`
- `preprocessor_module` and `failure_module` only support `script` or `rawscript`
- `preprocessor_module` runs before normal modules and cannot reference `results.*`
- `failure_module` can use the `error` object with `error.message`, `error.step_id`, `error.name`, and `error.stack`
Correct shape:
```yaml
value:
  preprocessor_module:
    id: preprocessor
    value:
      type: rawscript
      ...
  failure_module:
    id: failure
    value:
      type: rawscript
      ...
  modules:
    - id: process_event
      value:
        type: rawscript
        ...
```
Incorrect shape:
```yaml
value:
  modules:
    - id: preprocessor
      ...
    - id: process_event
      ...
    - id: failure
      ...
```
## Module ID Rules
- Must be unique across the entire flow
@@ -40,10 +87,148 @@ The OpenFlow schema (openflow.openapi.yaml) is the source of truth for flow stru
## Data Flow Between Steps
- `flow_input.property` - Access flow input parameters
- `results.step_id` - Access output from a previous step
- `results.step_id.property` - Access specific property from previous step output
- `flow_input.iter.value` - Current item when inside a for-loop
- `flow_input.iter.index` - Current index when inside a for-loop
- `results.step_id` - Access output from a previous step only when that step result is in scope
- `results.step_id.property` - Access specific property from a previous step output only when that step result is in scope
- `flow_input.iter.value` - Current iteration value when inside a loop (`forloopflow` or `whileloopflow`)
- `flow_input.iter.index` - Current loop index when inside a loop (`forloopflow` or `whileloopflow`)
## Loop Structure Rules
- For `whileloopflow`, use module-level `stop_after_if` on the loop module itself when the loop should stop after an iteration result
- Do NOT put `stop_after_if` inside `value` of a `whileloopflow`
- `stop_after_all_iters_if` is for checks after the whole loop finishes, not the normal per-iteration break condition
- When a `whileloopflow` carries state forward between iterations, use `flow_input.iter.value` as the current loop value and provide an explicit first-iteration fallback when needed
- Use `flow_input.iter.index` only when the loop logic is truly based on the iteration index, not as a replacement for the current loop value
- If the user asks for a final scalar/object after a loop, add a normal step after the loop that extracts the final value from the loop result instead of returning the whole loop result array
Correct `whileloopflow` shape:
```yaml
- id: loop_until_done
  stop_after_if:
    expr: result.done === true
    skip_if_stopped: false
  value:
    type: whileloopflow
    skip_failures: false
    modules:
      - id: advance_state
        value:
          type: rawscript
          input_transforms:
            state:
              type: javascript
              expr: "flow_input.iter && flow_input.iter.value !== undefined ? flow_input.iter.value : flow_input.initial_state"
- id: return_final_state
  value:
    type: rawscript
    input_transforms:
      final_state:
        type: javascript
        expr: results.loop_until_done[results.loop_until_done.length - 1]
```
Incorrect `whileloopflow` patterns:
```yaml
- id: loop_until_done
  value:
    type: whileloopflow
    stop_after_if:
      expr: result.done === true
```
```yaml
input_transforms:
state:
type: javascript
expr: flow_input.iter.index
```
```yaml
input_transforms:
final_state:
type: javascript
expr: results.loop_until_done
```
## Approval / Suspend Structure
- `suspend` belongs on the flow module object itself, as a sibling of `id` and `value`
- Never put `suspend` inside `value`
Correct shape:
```yaml
- id: request_approval
  suspend:
    required_events: 1
    resume_form:
      schema:
        type: object
        properties:
          comment:
            type: string
        required: [comment]
  value:
    type: identity
```
Incorrect shape:
```yaml
- id: request_approval
  value:
    type: rawscript
    suspend:
      required_events: 1
```
## Branch Result Scope Rules
- Inside a branch, you may reference earlier outer steps and earlier steps in the same branch
- Outside a `branchone`, do NOT reference ids of steps that only exist inside its branches or default branch. Use `results.<branchone_module_id>` instead
- Outside a `branchall`, do NOT reference ids of steps inside its branches. Use `results.<branchall_module_id>` instead
- If downstream steps need a stable shape after a branch, make each branch return the same fields
- When needed, add a normalization step immediately after the branch and consume `results.<branch_module_id>` there
Correct after `branchone`:
```yaml
- id: route_order
  value:
    type: branchone
    ...
- id: send_confirmation
  value:
    input_transforms:
      routed:
        type: javascript
        expr: results.route_order
```
Incorrect after `branchone`:
```yaml
expr: results.create_shipment
expr: results.create_backorder
```
Correct after `branchall`:
```yaml
- id: enrich_parallel
  value:
    type: branchall
    parallel: true
    ...
- id: combine_data
  value:
    input_transforms:
      enrichments:
        type: javascript
        expr: results.enrich_parallel
```
## Input Transforms
@@ -60,14 +245,14 @@ JavaScript transform (dynamic expression):
- For flow inputs: Use type `"object"` with format `"resource-{type}"` (e.g., `"resource-postgresql"`)
- For step inputs: Use static value `"$res:path/to/resource"`
## Failure Handler
## Final Structural Self-Check
Executes when any step fails. Has access to error details:
Before finalizing a flow, verify:
- `error.message` - Error message
- `error.step_id` - ID of failed step
- `error.name` - Error name
- `error.stack` - Stack trace
- any preprocessor is in `value.preprocessor_module`
- any failure handler is in `value.failure_module`
- any approval step has module-level `suspend`
- no downstream step references inner branch step ids from outside the branch
## S3 Object Operations

View File

@@ -20,6 +20,53 @@ The OpenFlow schema (openflow.openapi.yaml) is the source of truth for flow stru
- `preprocessor` - Reserved for preprocessor module
- `Input` - Reserved for flow input reference
## Hard Structural Rules
These are strict Windmill schema rules. Follow them exactly.
- `value.modules` is only for normal sequential steps
- `value.preprocessor_module` and `value.failure_module` are special top-level fields inside `value`, not entries in `value.modules`
- If a flow needs a preprocessor, create `value.preprocessor_module` with `id: preprocessor`
- If a flow needs a failure handler, create `value.failure_module` with `id: failure`
- Do NOT create regular modules inside `value.modules` named `preprocessor` or `failure`
- `preprocessor_module` and `failure_module` only support `script` or `rawscript`
- `preprocessor_module` runs before normal modules and cannot reference `results.*`
- `failure_module` can use the `error` object with `error.message`, `error.step_id`, `error.name`, and `error.stack`
Correct shape:
```yaml
value:
  preprocessor_module:
    id: preprocessor
    value:
      type: rawscript
      ...
  failure_module:
    id: failure
    value:
      type: rawscript
      ...
  modules:
    - id: process_event
      value:
        type: rawscript
        ...
```
Incorrect shape:
```yaml
value:
  modules:
    - id: preprocessor
      ...
    - id: process_event
      ...
    - id: failure
      ...
```
## Module ID Rules
- Must be unique across the entire flow
@@ -35,10 +82,148 @@ The OpenFlow schema (openflow.openapi.yaml) is the source of truth for flow stru
## Data Flow Between Steps
- `flow_input.property` - Access flow input parameters
- `results.step_id` - Access output from a previous step
- `results.step_id.property` - Access specific property from previous step output
- `flow_input.iter.value` - Current item when inside a for-loop
- `flow_input.iter.index` - Current index when inside a for-loop
- `results.step_id` - Access output from a previous step only when that step result is in scope
- `results.step_id.property` - Access specific property from a previous step output only when that step result is in scope
- `flow_input.iter.value` - Current iteration value when inside a loop (`forloopflow` or `whileloopflow`)
- `flow_input.iter.index` - Current loop index when inside a loop (`forloopflow` or `whileloopflow`)
## Loop Structure Rules
- For `whileloopflow`, use module-level `stop_after_if` on the loop module itself when the loop should stop after an iteration result
- Do NOT put `stop_after_if` inside `value` of a `whileloopflow`
- `stop_after_all_iters_if` is for checks after the whole loop finishes, not the normal per-iteration break condition
- When a `whileloopflow` carries state forward between iterations, use `flow_input.iter.value` as the current loop value and provide an explicit first-iteration fallback when needed
- Use `flow_input.iter.index` only when the loop logic is truly based on the iteration index, not as a replacement for the current loop value
- If the user asks for a final scalar/object after a loop, add a normal step after the loop that extracts the final value from the loop result instead of returning the whole loop result array
Correct `whileloopflow` shape:
```yaml
- id: loop_until_done
  stop_after_if:
    expr: result.done === true
    skip_if_stopped: false
  value:
    type: whileloopflow
    skip_failures: false
    modules:
      - id: advance_state
        value:
          type: rawscript
          input_transforms:
            state:
              type: javascript
              expr: "flow_input.iter && flow_input.iter.value !== undefined ? flow_input.iter.value : flow_input.initial_state"
- id: return_final_state
  value:
    type: rawscript
    input_transforms:
      final_state:
        type: javascript
        expr: results.loop_until_done[results.loop_until_done.length - 1]
```
Incorrect `whileloopflow` patterns:
```yaml
- id: loop_until_done
  value:
    type: whileloopflow
    stop_after_if:
      expr: result.done === true
```
```yaml
input_transforms:
state:
type: javascript
expr: flow_input.iter.index
```
```yaml
input_transforms:
final_state:
type: javascript
expr: results.loop_until_done
```
## Approval / Suspend Structure
- `suspend` belongs on the flow module object itself, as a sibling of `id` and `value`
- Never put `suspend` inside `value`
Correct shape:
```yaml
- id: request_approval
  suspend:
    required_events: 1
    resume_form:
      schema:
        type: object
        properties:
          comment:
            type: string
        required: [comment]
  value:
    type: identity
```
Incorrect shape:
```yaml
- id: request_approval
  value:
    type: rawscript
    suspend:
      required_events: 1
```
## Branch Result Scope Rules
- Inside a branch, you may reference earlier outer steps and earlier steps in the same branch
- Outside a `branchone`, do NOT reference ids of steps that only exist inside its branches or default branch. Use `results.<branchone_module_id>` instead
- Outside a `branchall`, do NOT reference ids of steps inside its branches. Use `results.<branchall_module_id>` instead
- If downstream steps need a stable shape after a branch, make each branch return the same fields
- When needed, add a normalization step immediately after the branch and consume `results.<branch_module_id>` there
Correct after `branchone`:
```yaml
- id: route_order
  value:
    type: branchone
    ...
- id: send_confirmation
  value:
    input_transforms:
      routed:
        type: javascript
        expr: results.route_order
```
Incorrect after `branchone`:
```yaml
expr: results.create_shipment
expr: results.create_backorder
```
Correct after `branchall`:
```yaml
- id: enrich_parallel
  value:
    type: branchall
    parallel: true
    ...
- id: combine_data
  value:
    input_transforms:
      enrichments:
        type: javascript
        expr: results.enrich_parallel
```
## Input Transforms
@@ -55,14 +240,14 @@ JavaScript transform (dynamic expression):
- For flow inputs: Use type `"object"` with format `"resource-{type}"` (e.g., `"resource-postgresql"`)
- For step inputs: Use static value `"$res:path/to/resource"`
## Failure Handler
## Final Structural Self-Check
Executes when any step fails. Has access to error details:
Before finalizing a flow, verify:
- `error.message` - Error message
- `error.step_id` - ID of failed step
- `error.name` - Error name
- `error.stack` - Stack trace
- any preprocessor is in `value.preprocessor_module`
- any failure handler is in `value.failure_module`
- any approval step has module-level `suspend`
- no downstream step references inner branch step ids from outside the branch
## S3 Object Operations

View File

@@ -0,0 +1,38 @@
## Special Modules
- Use `set_preprocessor_module` to add, replace, or remove the top-level `value.preprocessor_module`
- Use `set_failure_module` to add, replace, or remove the top-level `value.failure_module`
- Use `set_flow_json` only when you are replacing the whole flow, including normal modules and optional special modules
**Example - Update only the special modules:**
```javascript
set_preprocessor_module({
module: JSON.stringify({
id: "preprocessor",
value: {
type: "rawscript",
language: "bun",
content: "export async function preprocessor(payload: string) { const trimmed = payload.trim(); if (!trimmed) { throw new Error('payload must not be empty'); } return { payload: trimmed }; }",
input_transforms: {
payload: { type: "javascript", expr: "flow_input.payload" }
}
}
})
})
set_failure_module({
module: JSON.stringify({
id: "failure",
value: {
type: "rawscript",
language: "bun",
content: "export async function main(message: string, name: string, step_id: string) { return { message, name, step_id }; }",
input_transforms: {
message: { type: "javascript", expr: "error.message" },
name: { type: "javascript", expr: "error.name" },
step_id: { type: "javascript", expr: "error.step_id" }
}
}
})
})
```

View File

@@ -1162,6 +1162,7 @@ def main():
script_base = read_markdown_file(base_dir / "script-base.md")
flow_base = read_markdown_file(base_dir / "flow-base.md")
flow_chat_special_modules = read_markdown_file(base_dir / "flow-chat-special-modules.md")
# Read language files
languages = {}
@@ -1212,6 +1213,7 @@ def main():
# Base prompts
'SCRIPT_BASE': script_base,
'FLOW_BASE': flow_base,
'FLOW_CHAT_SPECIAL_MODULES': flow_chat_special_modules,
# SDKs
'SDK_TYPESCRIPT': ts_sdk_md,