Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 140 additions & 4 deletions strix/core/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import asyncio
import contextlib
import json
import logging
import uuid
from collections.abc import Callable
Expand Down Expand Up @@ -36,6 +37,7 @@
StreamEventSink = Callable[[str, Any], None]

_INPUT_REJECTION_CODES = frozenset({400, 404, 422})
_TOOL_ARGUMENT_KEYS = frozenset({"action_input", "arguments", "input", "parameters", "params"})


async def run_agent_loop(
Expand All @@ -62,7 +64,7 @@ async def run_agent_loop(

if not (start_parked and interactive):
if interactive:
result = await _run_cycle(
result = await _run_interactive_until_tool_continuation_settled(
agent,
coordinator,
agent_id,
Expand All @@ -71,7 +73,6 @@ async def run_agent_loop(
context=context,
max_turns=max_turns,
session=session,
interactive=interactive,
event_sink=event_sink,
hooks=hooks,
)
Expand Down Expand Up @@ -103,7 +104,7 @@ async def run_agent_loop(
raise BudgetExceededError("scan budget reached")

await coordinator.consume_pending(agent_id)
result = await _run_cycle(
result = await _run_interactive_until_tool_continuation_settled(
agent,
coordinator,
agent_id,
Expand All @@ -112,12 +113,73 @@ async def run_agent_loop(
context=context,
max_turns=max_turns,
session=session,
interactive=interactive,
event_sink=event_sink,
hooks=hooks,
)


async def _run_interactive_until_tool_continuation_settled(
agent: Any,
coordinator: AgentCoordinator,
agent_id: str,
*,
input_data: Any,
run_config: RunConfig,
context: dict[str, Any],
max_turns: int,
session: Session | None,
event_sink: StreamEventSink | None,
hooks: RunHooks[dict[str, Any]] | None,
) -> RunResultBase | None:
"""Retry interactive turns when a model prints tool-call JSON as final text."""
result: RunResultBase | None = None
invalid_final_outputs = 0
invalid_final_output_limit = max(1, max_turns)

while True:
if coordinator.budget_stopped:
await coordinator.set_status(agent_id, "stopped")
raise BudgetExceededError("scan budget reached")

result = await _run_cycle(
agent,
coordinator,
agent_id,
input_data=input_data,
run_config=run_config,
context=context,
max_turns=max_turns,
session=session,
interactive=True,
event_sink=event_sink,
hooks=hooks,
)

status = await _agent_status(coordinator, agent_id)
if status != "waiting" or not _looks_like_unexecuted_tool_call(result):
return result

invalid_final_outputs += 1
logger.warning(
"agent %s produced tool-call-shaped final output in interactive mode; "
"forcing tool continuation (%d/%d): %s",
agent_id,
invalid_final_outputs,
invalid_final_output_limit,
_final_output_preview(result),
)

if invalid_final_outputs >= invalid_final_output_limit:
return result

input_data = await _append_interactive_tool_required_message(
session=session,
context=context,
attempt=invalid_final_outputs,
limit=invalid_final_output_limit,
)


async def spawn_child_agent(
*,
coordinator: AgentCoordinator,
Expand Down Expand Up @@ -468,6 +530,72 @@ def _final_output_preview(result: RunResultBase | None) -> str:
return text[:300]


def _looks_like_unexecuted_tool_call(result: RunResultBase | None) -> bool:
final_output = getattr(result, "final_output", None)
if final_output is None:
return False
if isinstance(final_output, str):
parsed = _parse_json_final_output(final_output)
return parsed is not None and _is_tool_call_payload(parsed)
return _is_tool_call_payload(final_output)


def _parse_json_final_output(text: str) -> Any | None:
stripped = text.strip()
if not stripped:
return None
if stripped.startswith("```"):
lines = stripped.splitlines()
if len(lines) >= 2 and lines[-1].strip() == "```":
stripped = "\n".join(lines[1:-1]).strip()
try:
return json.loads(stripped)
except (TypeError, ValueError):
return None


def _is_tool_call_payload(payload: Any) -> bool:
if isinstance(payload, list):
return any(_is_tool_call_payload(item) for item in payload)
if not isinstance(payload, dict):
return False

tool_calls = payload.get("tool_calls")
if isinstance(tool_calls, list) and any(_is_tool_call_payload(item) for item in tool_calls):
return True

function = payload.get("function")
if isinstance(function, dict) and _is_tool_call_payload(function):
return True

tool_name = payload.get("action") or payload.get("tool") or payload.get("name")
return (
isinstance(tool_name, str)
and bool(tool_name.strip())
and any(key in payload for key in _TOOL_ARGUMENT_KEYS)
)
Comment on lines +571 to +576

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Generic JSON Triggers Retry

When an interactive answer is valid plain JSON such as {"action":"report","params":{...}} or {"name":"example","input":"..."}, this predicate treats it as an unexecuted tool call because the keys overlap the heuristic. The run then appends corrective tool-use prompts until the retry limit before parking, so a normal answer can consume extra turns and mutate the session just because its schema resembles a tool call.

Prompt To Fix With AI
This is a comment left during a code review.
Path: strix/core/execution.py
Line: 571-576

Comment:
**Generic JSON Triggers Retry**

When an interactive answer is valid plain JSON such as `{"action":"report","params":{...}}` or `{"name":"example","input":"..."}`, this predicate treats it as an unexecuted tool call because the keys overlap the heuristic. The run then appends corrective tool-use prompts until the retry limit before parking, so a normal answer can consume extra turns and mutate the session just because its schema resembles a tool call.

How can I resolve this? If you propose a fix, please make it concise.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair concern. Two bounds keep this from mutating a normal session much: the predicate only runs when the cycle produced no executed tool call and no lifecycle settle, and retries are capped by the same invalid_final_output_limit the non-interactive path uses - after the cap it falls back to the existing park-and-wait, so worst case for a JSON-shaped plain answer is a couple of extra turns. Validating the extracted name against registered tools would be tighter, but the sandbox-bound tools are attached per-run in the agents factory and aren't visible from module scope here, so that check would need a larger refactor than this fix wants to carry. Happy to tighten the key heuristic if maintainers prefer.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The explanation is fair and the guard conditions do meaningfully reduce the blast radius. That said, the original concern still partially holds and there's a concrete way to tighten it without needing the per-run tool registry.

The "name" key in _is_tool_call_payload is the main culprit. It's very generic — {"name": "Alice", "input": "hello"}, {"name": "report", "parameters": {...}}, {"name": "success", "arguments": []} all match. The stricter keys (tool_calls, function) have structural nesting that makes false positives much less likely, and "action" + "tool" are more domain-specific. Dropping "name" from the tool-name candidates would cut the false-positive rate significantly at essentially no cost to the actual recovery path, since LiteLLM/Ollama plain-text tool blobs use action or tool, not name.

Suggested change
tool_name = payload.get("action") or payload.get("tool") or payload.get("name")
return (
isinstance(tool_name, str)
and bool(tool_name.strip())
and any(key in payload for key in _TOOL_ARGUMENT_KEYS)
)
tool_name = payload.get("action") or payload.get("tool")
return (
isinstance(tool_name, str)
and bool(tool_name.strip())
and any(key in payload for key in _TOOL_ARGUMENT_KEYS)
)

If name needs to stay for some backend that emits it, consider requiring type == "function" alongside it (matching the OpenAI tool-call schema), which filters out almost all natural-language JSON:

    name_key = payload.get("action") or payload.get("tool") or (
        payload.get("name") if payload.get("type") == "function" else None
    )

The bounding arguments are reasonable — the concern is more about the quality of the in-session experience (spurious corrective prompts visible in the transcript) than catastrophic mutation. Either of those tightenings would address that without touching the architecture.

Tip: You can customize Greptile's behavior for this repo with .greptile/rules.md and .greptile/config.json.



async def _append_interactive_tool_required_message(
*,
session: Session | None,
context: dict[str, Any],
attempt: int,
limit: int,
) -> list[dict[str, str]]:
finish_tool = "finish_scan" if context.get("parent_id") is None else "agent_finish"
message = (
"Your previous response looked like a tool call, but it was returned as plain text "
"instead of being executed. Plain-text tool-call JSON is not executed by Strix. "
"Continue immediately and call exactly one tool. "
f"If your work is complete, call {finish_tool}. "
"If you are blocked waiting for another agent, call wait_for_message. "
"Otherwise use the appropriate execution or planning tool. "
f"This is recovery attempt {attempt}/{limit}."
)
return await _append_tool_required_message(session=session, message=message)


async def _append_noninteractive_tool_required_message(
*,
session: Session | None,
Expand All @@ -485,6 +613,14 @@ async def _append_noninteractive_tool_required_message(
"Otherwise use the appropriate execution or planning tool. "
f"This is recovery attempt {attempt}/{limit}."
)
return await _append_tool_required_message(session=session, message=message)


async def _append_tool_required_message(
*,
session: Session | None,
message: str,
) -> list[dict[str, str]]:
item = {"role": "user", "content": message}
if session is None:
return [item]
Expand Down
Loading