Add interactive step testing to Tool Builder

- New TestStepDialog for testing individual steps from the GUI
- Test button in Tool Builder to test selected step
- Auto-detects variables from step templates
- Multiple assertion types: not_empty, contains, valid_json,
  matches_regex, min/max_length, equals, valid_python
- Background execution with timing metrics
- Provider override option for testing with mock provider
- Output variable display and assertion pass/fail results

Completes M5: Testing & Polish milestone (100%)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
rob 2026-01-17 13:42:20 -04:00
parent a134fb59c3
commit 68ab329c33
2 changed files with 517 additions and 0 deletions

View File

@ -0,0 +1,496 @@
"""Test Step Dialog for interactive step testing."""
import json
import re
import time
from typing import Union
from PySide6.QtWidgets import (
QDialog, QVBoxLayout, QHBoxLayout, QFormLayout, QGroupBox,
QLabel, QLineEdit, QPlainTextEdit, QPushButton, QComboBox,
QTextEdit, QTableWidget, QTableWidgetItem, QHeaderView,
QSplitter, QWidget, QMessageBox
)
from PySide6.QtCore import Qt, QThread, Signal
from PySide6.QtGui import QColor
from ...tool import PromptStep, CodeStep, ToolStep
from ...runner import execute_prompt_step, execute_code_step, execute_tool_step
from ...providers import load_providers
class StepTestWorker(QThread):
"""Background worker for executing step tests."""
finished = Signal(dict) # Emits result dict
def __init__(self, step: Union[PromptStep, CodeStep, ToolStep], variables: dict, provider_override: str = None):
super().__init__()
self.step = step
self.variables = variables
self.provider_override = provider_override
def run(self):
result = {
"success": False,
"output": "",
"output_vars": {},
"error": None,
"elapsed_ms": 0
}
start_time = time.time()
try:
if isinstance(self.step, PromptStep):
output, success = execute_prompt_step(
self.step, self.variables, self.provider_override
)
result["success"] = success
result["output"] = output
result["output_vars"] = {self.step.output_var: output}
if not success:
result["error"] = "Provider call failed"
elif isinstance(self.step, CodeStep):
outputs, success = execute_code_step(
self.step, self.variables, step_num=1
)
result["success"] = success
result["output_vars"] = outputs
result["output"] = "\n".join(f"{k} = {v}" for k, v in outputs.items())
if not success:
result["error"] = "Code execution failed"
elif isinstance(self.step, ToolStep):
output, success = execute_tool_step(
self.step, self.variables,
depth=0,
provider_override=self.provider_override,
dry_run=False,
verbose=False
)
result["success"] = success
result["output"] = output
result["output_vars"] = {self.step.output_var: output}
if not success:
result["error"] = f"Tool '{self.step.tool}' execution failed"
except Exception as e:
result["success"] = False
result["error"] = str(e)
result["elapsed_ms"] = int((time.time() - start_time) * 1000)
self.finished.emit(result)
class TestStepDialog(QDialog):
"""Dialog for interactively testing a single step."""
# Assertion types available
ASSERTION_TYPES = [
("not_empty", "Not Empty", "Output must not be empty"),
("contains", "Contains", "Output must contain the specified text"),
("not_contains", "Does Not Contain", "Output must NOT contain the specified text"),
("equals", "Equals", "Output must exactly equal the expected value"),
("valid_json", "Valid JSON", "Output must be valid JSON"),
("valid_python", "Valid Python", "Output must be valid Python syntax"),
("matches_regex", "Matches Regex", "Output must match the regular expression"),
("min_length", "Min Length", "Output must be at least N characters"),
("max_length", "Max Length", "Output must be at most N characters"),
]
def __init__(self, parent, step: Union[PromptStep, CodeStep, ToolStep], available_vars: list = None):
super().__init__(parent)
self.step = step
self.available_vars = available_vars or ["input"]
self._worker = None
# Determine step type for title
if isinstance(step, PromptStep):
step_type = "Prompt"
elif isinstance(step, CodeStep):
step_type = "Code"
elif isinstance(step, ToolStep):
step_type = f"Tool ({step.tool})"
else:
step_type = "Unknown"
step_name = step.name if step.name else step_type
self.setWindowTitle(f"Test Step: {step_name}")
self.setMinimumSize(800, 700)
self._setup_ui()
self._detect_variables()
def _setup_ui(self):
"""Set up the dialog UI."""
layout = QVBoxLayout(self)
layout.setSpacing(12)
# Main splitter: top (inputs) | bottom (output)
splitter = QSplitter(Qt.Vertical)
# Top section: Variables and Assertions
top_widget = QWidget()
top_layout = QHBoxLayout(top_widget)
top_layout.setContentsMargins(0, 0, 0, 0)
# Left: Variables input
vars_group = QGroupBox("Input Variables")
vars_layout = QVBoxLayout(vars_group)
vars_help = QLabel("Provide test values for variables used in this step:")
vars_help.setStyleSheet("color: #718096; font-size: 11px;")
vars_layout.addWidget(vars_help)
# Variables form
self.vars_form = QFormLayout()
self.vars_form.setSpacing(8)
self.var_inputs = {} # variable name -> QLineEdit or QPlainTextEdit
vars_layout.addLayout(self.vars_form)
vars_layout.addStretch()
top_layout.addWidget(vars_group, 1)
# Right: Assertions
assert_group = QGroupBox("Assertions (Optional)")
assert_layout = QVBoxLayout(assert_group)
assert_help = QLabel("Define checks to validate the step output:")
assert_help.setStyleSheet("color: #718096; font-size: 11px;")
assert_layout.addWidget(assert_help)
# Assertions table
self.assertions_table = QTableWidget(0, 3)
self.assertions_table.setHorizontalHeaderLabels(["Type", "Value", ""])
self.assertions_table.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeToContents)
self.assertions_table.horizontalHeader().setSectionResizeMode(1, QHeaderView.Stretch)
self.assertions_table.horizontalHeader().setSectionResizeMode(2, QHeaderView.ResizeToContents)
self.assertions_table.verticalHeader().setVisible(False)
self.assertions_table.setMaximumHeight(150)
assert_layout.addWidget(self.assertions_table)
# Add assertion button
btn_add_assertion = QPushButton("+ Add Assertion")
btn_add_assertion.clicked.connect(self._add_assertion_row)
assert_layout.addWidget(btn_add_assertion)
top_layout.addWidget(assert_group, 1)
splitter.addWidget(top_widget)
# Bottom section: Controls and Output
bottom_widget = QWidget()
bottom_layout = QVBoxLayout(bottom_widget)
bottom_layout.setContentsMargins(0, 0, 0, 0)
# Controls row
controls_layout = QHBoxLayout()
# Provider override (for prompt and tool steps)
if isinstance(self.step, (PromptStep, ToolStep)):
controls_layout.addWidget(QLabel("Provider:"))
self.provider_combo = QComboBox()
self.provider_combo.addItem("(use step's default)")
providers = load_providers()
for provider in sorted(providers, key=lambda p: p.name):
self.provider_combo.addItem(provider.name)
# Add common defaults
for default in ["mock"]:
if self.provider_combo.findText(default) < 0:
self.provider_combo.addItem(default)
self.provider_combo.setMinimumWidth(150)
controls_layout.addWidget(self.provider_combo)
else:
self.provider_combo = None
controls_layout.addStretch()
# Run button
self.btn_run = QPushButton("Run Step")
self.btn_run.setMinimumHeight(36)
self.btn_run.setMinimumWidth(120)
self.btn_run.clicked.connect(self._run_test)
controls_layout.addWidget(self.btn_run)
bottom_layout.addLayout(controls_layout)
# Output section
output_group = QGroupBox("Output")
output_layout = QVBoxLayout(output_group)
# Status line
self.status_label = QLabel("Click 'Run Step' to test this step")
self.status_label.setStyleSheet("color: #718096;")
output_layout.addWidget(self.status_label)
# Output display
self.output_display = QTextEdit()
self.output_display.setReadOnly(True)
self.output_display.setPlaceholderText("Step output will appear here...")
font = self.output_display.font()
font.setFamily("Consolas, Monaco, monospace")
self.output_display.setFont(font)
output_layout.addWidget(self.output_display)
# Assertion results
self.assertion_results = QLabel("")
self.assertion_results.setWordWrap(True)
output_layout.addWidget(self.assertion_results)
bottom_layout.addWidget(output_group)
splitter.addWidget(bottom_widget)
splitter.setSizes([300, 400])
layout.addWidget(splitter)
# Dialog buttons
buttons_layout = QHBoxLayout()
buttons_layout.addStretch()
btn_close = QPushButton("Close")
btn_close.clicked.connect(self.accept)
buttons_layout.addWidget(btn_close)
layout.addLayout(buttons_layout)
def _detect_variables(self):
"""Detect variables used in the step and create input fields."""
# Get template text based on step type
template = ""
if isinstance(self.step, PromptStep):
template = self.step.prompt
elif isinstance(self.step, CodeStep):
template = self.step.code
elif isinstance(self.step, ToolStep):
template = self.step.input_template
# Also add args values
for value in self.step.args.values():
template += " " + str(value)
# Find all {variable} references (excluding escaped {{ }})
# Simple regex: match {word} but not {{ or }}
var_pattern = r'\{([a-zA-Z_][a-zA-Z0-9_]*)\}'
found_vars = set(re.findall(var_pattern, template))
# Combine with available_vars (from previous steps)
all_vars = sorted(set(self.available_vars) | found_vars)
# Create input fields for each variable
for var_name in all_vars:
if var_name == "input":
# Use multiline for input
widget = QPlainTextEdit()
widget.setPlaceholderText("Enter test input text...")
widget.setMaximumHeight(80)
else:
widget = QLineEdit()
widget.setPlaceholderText(f"Value for {{{var_name}}}")
self.var_inputs[var_name] = widget
self.vars_form.addRow(f"{{{var_name}}}:", widget)
def _add_assertion_row(self):
"""Add a new assertion row to the table."""
row = self.assertions_table.rowCount()
self.assertions_table.insertRow(row)
# Type dropdown
type_combo = QComboBox()
for type_id, display_name, tooltip in self.ASSERTION_TYPES:
type_combo.addItem(display_name, type_id)
idx = type_combo.count() - 1
type_combo.setItemData(idx, tooltip, Qt.ToolTipRole)
self.assertions_table.setCellWidget(row, 0, type_combo)
# Value input
value_edit = QLineEdit()
value_edit.setPlaceholderText("Expected value (if applicable)")
self.assertions_table.setCellWidget(row, 1, value_edit)
# Remove button
btn_remove = QPushButton("×")
btn_remove.setFixedWidth(30)
btn_remove.clicked.connect(lambda: self._remove_assertion_row(row))
self.assertions_table.setCellWidget(row, 2, btn_remove)
def _remove_assertion_row(self, row: int):
"""Remove an assertion row."""
self.assertions_table.removeRow(row)
# Update remove button connections for remaining rows
for i in range(self.assertions_table.rowCount()):
btn = self.assertions_table.cellWidget(i, 2)
if btn:
btn.clicked.disconnect()
btn.clicked.connect(lambda checked=False, r=i: self._remove_assertion_row(r))
def _get_assertions(self) -> list:
"""Get list of assertions from the table."""
assertions = []
for row in range(self.assertions_table.rowCount()):
type_combo = self.assertions_table.cellWidget(row, 0)
value_edit = self.assertions_table.cellWidget(row, 1)
if type_combo:
assertions.append({
"type": type_combo.currentData(),
"display": type_combo.currentText(),
"value": value_edit.text() if value_edit else ""
})
return assertions
def _run_test(self):
"""Run the step test."""
# Collect variable values
variables = {}
for var_name, widget in self.var_inputs.items():
if isinstance(widget, QPlainTextEdit):
variables[var_name] = widget.toPlainText()
else:
variables[var_name] = widget.text()
# Get provider override
provider_override = None
if self.provider_combo and self.provider_combo.currentIndex() > 0:
provider_override = self.provider_combo.currentText()
# Disable run button and show loading
self.btn_run.setEnabled(False)
self.btn_run.setText("Running...")
self.status_label.setText("Executing step...")
self.status_label.setStyleSheet("color: #718096;")
self.output_display.clear()
self.assertion_results.clear()
# Start worker thread
self._worker = StepTestWorker(self.step, variables, provider_override)
self._worker.finished.connect(self._on_test_finished)
self._worker.start()
def _on_test_finished(self, result: dict):
"""Handle test completion."""
self.btn_run.setEnabled(True)
self.btn_run.setText("Run Step")
# Display result
if result["success"]:
self.status_label.setText(f"✓ Step completed in {result['elapsed_ms']}ms")
self.status_label.setStyleSheet("color: #38a169; font-weight: bold;")
# Show output
output_text = result["output"]
if result["output_vars"]:
output_text += "\n\n--- Output Variables ---\n"
for var, value in result["output_vars"].items():
preview = value[:200] + "..." if len(value) > 200 else value
output_text += f"{var} = {preview}\n"
self.output_display.setPlainText(output_text)
else:
self.status_label.setText(f"✗ Step failed ({result['elapsed_ms']}ms)")
self.status_label.setStyleSheet("color: #e53e3e; font-weight: bold;")
error_text = result.get("error", "Unknown error")
self.output_display.setHtml(f"<span style='color: #e53e3e;'><b>Error:</b> {error_text}</span>")
# Run assertions
assertions = self._get_assertions()
if assertions and result["success"]:
self._run_assertions(result["output"], assertions)
def _run_assertions(self, output: str, assertions: list):
"""Run assertions against the output."""
results = []
all_passed = True
for assertion in assertions:
a_type = assertion["type"]
a_value = assertion["value"]
a_display = assertion["display"]
passed = False
message = ""
try:
if a_type == "not_empty":
passed = bool(output.strip())
message = "Output is not empty" if passed else "Output is empty"
elif a_type == "contains":
passed = a_value in output
message = f"Output contains '{a_value}'" if passed else f"Output does not contain '{a_value}'"
elif a_type == "not_contains":
passed = a_value not in output
message = f"Output does not contain '{a_value}'" if passed else f"Output contains '{a_value}'"
elif a_type == "equals":
passed = output.strip() == a_value.strip()
message = "Output equals expected" if passed else "Output does not equal expected"
elif a_type == "valid_json":
try:
json.loads(output)
passed = True
message = "Output is valid JSON"
except json.JSONDecodeError as e:
passed = False
message = f"Invalid JSON: {e}"
elif a_type == "valid_python":
try:
import ast
ast.parse(output)
passed = True
message = "Output is valid Python"
except SyntaxError as e:
passed = False
message = f"Invalid Python: {e}"
elif a_type == "matches_regex":
try:
passed = bool(re.search(a_value, output))
message = f"Output matches regex" if passed else f"Output does not match regex"
except re.error as e:
passed = False
message = f"Invalid regex: {e}"
elif a_type == "min_length":
try:
min_len = int(a_value)
passed = len(output) >= min_len
message = f"Length {len(output)} >= {min_len}" if passed else f"Length {len(output)} < {min_len}"
except ValueError:
passed = False
message = "Invalid minimum length value"
elif a_type == "max_length":
try:
max_len = int(a_value)
passed = len(output) <= max_len
message = f"Length {len(output)} <= {max_len}" if passed else f"Length {len(output)} > {max_len}"
except ValueError:
passed = False
message = "Invalid maximum length value"
except Exception as e:
passed = False
message = f"Error: {e}"
if not passed:
all_passed = False
results.append((a_display, passed, message))
# Display results
result_html = "<b>Assertion Results:</b><br>"
for display, passed, message in results:
icon = "" if passed else ""
color = "#38a169" if passed else "#e53e3e"
result_html += f"<span style='color: {color};'>{icon} {display}: {message}</span><br>"
if all_passed:
result_html = f"<span style='color: #38a169; font-weight: bold;'>All {len(results)} assertion(s) passed!</span><br>" + result_html
else:
failed_count = sum(1 for _, p, _ in results if not p)
result_html = f"<span style='color: #e53e3e; font-weight: bold;'>{failed_count} of {len(results)} assertion(s) failed</span><br>" + result_html
self.assertion_results.setText(result_html)

View File

@ -233,6 +233,12 @@ class ToolBuilderPage(QWidget):
self.btn_edit_step.clicked.connect(self._edit_step) self.btn_edit_step.clicked.connect(self._edit_step)
steps_btns.addWidget(self.btn_edit_step) steps_btns.addWidget(self.btn_edit_step)
self.btn_test_step = QPushButton("Test")
self.btn_test_step.setObjectName("secondary")
self.btn_test_step.setToolTip("Test the selected step with custom input")
self.btn_test_step.clicked.connect(self._test_step)
steps_btns.addWidget(self.btn_test_step)
self.btn_del_step = QPushButton("Delete") self.btn_del_step = QPushButton("Delete")
self.btn_del_step.setObjectName("danger") self.btn_del_step.setObjectName("danger")
self.btn_del_step.setToolTip("Delete the selected step") self.btn_del_step.setToolTip("Delete the selected step")
@ -691,6 +697,21 @@ class ToolBuilderPage(QWidget):
self._add_tool_dependency(new_step.tool) self._add_tool_dependency(new_step.tool)
self._refresh_steps() self._refresh_steps()
def _test_step(self):
"""Test the selected step with custom input."""
items = self.steps_list.selectedItems()
if not items:
QMessageBox.information(self, "Test Step", "Please select a step to test")
return
step = items[0].data(Qt.UserRole)
idx = self.steps_list.row(items[0])
available_vars = self._get_available_vars(up_to_step=idx)
from ..dialogs.test_step_dialog import TestStepDialog
dialog = TestStepDialog(self, step, available_vars=available_vars)
dialog.exec()
def _delete_step(self): def _delete_step(self):
"""Delete selected step.""" """Delete selected step."""
items = self.steps_list.selectedItems() items = self.steps_list.selectedItems()