#!/usr/bin/env python3
"""
Tool vetting/scrutiny module for CmdForge.

Performs heuristic analysis of tools to assess quality and safety:
- Honesty: Does the tool do what it claims?
- Transparency: Is the prompt clear and understandable?
- Scope: Is the tool appropriately scoped?
- Efficiency: Is the prompt well-structured?
- Safety: Are there any concerning patterns?

Usage:
    # Vet a single tool
    python scripts/scrutiny.py path/to/tool/config.yaml

    # Vet all tools in a directory
    python scripts/scrutiny.py --all ~/.cmdforge/

    # Output as JSON
    python scripts/scrutiny.py --json path/to/tool/config.yaml

    # Use a specific provider for analysis
    python scripts/scrutiny.py --provider claude path/to/tool/config.yaml
"""

import argparse
import json
import sys
from dataclasses import dataclass, field, asdict
from enum import Enum
from pathlib import Path
from typing import Optional

import yaml


class VetResult(Enum):
    """Vetting decision."""
    APPROVE = "approve"  # Auto-approve - meets all criteria
    REVIEW = "review"    # Needs human review - some concerns
    REJECT = "reject"    # Auto-reject - fails criteria
    ERROR = "error"      # Could not vet


@dataclass
class VetScore:
    """Individual score for a vetting criterion."""
    criterion: str
    score: float  # 0.0 to 1.0
    max_score: float = 1.0
    notes: str = ""
    concerns: list[str] = field(default_factory=list)


@dataclass
class VetReport:
    """Complete vetting report for a tool."""
    tool_name: str
    tool_path: str
    result: VetResult
    overall_score: float  # 0.0 to 1.0
    scores: list[VetScore] = field(default_factory=list)
    suggestions: list[str] = field(default_factory=list)
    error: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        d = asdict(self)
        d['result'] = self.result.value
        return d


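# For reference, a VetReport serialized via to_dict() / --json looks roughly like
# the sketch below. The values are illustrative only; the field names follow the
# dataclasses above:
#
#   {
#     "tool_name": "example-tool",
#     "tool_path": "~/.cmdforge/example-tool",
#     "result": "review",
#     "overall_score": 0.62,
#     "scores": [
#       {"criterion": "honesty", "score": 0.6, "max_score": 1.0,
#        "notes": "", "concerns": ["Description may not match actual behavior"]},
#       ...
#     ],
#     "suggestions": ["[honesty] Description may not match actual behavior"],
#     "error": null
#   }
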
# Thresholds for auto-approve/reject
APPROVE_THRESHOLD = 0.8  # Score >= 0.8 -> auto-approve
REJECT_THRESHOLD = 0.3   # Score < 0.3 -> auto-reject


def load_tool_config(path: Path) -> Optional[dict]:
    """Load tool configuration from YAML file."""
    if path.is_dir():
        config_file = path / "config.yaml"
    else:
        config_file = path

    if not config_file.exists():
        return None

    with open(config_file) as f:
        return yaml.safe_load(f)


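# Sketch of the config.yaml shape the checks below read, inferred from the
# config.get(...) calls in this module. The tool name, description, and step
# contents are hypothetical; only the keys (name, description, arguments,
# steps, type, prompt, code, output_var) are assumed here:
#
#   name: summarize-notes
#   description: Summarize meeting notes into action items
#   arguments:
#     - name: file
#   steps:
#     - type: prompt
#       prompt: "Summarize these meeting notes into action items..."
#       output_var: summary
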
def vet_honesty(config: dict) -> VetScore:
    """Check if tool description matches what it actually does."""
    score = VetScore(criterion="honesty", score=0.0, notes="")
    concerns = []

    name = config.get("name", "")
    description = config.get("description", "")
    steps = config.get("steps", [])

    # Check that description exists
    if not description:
        concerns.append("Missing description")
        score.score = 0.3
    else:
        score.score = 0.6

    # Check that steps exist
    if not steps:
        concerns.append("No execution steps defined")
        score.score = min(score.score, 0.2)
    else:
        # Check if description keywords appear in prompts
        desc_words = set(description.lower().split())
        prompt_text = ""
        for step in steps:
            if step.get("type") == "prompt":
                prompt_text += step.get("prompt", "").lower() + " "

        # Simple keyword overlap check
        prompt_words = set(prompt_text.split())
        overlap = desc_words & prompt_words
        meaningful_overlap = overlap - {"the", "a", "an", "and", "or", "is", "to", "for", "of", "in"}

        if len(meaningful_overlap) >= 2:
            score.score = min(1.0, score.score + 0.3)
            score.notes = f"Description matches prompt content ({len(meaningful_overlap)} keywords)"
        else:
            concerns.append("Description may not match actual behavior")

    score.concerns = concerns
    return score


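# Example of the overlap heuristic above (hypothetical tool): a description of
# "Summarize meeting notes into action items" and a prompt containing
# "summarize these meeting notes" share {"summarize", "meeting", "notes"}
# after the stopword filter, so len(meaningful_overlap) >= 2 and honesty
# gains the +0.3 bonus.
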
def vet_transparency(config: dict) -> VetScore:
    """Check if the tool's behavior is clear and understandable."""
    score = VetScore(criterion="transparency", score=0.0, notes="")
    concerns = []

    steps = config.get("steps", [])

    if not steps:
        concerns.append("No steps to analyze")
        score.concerns = concerns
        return score

    # Analyze each step
    total_prompt_length = 0
    has_clear_instructions = False

    for step in steps:
        if step.get("type") == "prompt":
            prompt = step.get("prompt", "")
            total_prompt_length += len(prompt)

            # Check for clear instruction patterns
            instruction_patterns = [
                "you are", "your task", "please", "analyze", "extract",
                "summarize", "create", "write", "explain", "review"
            ]
            prompt_lower = prompt.lower()
            if any(p in prompt_lower for p in instruction_patterns):
                has_clear_instructions = True

    # Score based on findings
    if has_clear_instructions:
        score.score += 0.5
        score.notes = "Contains clear instructions"

    if total_prompt_length > 50:
        score.score += 0.3
        score.notes += "; Substantial prompt content"
    elif total_prompt_length > 0:
        score.score += 0.1
        concerns.append("Very short prompt - may lack clarity")

    # Check for output variable naming
    for step in steps:
        output_var = step.get("output_var", "")
        if output_var and output_var != "response":
            score.score += 0.2
            score.notes += "; Descriptive output variable"
            break

    score.score = min(1.0, score.score)
    score.concerns = concerns
    return score


def vet_scope(config: dict) -> VetScore:
    """Check if tool is appropriately scoped (not too broad/narrow)."""
    score = VetScore(criterion="scope", score=0.0, notes="")
    concerns = []

    description = config.get("description", "")
    steps = config.get("steps", [])
    arguments = config.get("arguments", [])

    # Start with base score
    score.score = 0.5

    # Single-step tools are well-scoped
    if len(steps) == 1:
        score.score += 0.2
        score.notes = "Single-step tool - focused scope"
    elif len(steps) <= 3:
        score.score += 0.1
        score.notes = "Multi-step tool with reasonable complexity"
    else:
        concerns.append(f"Complex tool with {len(steps)} steps - may be over-scoped")
        score.score -= 0.1

    # Check for overly generic descriptions
    generic_terms = ["everything", "anything", "all", "any task", "general purpose"]
    desc_lower = description.lower()
    if any(term in desc_lower for term in generic_terms):
        concerns.append("Description suggests overly broad scope")
        score.score -= 0.2

    # Arguments indicate configurable scope (good)
    if arguments:
        score.score += 0.1
        score.notes += "; Configurable via arguments"

    score.score = max(0.0, min(1.0, score.score))
    score.concerns = concerns
    return score


def vet_efficiency(config: dict) -> VetScore:
    """Check if prompt is well-structured and efficient."""
    score = VetScore(criterion="efficiency", score=0.0, notes="")
    concerns = []

    steps = config.get("steps", [])

    # Analyze prompts
    for step in steps:
        if step.get("type") == "prompt":
            prompt = step.get("prompt", "")

            # Check for excessive repetition
            words = prompt.lower().split()
            word_counts = {}
            for word in words:
                if len(word) > 4:  # Only check meaningful words
                    word_counts[word] = word_counts.get(word, 0) + 1

            max_repetition = max(word_counts.values()) if word_counts else 0
            if max_repetition > 5:
                concerns.append(f"Repetitive language detected ({max_repetition}x)")
                score.score = max(0.0, score.score - 0.2)

            # Check for structured output hints
            structure_patterns = [
                "markdown", "json", "format", "structure", "sections",
                "bullet", "numbered", "list", "table"
            ]
            if any(p in prompt.lower() for p in structure_patterns):
                score.score += 0.3
                score.notes = "Specifies output structure"

            # Reasonable length (not too short, not excessive)
            if 100 <= len(prompt) <= 5000:
                score.score += 0.4
            elif len(prompt) < 100:
                concerns.append("Very short prompt - may lack guidance")
                score.score += 0.2
            else:
                concerns.append("Very long prompt - may be inefficient")
                score.score += 0.2

    # Base score if steps exist
    if steps:
        score.score += 0.3

    score.score = min(1.0, score.score)
    score.concerns = concerns
    return score


def vet_safety(config: dict) -> VetScore:
    """Check for concerning patterns in the tool."""
    score = VetScore(criterion="safety", score=1.0, notes="No safety concerns")
    concerns = []

    steps = config.get("steps", [])

    # Check for code steps
    code_step_count = 0
    for step in steps:
        if step.get("type") == "code":
            code_step_count += 1
            code = step.get("code", "")

            # Check for potentially dangerous patterns
            dangerous_patterns = [
                ("subprocess", "Executes shell commands"),
                ("os.system", "Executes shell commands"),
                ("eval(", "Dynamic code execution"),
                ("exec(", "Dynamic code execution"),
                ("open(", "File operations"),
                ("requests.", "Network requests"),
                ("urllib", "Network requests"),
                ("shutil.rmtree", "Recursive deletion"),
            ]

            for pattern, concern in dangerous_patterns:
                if pattern in code:
                    concerns.append(f"Code contains {concern.lower()}")
                    score.score -= 0.15

    if code_step_count > 0:
        score.notes = f"Contains {code_step_count} code step(s)"
        if not concerns:
            score.notes += " - no dangerous patterns detected"

    score.score = max(0.0, score.score)
    score.concerns = concerns
    return score


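# Worked example of the safety scoring above (hypothetical code step): a single
# code step that calls subprocess and open() matches two patterns, so the score
# drops from 1.0 to 1.0 - 2 * 0.15 = 0.70, with both concerns recorded.
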
def vet_tool(config: dict, tool_path: str) -> VetReport:
    """Perform complete vetting of a tool."""
    name = config.get("name", "unknown")

    # Run all checks
    scores = [
        vet_honesty(config),
        vet_transparency(config),
        vet_scope(config),
        vet_efficiency(config),
        vet_safety(config),
    ]

    # Calculate overall score (weighted average)
    weights = {
        "honesty": 0.25,
        "transparency": 0.20,
        "scope": 0.15,
        "efficiency": 0.15,
        "safety": 0.25,
    }

    total_weight = sum(weights.values())
    weighted_sum = sum(s.score * weights.get(s.criterion, 0.1) for s in scores)
    overall_score = weighted_sum / total_weight

    # Determine result
    if overall_score >= APPROVE_THRESHOLD:
        result = VetResult.APPROVE
    elif overall_score < REJECT_THRESHOLD:
        result = VetResult.REJECT
    else:
        result = VetResult.REVIEW

    # Collect all concerns for suggestions
    suggestions = []
    for s in scores:
        for concern in s.concerns:
            suggestions.append(f"[{s.criterion}] {concern}")

    return VetReport(
        tool_name=name,
        tool_path=tool_path,
        result=result,
        overall_score=overall_score,
        scores=scores,
        suggestions=suggestions,
    )


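# Worked example of the weighting (hypothetical per-criterion scores): with
# honesty 0.9, transparency 0.8, scope 0.7, efficiency 0.6, and safety 1.0,
# the weighted sum is 0.9*0.25 + 0.8*0.20 + 0.7*0.15 + 0.6*0.15 + 1.0*0.25
# = 0.225 + 0.16 + 0.105 + 0.09 + 0.25 = 0.83, which clears APPROVE_THRESHOLD
# (0.8) and yields VetResult.APPROVE.
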
def vet_directory(directory: Path, provider: Optional[str] = None) -> list[VetReport]:
    """Vet all tools in a directory.

    Note: provider is currently unused; it is reserved for future AI-assisted
    analysis (see the --provider flag).
    """
    reports = []

    for entry in directory.iterdir():
        config_file = None
        if entry.is_dir():
            config_file = entry / "config.yaml"
        elif entry.suffix in [".yaml", ".yml"]:
            config_file = entry

        if config_file and config_file.exists():
            config = load_tool_config(config_file)
            if config:
                report = vet_tool(config, str(entry))
                reports.append(report)

    return reports


def print_report(report: VetReport, verbose: bool = False):
    """Print a vetting report to console."""
    # Result emoji
    result_emoji = {
        VetResult.APPROVE: "✅",
        VetResult.REVIEW: "⚠️",
        VetResult.REJECT: "❌",
        VetResult.ERROR: "💥",
    }

    emoji = result_emoji.get(report.result, "❓")
    print(f"\n{emoji} {report.tool_name}: {report.result.value.upper()} (score: {report.overall_score:.2f})")

    if verbose or report.result != VetResult.APPROVE:
        print(f"  Path: {report.tool_path}")

        # Print individual scores
        for score in report.scores:
            bar = "█" * int(score.score * 10) + "░" * (10 - int(score.score * 10))
            print(f"  {score.criterion:12} [{bar}] {score.score:.2f}")
            if score.concerns:
                for concern in score.concerns:
                    print(f"    ⚠ {concern}")

    # Print suggestions
    if report.suggestions and verbose:
        print("  Suggestions:")
        for suggestion in report.suggestions:
            print(f"    • {suggestion}")


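# Illustrative console output for a tool needing review (hypothetical name,
# scores, and concerns; the layout roughly follows the print calls above):
#
#   ⚠️ summarize-notes: REVIEW (score: 0.62)
#     Path: ~/.cmdforge/summarize-notes
#     honesty      [██████░░░░] 0.60
#       ⚠ Description may not match actual behavior
#     transparency [████████░░] 0.80
#     ...
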
def main():
    parser = argparse.ArgumentParser(
        description="Vet CmdForge tools for quality and safety",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    parser.add_argument(
        "path",
        type=Path,
        nargs="?",
        help="Tool config file or directory to vet"
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Vet all tools in a directory"
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output as JSON"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Show detailed output"
    )
    parser.add_argument(
        "--provider",
        default=None,
        help="AI provider for enhanced analysis (future feature)"
    )

    args = parser.parse_args()

    if not args.path:
        parser.error("Please specify a path to vet")

    # Collect reports
    reports = []

    if args.all or args.path.is_dir():
        reports = vet_directory(args.path, args.provider)
    else:
        config = load_tool_config(args.path)
        if not config:
            print(f"Error: Could not load tool config from {args.path}", file=sys.stderr)
            return 1
        report = vet_tool(config, str(args.path))
        reports.append(report)

    if not reports:
        print("No tools found to vet", file=sys.stderr)
        return 1

    # Summarize results (computed before branching so the exit code works for --json too)
    approved = sum(1 for r in reports if r.result == VetResult.APPROVE)
    review = sum(1 for r in reports if r.result == VetResult.REVIEW)
    rejected = sum(1 for r in reports if r.result == VetResult.REJECT)

    # Output
    if args.json:
        output = [r.to_dict() for r in reports]
        print(json.dumps(output, indent=2))
    else:
        print(f"Vetting {len(reports)} tool(s)...")

        for report in reports:
            print_report(report, args.verbose)

        print(f"\n{'─' * 40}")
        print(f"Summary: {approved} approved, {review} need review, {rejected} rejected")

    # Return code based on results
    if rejected > 0:
        return 2
    elif review > 0:
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())