Add reference-photo input: "build something like this"

Attach a photo (📎 button, drag-drop, paste, or an image URL) and the driver
hands it to claude -p, which reads the image (its Read tool sees images) and
emits the usual tool-call JSON to build a simplified, buildable interpretation
in dimensional lumber — no API keys, same claude -p pipe.

- driver: interpret(image_path=) prepends a reference-photo directive with the
  image's absolute path; find_image_url() + fetch_image() download a linked
  image to a temp file; woodshop-talk --image (path or URL) for CLI/voice.
- controller.run_command(image_path=) passthrough.
- command bar: 📎 attach (file picker), drag-drop image, Ctrl+V paste image,
  and image-URL-in-text detection; downloads run off the UI thread; an image
  chip shows/clears the attachment.
- tests: URL detection, image directive in prompt, fetch_image temp write,
  controller passthrough, command-bar attach + default-text smoke. 216 pass.

Honest limit: the live image round-trip needs a real display/model call to
verify — wired + unit-tested, please confirm it sees the photo on your machine.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
rob 2026-05-30 22:20:57 -03:00
parent b9b0871ac3
commit c623ad2576
7 changed files with 252 additions and 18 deletions

View File

@ -8,6 +8,11 @@ Talk to it like the Star Trek holodeck and watch furniture build itself:
> *"Build a coffee table: a four foot by two foot frame from 2x4s, with four legs 18 inches tall standing at the corners."*
You can also **attach a reference photo** (📎, drag-drop, paste, or an image URL)
and say *"build something like this"* — WoodShop hands the image to the model and
builds a simplified, buildable interpretation in dimensional lumber that you then
refine by voice/text. (It's an interpretation, not a measured replica.)
Each board is real dimensional lumber (a 2x4 is modeled at its true 1.5″ × 3.5″),
so the result is buildable — export to **STEP** (CAD/CNC) or **STL** (3D print),
and get a **cut list with board-feet and a shopping estimate**.

View File

@ -17,15 +17,53 @@ from __future__ import annotations
import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import urllib.request
TOOL_FILTER = "wood-*" # auto-discover every wood-* tool, no hardcoded list
REASON_PROVIDER = "claude -p" # chosen for reliable structured tool-calling
_MAX_HISTORY = 6 # turns of recent conversation fed back for reference-resolution
# A reference photo can be attached to "build something like this". claude -p
# reads the image file (via its Read tool), so we just hand it an absolute path.
#
# _IMG_URL matches an http(s) URL whose path ends in a known image extension.
# The optional "?query" tail is kept so signed/CDN links (…/chair.jpg?sig=…)
# stay fetchable instead of being cut off at the extension.
_IMG_URL = re.compile(
    r'https?://\S+?\.(?:png|jpe?g|webp|gif|bmp)\b(?:\?\S*)?', re.I)
# Preamble prepended to the prompt when a photo is attached; {path} is filled
# in with the image's path by the caller.
_IMAGE_DIRECTIVE = (
    "A REFERENCE PHOTO of furniture is saved at this path:\n {path}\n"
    "Open and look at that image file. The user wants to build something LIKE it "
    "from dimensional lumber and plywood. Infer the major parts, rough proportions, "
    "and joinery, and emit the tool calls to build a SIMPLIFIED, buildable version "
    "with reasonable real dimensions in inches. This is an interpretation, not an "
    "exact replica — prefer standard stock sizes and right angles.\n\n"
)


def find_image_url(text: str) -> str | None:
    """Return the first image URL found in *text*, or None if there is none.

    A URL counts as an image link when it is http(s) and its path ends in
    png/jpg/jpeg/webp/gif/bmp (case-insensitive). A query string directly
    after the extension is returned as part of the URL. Sentence punctuation
    that trails the link ("…?w=2," or "…chair.gif?") is trimmed off.
    ``None`` or empty input yields None.
    """
    match = _IMG_URL.search(text or "")
    if match is None:
        return None
    # Only a captured query can end in punctuation (the bare match ends in the
    # extension letters), so this strip is a no-op for plain links.
    return match.group(0).rstrip(".,;:!?)\"'")
def fetch_image(url: str, timeout: int = 20) -> str:
    """Download an image URL to a temp file and return the file's path.

    The file extension is taken from the response's Content-Type when it is a
    known image type, otherwise from the URL, otherwise ".img". The caller
    owns the returned temp file.

    Raises on failure (caller decides how to surface it): ValueError for a
    non-http(s) URL or an empty response body, and whatever urllib / the
    filesystem raises for network or write errors.
    """
    # urlopen also understands file:, ftp: and data: URLs; a "download a linked
    # image" helper should only ever go out over http(s).
    if not url.lower().startswith(("http://", "https://")):
        raise ValueError(f"unsupported image URL (expected http/https): {url!r}")

    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        ctype = (resp.headers.get("Content-Type") or "").split(";")[0].strip().lower()
        data = resp.read()
    if not data:
        # An empty file is useless as a reference photo; fail loudly instead.
        raise ValueError(f"image URL returned no data: {url!r}")

    ext = {"image/png": ".png", "image/jpeg": ".jpg", "image/webp": ".webp",
           "image/gif": ".gif", "image/bmp": ".bmp"}.get(ctype)
    if ext is None:
        # Unknown/missing Content-Type: fall back to the extension in the URL.
        m = re.search(r'\.(png|jpe?g|webp|gif|bmp)\b', url, re.I)
        ext = "." + m.group(1).lower() if m else ".img"

    fd, path = tempfile.mkstemp(suffix=ext, prefix="woodshop-ref-")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
    except BaseException:
        # Don't leave a partial temp file behind when the write fails.
        try:
            os.remove(path)
        except OSError:
            pass
        raise
    return path
# A board placed earlier in the SAME utterance is referenced as $1, $2, ...
_SYMBOL = re.compile(r"\$(\d+)")
@ -143,10 +181,13 @@ def _render_history(history: list[tuple[str, str]] | None) -> str:
def interpret(utterance: str, schemas: str, scene_text: str | None = None,
history: list[tuple[str, str]] | None = None) -> list[dict]:
history: list[tuple[str, str]] | None = None,
image_path: str | None = None) -> list[dict]:
scene = scene_text if scene_text is not None else scene_summary()
prompt = SYSTEM.format(schemas=schemas, scene=scene, utterance=utterance,
history=_render_history(history))
if image_path:
prompt = _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path)) + prompt
raw = _run(REASON_PROVIDER.split(), stdin=prompt)
calls = _extract_calls(raw)
if calls is None:
@ -242,8 +283,9 @@ def summarize(calls: list[dict], messages: list[str]) -> str:
def handle(utterance: str, schemas: str, voice: bool, verbose: bool,
history: list[tuple[str, str]] | None = None) -> None:
calls = interpret(utterance, schemas, history=history)
history: list[tuple[str, str]] | None = None,
image_path: str | None = None) -> None:
calls = interpret(utterance, schemas, history=history, image_path=image_path)
messages = dispatch(calls, verbose=verbose)
full = " ".join(m for m in messages if m).strip()
spoken = summarize(calls, messages)
@ -271,9 +313,14 @@ def main(argv: list[str] | None = None) -> int:
ap.add_argument("--voice", action="store_true", help="Listen on the mic instead of typing")
ap.add_argument("--duration", type=int, default=6, help="Mic recording seconds (--voice)")
ap.add_argument("--once", help="Run a single command (non-interactive) and exit")
ap.add_argument("--image", help="Reference photo (path or URL) for 'build something like this'")
ap.add_argument("--quiet", action="store_true", help="Don't print per-call detail")
args = ap.parse_args(argv)
image_path = None
if args.image:
image_path = fetch_image(args.image) if args.image.startswith("http") else args.image
schemas = load_schemas()
if not schemas:
print("Could not load wood-* tool schemas (is CmdForge/pa-load-tools available?)",
@ -281,7 +328,8 @@ def main(argv: list[str] | None = None) -> int:
return 1
if args.once is not None:
handle(args.once, schemas, voice=args.voice, verbose=not args.quiet)
handle(args.once, schemas, voice=args.voice, verbose=not args.quiet,
image_path=image_path)
return 0
print("WoodShop ready. Say things like 'place a 6 foot 2x4'. Ctrl-C to quit.")
@ -295,7 +343,8 @@ def main(argv: list[str] | None = None) -> int:
return 0
try:
handle(utterance, schemas, voice=args.voice, verbose=not args.quiet,
history=history)
history=history, image_path=image_path)
image_path = None # the reference photo applies to the first turn only
except Exception as exc: # never let one bad command kill the session
print(f"WoodShop: sorry, that command failed ({exc}).")

View File

@ -1,17 +1,23 @@
"""Command bar: type a command or push-to-talk, see the transcript, optionally
hear the reply. Slow work (LLM, dictate, TTS) runs off the UI thread."""
hear the reply. You can also attach a reference photo ("build something like
this") by the 📎 button, drag-drop, paste, or a pasted image URL. Slow work
(LLM, dictate, TTS, image download) runs off the UI thread."""
from __future__ import annotations
import os
import subprocess
from PySide6.QtCore import Qt, QThreadPool
from PySide6.QtWidgets import (QCheckBox, QHBoxLayout, QLabel, QLineEdit,
QPushButton, QTextEdit, QVBoxLayout, QWidget)
from PySide6.QtGui import QKeySequence
from PySide6.QtWidgets import (QApplication, QCheckBox, QFileDialog, QHBoxLayout, QLabel,
QLineEdit, QPushButton, QTextEdit, QVBoxLayout, QWidget)
from .. import driver
from .controller import Controller
from .workers import run_async
_WHO_COLOR = {"you": "#9cdcfe", "ws": "#c8965a", "sys": "#e06c75"}
_IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp")
class CommandBar(QWidget):
@ -19,6 +25,8 @@ class CommandBar(QWidget):
super().__init__(parent)
self.c = controller
self.pool = pool
self._pending_image: str | None = None # attached reference photo path
self.setAcceptDrops(True) # drop an image onto the bar
root = QVBoxLayout(self)
self.transcript = QTextEdit(readOnly=True)
@ -32,6 +40,12 @@ class CommandBar(QWidget):
self.mic.clicked.connect(self._listen)
row.addWidget(self.mic)
self.attach = QPushButton("📎")
self.attach.setToolTip("Attach a reference photo — then say 'build something like this'")
self.attach.setFixedWidth(40)
self.attach.clicked.connect(self._attach_image)
row.addWidget(self.attach)
self.input = QLineEdit()
self.input.setPlaceholderText("Type a command, e.g. 'build a coffee table' — Enter to send")
self.input.returnPressed.connect(self._send)
@ -45,6 +59,9 @@ class CommandBar(QWidget):
bottom = QHBoxLayout()
self.speak = QCheckBox("Speak replies")
bottom.addWidget(self.speak)
self.image_chip = QLabel("") # shows the attached photo name
self.image_chip.setStyleSheet("color:#c8965a")
bottom.addWidget(self.image_chip)
bottom.addStretch()
self.status = QLabel("")
bottom.addWidget(self.status)
@ -52,6 +69,69 @@ class CommandBar(QWidget):
self.c.logged.connect(self._log)
# ----- reference image ---------------------------------------------
def _set_image(self, path: str | None) -> None:
    """Record *path* as the pending reference photo and refresh the chip.

    A falsy *path* (None or "") clears the chip text and tooltip; otherwise
    the chip shows the file's base name.
    """
    self._pending_image = path
    if not path:
        self.image_chip.setText("")
        self.image_chip.setToolTip("")
        return
    label = f"📎 {os.path.basename(path)}"
    self.image_chip.setText(label)
    self.image_chip.setToolTip("Click to remove the attached photo")
def mousePressEvent(self, e):
    """Clear the attachment when the click lands on the chip, then defer to
    the base-class handler."""
    if self._pending_image:
        chip_area = self.image_chip.geometry()
        if chip_area.contains(e.pos()):
            self._set_image(None)
    super().mousePressEvent(e)
def _attach_image(self) -> None:
    """Open a file picker for a reference photo and attach the chosen file.

    Does nothing when the dialog returns no path. If the input line is blank
    after attaching, it is pre-filled with the default request text.
    """
    chosen = QFileDialog.getOpenFileName(
        self, "Attach reference photo", "",
        "Images (*.png *.jpg *.jpeg *.webp *.gif *.bmp)")[0]
    if not chosen:
        return
    self._set_image(chosen)
    if self.input.text().strip():
        return  # the user already typed something; leave it alone
    self.input.setText("build something like this")
def dragEnterEvent(self, e):
    """Accept the drag when it carries image data or at least one URL whose
    local file name has an image extension."""
    payload = e.mimeData()
    if payload.hasImage():
        e.acceptProposedAction()
        return
    for url in payload.urls():
        if url.toLocalFile().lower().endswith(_IMAGE_EXTS):
            e.acceptProposedAction()
            return
def dropEvent(self, e):
    """Attach the dropped image.

    The first dropped URL whose local file name has an image extension wins.
    If there is none, raw image data in the drop (when present) is saved via
    _save_clipboard_image. When an image ends up attached and the input line
    is blank, the default request text is filled in.
    """
    payload = e.mimeData()
    local_paths = (url.toLocalFile() for url in payload.urls())
    picked = next(
        (p for p in local_paths if p.lower().endswith(_IMAGE_EXTS)), None)
    if picked is not None:
        self._set_image(picked)
    elif payload.hasImage():
        self._save_clipboard_image(payload.imageData())
    if not self._pending_image:
        return
    if not self.input.text().strip():
        self.input.setText("build something like this")
def _save_clipboard_image(self, qimage) -> None:
    """Write *qimage* to a temp PNG and attach it as the reference photo.

    A None or null image is ignored. If saving fails, nothing is attached and
    the temp file created for it is removed rather than left behind.
    """
    import tempfile
    if qimage is None or qimage.isNull():
        return
    fd, path = tempfile.mkstemp(suffix=".png", prefix="woodshop-paste-")
    os.close(fd)  # only the path is needed; the image's save() opens it itself
    if qimage.save(path, "PNG"):
        self._set_image(path)
        return
    # Save failed: mkstemp already created the file, so clean it up.
    try:
        os.remove(path)
    except OSError:
        pass
def keyPressEvent(self, e):
    """Handle the Paste shortcut: if the clipboard holds an image, attach it;
    every other key press (or a paste with no image) goes to the base class."""
    pasted = QApplication.clipboard().image() if e.matches(QKeySequence.Paste) else None
    if pasted is None or pasted.isNull():
        super().keyPressEvent(e)
        return
    self._save_clipboard_image(pasted)
# ----- logging -----------------------------------------------------
def _log(self, who: str, text: str) -> None:
if not text:
@ -65,26 +145,34 @@ class CommandBar(QWidget):
def _busy(self, on: bool, msg: str = "") -> None:
    """Enable or disable the input line, mic and attach buttons together, and
    show *msg* in the status label. ``on=True`` disables the controls."""
    enabled = not on
    for control in (self.input, self.mic, self.attach):
        control.setEnabled(enabled)
    self.status.setText(msg)
# ----- send typed/spoken command -----------------------------------
def _send(self) -> None:
text = self.input.text().strip()
if not text:
if not text and not self._pending_image:
return
self.input.clear()
self._run(text)
self._run(text or "build something like this")
def submit(self, text: str) -> None:
    """Run a command programmatically (e.g. from a Build-menu template).

    Forwards *text* straight to ``_run`` without reading or clearing the
    input line.
    """
    self._run(text)
def _run(self, text: str) -> None:
self._log("you", text)
self._busy(True, "thinking…")
image = self._pending_image
url = None if image else driver.find_image_url(text)
note = " 📎 photo" if (image or url) else ""
self._log("you", text + note)
self._set_image(None)
self._busy(True, "looking…" if (image or url) else "thinking…")
def work():
return self.c.run_command(text)
path = image
if path is None and url:
path = driver.fetch_image(url) # download the linked image
return self.c.run_command(text, image_path=path)
def done(summary):
self._busy(False)

View File

@ -427,9 +427,9 @@ class Controller(QObject):
except (SceneError, ValueError, KeyError) as exc:
return str(exc).strip('"')
def run_command(self, text: str) -> str:
"""Interpret a spoken/typed command and apply it. Returns a spoken summary.
(Slow call from a worker thread.)"""
def run_command(self, text: str, image_path: str | None = None) -> str:
"""Interpret a spoken/typed command (optionally with a reference photo) and
apply it. Returns a spoken summary. (Slow call from a worker thread.)"""
from ..scene import spatial_summary
self.save() # ensure disk reflects current state
sel = ", ".join(self.selected) if self.selected else "none"
@ -437,7 +437,7 @@ class Controller(QObject):
+ f"\nCurrently selected ('these' / 'them' / 'the selected'): {sel}"
+ "\n" + spatial_summary(self.scene))
calls = driver.interpret(text, self.schemas(), scene_text=scene_text,
history=self._history)
history=self._history, image_path=image_path)
messages = driver.dispatch(calls, verbose=False, executor=self.execute_call)
self._commit()
spoken = driver.summarize(calls, messages)

38
tests/test_command_bar.py Normal file
View File

@ -0,0 +1,38 @@
"""Offscreen smoke tests for the command bar's image attachment."""
import os
import pytest
os.environ.setdefault("QT_QPA_PLATFORM", "offscreen")
pytest.importorskip("PySide6")
from PySide6.QtCore import QThreadPool # noqa: E402
from PySide6.QtWidgets import QApplication # noqa: E402
from woodshop.gui.command_bar import CommandBar # noqa: E402
from woodshop.gui.controller import Controller # noqa: E402
_app = QApplication.instance() or QApplication([])
def test_attach_sets_pending_and_chip(tmp_path):
    """Attaching a photo records its path and names it on the chip; clearing
    resets both."""
    bar = CommandBar(Controller(str(tmp_path / "s.json")),
                     QThreadPool.globalInstance())
    photo = tmp_path / "chair.png"
    photo.write_bytes(b"\x89PNG")

    bar._set_image(str(photo))
    assert bar._pending_image == str(photo)
    assert "chair.png" in bar.image_chip.text()

    bar._set_image(None)
    assert bar._pending_image is None
    assert bar.image_chip.text() == ""
def test_send_with_only_image_uses_default_text(tmp_path, monkeypatch):
    """Sending with a photo attached but no typed text falls back to the
    default request."""
    bar = CommandBar(Controller(str(tmp_path / "s.json")),
                     QThreadPool.globalInstance())
    sent = []
    monkeypatch.setattr(bar, "_run", sent.append)  # record instead of running

    bar._set_image(str(tmp_path / "x.png"))
    bar.input.clear()
    bar._send()

    assert sent[0] == "build something like this"

View File

@ -146,3 +146,44 @@ def test_woodshop_cmd_falls_back_to_module(monkeypatch):
monkeypatch.setattr(driver.shutil, "which", lambda name: None)
cmd = driver.woodshop_cmd()
assert cmd[1:] == ["-m", "woodshop"] and cmd[0] # python -m woodshop
def test_find_image_url():
    """find_image_url picks out an image link, ignores case in the extension,
    and returns None for a non-image link."""
    cases = [
        ("build like this https://x.com/chair.jpg please", "https://x.com/chair.jpg"),
        ("https://x.com/a.PNG", "https://x.com/a.PNG"),
        ("no image here http://x.com/page", None),
    ]
    for text, expected in cases:
        assert driver.find_image_url(text) == expected, text
def test_interpret_includes_image_directive(monkeypatch, tmp_path):
    """With image_path set, the prompt sent to the model carries the
    reference-photo directive and the image's path."""
    prompts = []

    def fake_run(cmd, stdin=""):
        prompts.append(stdin)
        return "[]"

    ref = tmp_path / "ref.jpg"
    ref.write_bytes(b"\xff\xd8\xff")  # not a real jpeg, just a path
    monkeypatch.setattr(driver, "_run", fake_run)

    driver.interpret("build something like this", schemas="[]",
                     scene_text="empty", image_path=str(ref))

    prompt = prompts[-1]
    assert "REFERENCE PHOTO" in prompt
    assert str(ref) in prompt
def test_fetch_image_writes_temp(monkeypatch):
    """fetch_image writes the response body to a temp file whose extension
    follows the Content-Type."""
    import os

    class FakeResp:
        # Minimal stand-in for the urlopen response used as a context manager.
        headers = {"Content-Type": "image/png"}

        def __enter__(self):
            return self

        def __exit__(self, *a):
            return False

        def read(self):
            return b"\x89PNG\r\n\x1a\n"

    monkeypatch.setattr(driver.urllib.request, "urlopen", lambda *a, **k: FakeResp())
    path = driver.fetch_image("https://x.com/chair.png")
    try:
        assert path.endswith(".png")
        with open(path, "rb") as f:
            assert f.read().startswith(b"\x89PNG")
    finally:
        # Remove the temp file even when an assertion above fails.
        os.remove(path)

View File

@ -145,7 +145,7 @@ def test_run_command_threads_history(tmp_path, monkeypatch):
c = _controller(tmp_path)
seen = {}
def fake_interpret(text, schemas, scene_text=None, history=None):
def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
seen["history"] = list(history or [])
return [{"tool": "say", "args": {"text": "want me to add tenons?"}}]
@ -156,3 +156,16 @@ def test_run_command_threads_history(tmp_path, monkeypatch):
c.run_command("yes")
assert seen["history"] == [("build a table", "want me to add tenons?")]
def test_run_command_forwards_image_path(tmp_path, monkeypatch):
    """run_command passes its image_path through to driver.interpret."""
    forwarded = []

    def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
        forwarded.append(image_path)
        return [{"tool": "say", "args": {"text": "ok"}}]

    c = _controller(tmp_path)
    monkeypatch.setattr(driver, "interpret", fake_interpret)

    c.run_command("build like this", image_path="/tmp/ref.jpg")

    assert forwarded[-1] == "/tmp/ref.jpg"