Add reference-photo input: "build something like this"

Attach a photo (📎 button, drag-drop, paste, or an image URL) and the driver hands it to claude -p, which reads the image (its Read tool sees images) and emits the usual tool-call JSON to build a simplified, buildable interpretation in dimensional lumber — no API keys, same claude -p pipe.

- driver: interpret(image_path=) prepends a reference-photo directive with the image's absolute path; find_image_url() + fetch_image() download a linked image to a temp file; woodshop-talk --image (path or URL) for CLI/voice.
- controller.run_command(image_path=) passthrough.
- command bar: 📎 attach (file picker), drag-drop image, Ctrl+V paste image, and image-URL-in-text detection; downloads run off the UI thread; an image chip shows/clears the attachment.
- tests: URL detection, image directive in prompt, fetch_image temp write, controller passthrough, command-bar attach + default-text smoke. 216 pass.

Honest limit: the live image round-trip needs a real display/model call to verify — it is wired and unit-tested, but please confirm it sees the photo on your machine.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 22:20:57 -03:00 · 2026-05-30 22:20:57 -03:00 · c623ad2576
parent b9b0871ac3
commit c623ad2576
7 changed files with 252 additions and 18 deletions
--- a/README.md
+++ b/README.md
@ -8,6 +8,11 @@ Talk to it like the Star Trek holodeck and watch furniture build itself:
 > *"Build a coffee table: a four foot by two foot frame from 2x4s, with four legs 18 inches tall standing at the corners."*
 You can also **attach a reference photo** (📎, drag-drop, paste, or an image URL)
 and say *"build something like this"* — WoodShop hands the image to the model and
 builds a simplified, buildable interpretation in dimensional lumber that you then
 refine by voice/text. (It's an interpretation, not a measured replica.)
 Each board is real dimensional lumber (a 2x4 is modeled at its true 1.5″ × 3.5″),
 so the result is buildable — export to **STEP** (CAD/CNC) or **STL** (3D print),
 and get a **cut list with board-feet and a shopping estimate**.
--- a/src/woodshop/driver.py
+++ b/src/woodshop/driver.py
@ -17,15 +17,53 @@ from __future__ import annotations
 import argparse
 import json
 import os
 import re
 import shutil
 import subprocess
 import sys
 import tempfile
 import urllib.request
 TOOL_FILTER = "wood-*"  # auto-discover every wood-* tool, no hardcoded list
 REASON_PROVIDER = "claude -p"  # chosen for reliable structured tool-calling
 _MAX_HISTORY = 6  # turns of recent conversation fed back for reference-resolution
 # A reference photo can be attached to "build something like this". claude -p
 # reads the image file (via its Read tool), so we just hand it an absolute path.
 #
 # _IMG_URL matches an http(s) URL lazily up to and including the first image
 # extension (case-insensitive). Anything after the extension, such as a
 # ?query string, is NOT part of the match.
 _IMG_URL = re.compile(r'https?://\S+?\.(?:png|jpe?g|webp|gif|bmp)\b', re.I)
 # Prompt preamble placed in front of the normal prompt when a photo is
 # attached; the {path} placeholder is filled in with str.format().
 _IMAGE_DIRECTIVE = (
    "A REFERENCE PHOTO of furniture is saved at this path:\n  {path}\n"
    "Open and look at that image file. The user wants to build something LIKE it "
    "from dimensional lumber and plywood. Infer the major parts, rough proportions, "
    "and joinery, and emit the tool calls to build a SIMPLIFIED, buildable version "
    "with reasonable real dimensions in inches. This is an interpretation, not an "
    "exact replica — prefer standard stock sizes and right angles.\n\n"
 )
 def find_image_url(text: str) -> str | None:
    """Return the first image URL found in *text*, or None when there is none.

    A falsy *text* (``None`` or ``""``) is searched as an empty string.
    """
    hit = _IMG_URL.search(text if text else "")
    if hit is None:
        return None
    return hit.group(0)
 def fetch_image(url: str, timeout: int = 20) -> str:
    """Download an image URL to a temp file and return its path.

    Only ``http://`` and ``https://`` URLs are fetched; other schemes that
    ``urlopen`` would otherwise honour (``file:``, ``ftp:`` ...) are rejected
    before any I/O happens.

    The file extension is taken from the response ``Content-Type`` when it is a
    known image type, else from the URL, else ``.img``.

    Args:
        url: The http(s) address of the image.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        Path of the newly created temp file. The caller owns it and is
        responsible for deleting it.

    Raises:
        ValueError: The URL is not http(s), or the response body is empty.
        Exception: Any error raised by ``urllib`` or the file write propagates
            unchanged (caller decides how to surface it).
    """
    if not url.lower().startswith(("http://", "https://")):
        raise ValueError(f"unsupported image URL (need http/https): {url!r}")
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        # Drop any "; charset=..." parameter and normalise case.
        ctype = (resp.headers.get("Content-Type") or "").split(";")[0].strip().lower()
        data = resp.read()
    if not data:
        raise ValueError(f"image URL returned no data: {url!r}")
    ext = {"image/png": ".png", "image/jpeg": ".jpg", "image/webp": ".webp",
           "image/gif": ".gif", "image/bmp": ".bmp"}.get(ctype)
    if ext is None:
        # Unknown/missing Content-Type: fall back to the extension in the URL.
        m = re.search(r'\.(png|jpe?g|webp|gif|bmp)\b', url, re.I)
        ext = "." + m.group(1).lower() if m else ".img"
    fd, path = tempfile.mkstemp(suffix=ext, prefix="woodshop-ref-")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
    except BaseException:
        # Don't leave a half-written temp file behind on a failed write.
        try:
            os.remove(path)
        except OSError:
            pass
        raise
    return path
 # A board placed earlier in the SAME utterance is referenced as $1, $2, ...
 _SYMBOL = re.compile(r"\$(\d+)")
@ -143,10 +181,13 @@ def _render_history(history: list[tuple[str, str]] | None) -> str:
 def interpret(utterance: str, schemas: str, scene_text: str | None = None,
-              history: list[tuple[str, str]] | None = None) -> list[dict]:
+              history: list[tuple[str, str]] | None = None,
              image_path: str | None = None) -> list[dict]:
    scene = scene_text if scene_text is not None else scene_summary()
    prompt = SYSTEM.format(schemas=schemas, scene=scene, utterance=utterance,
                           history=_render_history(history))
    if image_path:
        prompt = _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path)) + prompt
    raw = _run(REASON_PROVIDER.split(), stdin=prompt)
    calls = _extract_calls(raw)
    if calls is None:
@ -242,8 +283,9 @@ def summarize(calls: list[dict], messages: list[str]) -> str:
 def handle(utterance: str, schemas: str, voice: bool, verbose: bool,
-           history: list[tuple[str, str]] | None = None) -> None:
+           history: list[tuple[str, str]] | None = None,
-    calls = interpret(utterance, schemas, history=history)
+           image_path: str | None = None) -> None:
    calls = interpret(utterance, schemas, history=history, image_path=image_path)
    messages = dispatch(calls, verbose=verbose)
    full = " ".join(m for m in messages if m).strip()
    spoken = summarize(calls, messages)
@ -271,9 +313,14 @@ def main(argv: list[str] | None = None) -> int:
    ap.add_argument("--voice", action="store_true", help="Listen on the mic instead of typing")
    ap.add_argument("--duration", type=int, default=6, help="Mic recording seconds (--voice)")
    ap.add_argument("--once", help="Run a single command (non-interactive) and exit")
    ap.add_argument("--image", help="Reference photo (path or URL) for 'build something like this'")
    ap.add_argument("--quiet", action="store_true", help="Don't print per-call detail")
    args = ap.parse_args(argv)
    image_path = None
    if args.image:
        image_path = fetch_image(args.image) if args.image.startswith("http") else args.image
    schemas = load_schemas()
    if not schemas:
        print("Could not load wood-* tool schemas (is CmdForge/pa-load-tools available?)",
@ -281,7 +328,8 @@ def main(argv: list[str] | None = None) -> int:
        return 1
    if args.once is not None:
-        handle(args.once, schemas, voice=args.voice, verbose=not args.quiet)
+        handle(args.once, schemas, voice=args.voice, verbose=not args.quiet,
               image_path=image_path)
        return 0
    print("WoodShop ready. Say things like 'place a 6 foot 2x4'. Ctrl-C to quit.")
@ -295,7 +343,8 @@ def main(argv: list[str] | None = None) -> int:
            return 0
        try:
            handle(utterance, schemas, voice=args.voice, verbose=not args.quiet,
-                   history=history)
+                   history=history, image_path=image_path)
            image_path = None        # the reference photo applies to the first turn only
        except Exception as exc:  # never let one bad command kill the session
            print(f"WoodShop: sorry, that command failed ({exc}).")
--- a/src/woodshop/gui/command_bar.py
+++ b/src/woodshop/gui/command_bar.py
@ -1,17 +1,23 @@
 """Command bar: type a command or push-to-talk, see the transcript, optionally
-hear the reply. Slow work (LLM, dictate, TTS) runs off the UI thread."""
+hear the reply. You can also attach a reference photo ("build something like
 this") by the 📎 button, drag-drop, paste, or a pasted image URL. Slow work
 (LLM, dictate, TTS, image download) runs off the UI thread."""
 from __future__ import annotations
 import os
 import subprocess
 from PySide6.QtCore import Qt, QThreadPool
-from PySide6.QtWidgets import (QCheckBox, QHBoxLayout, QLabel, QLineEdit,
+from PySide6.QtGui import QKeySequence
-                               QPushButton, QTextEdit, QVBoxLayout, QWidget)
+from PySide6.QtWidgets import (QApplication, QCheckBox, QFileDialog, QHBoxLayout, QLabel,
                               QLineEdit, QPushButton, QTextEdit, QVBoxLayout, QWidget)
 from .. import driver
 from .controller import Controller
 from .workers import run_async
 _WHO_COLOR = {"you": "#9cdcfe", "ws": "#c8965a", "sys": "#e06c75"}
 _IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp")
 class CommandBar(QWidget):
@ -19,6 +25,8 @@ class CommandBar(QWidget):
        super().__init__(parent)
        self.c = controller
        self.pool = pool
        self._pending_image: str | None = None     # attached reference photo path
        self.setAcceptDrops(True)                   # drop an image onto the bar
        root = QVBoxLayout(self)
        self.transcript = QTextEdit(readOnly=True)
@ -32,6 +40,12 @@ class CommandBar(QWidget):
        self.mic.clicked.connect(self._listen)
        row.addWidget(self.mic)
        self.attach = QPushButton("📎")
        self.attach.setToolTip("Attach a reference photo — then say 'build something like this'")
        self.attach.setFixedWidth(40)
        self.attach.clicked.connect(self._attach_image)
        row.addWidget(self.attach)
        self.input = QLineEdit()
        self.input.setPlaceholderText("Type a command, e.g. 'build a coffee table'  —  Enter to send")
        self.input.returnPressed.connect(self._send)
@ -45,6 +59,9 @@ class CommandBar(QWidget):
        bottom = QHBoxLayout()
        self.speak = QCheckBox("Speak replies")
        bottom.addWidget(self.speak)
        self.image_chip = QLabel("")               # shows the attached photo name
        self.image_chip.setStyleSheet("color:#c8965a")
        bottom.addWidget(self.image_chip)
        bottom.addStretch()
        self.status = QLabel("")
        bottom.addWidget(self.status)
@ -52,6 +69,69 @@ class CommandBar(QWidget):
        self.c.logged.connect(self._log)
    # ----- reference image ---------------------------------------------
    def _set_image(self, path: str | None) -> None:
        """Record *path* as the pending reference photo and refresh the chip.

        A falsy *path* clears the chip text and tooltip; otherwise the chip
        shows the file's base name.
        """
        self._pending_image = path
        if not path:
            self.image_chip.setText("")
            self.image_chip.setToolTip("")
            return
        label = os.path.basename(path)
        self.image_chip.setText(f"📎 {label}  ✕")
        self.image_chip.setToolTip("Click to remove the attached photo")
    def mousePressEvent(self, e):
        """Clear the attachment when the click lands on the image chip, then
        hand the event to the base class."""
        has_photo = bool(self._pending_image)
        if has_photo and self.image_chip.geometry().contains(e.pos()):
            self._set_image(None)
        super().mousePressEvent(e)
    def _attach_image(self) -> None:
        """Let the user pick an image file and attach it.

        When a file is chosen and the input box is blank, the default
        command text is filled in.
        """
        chosen, _filter = QFileDialog.getOpenFileName(
            self, "Attach reference photo", "",
            "Images (*.png *.jpg *.jpeg *.webp *.gif *.bmp)")
        if not chosen:
            return                      # dialog cancelled
        self._set_image(chosen)
        if not self.input.text().strip():
            self.input.setText("build something like this")
    def dragEnterEvent(self, e):
        """Accept a drag that carries image data or at least one local file
        whose name ends in a known image extension."""
        md = e.mimeData()
        if md.hasImage():
            e.acceptProposedAction()
            return
        for u in md.urls():
            if u.toLocalFile().lower().endswith(_IMAGE_EXTS):
                e.acceptProposedAction()
                return
    def dropEvent(self, e):
        """Attach the first dropped image file, or else any raw image data.

        If something ends up attached and the input box is blank, the default
        command text is filled in.
        """
        md = e.mimeData()
        local_paths = (u.toLocalFile() for u in md.urls())
        first_image = next(
            (p for p in local_paths if p.lower().endswith(_IMAGE_EXTS)), None)
        if first_image is not None:
            self._set_image(first_image)
        elif md.hasImage():
            # no image file among the URLs: fall back to the raw image payload
            self._save_clipboard_image(md.imageData())
        if self._pending_image and not self.input.text().strip():
            self.input.setText("build something like this")
    def _save_clipboard_image(self, qimage) -> None:
        """Write a pasted/dropped image to a temp PNG and attach it.

        Does nothing when *qimage* is ``None`` or null. If the PNG cannot be
        saved, the temp file is removed instead of being left behind, and no
        attachment is set.
        """
        import tempfile
        if qimage is None or qimage.isNull():
            return
        # mkstemp is used only to reserve a unique path; the image is then
        # saved to that path by name, so the descriptor is closed right away.
        fd, path = tempfile.mkstemp(suffix=".png", prefix="woodshop-paste-")
        os.close(fd)
        if qimage.save(path, "PNG"):
            self._set_image(path)
            return
        try:
            os.remove(path)             # failed save: don't leak the temp file
        except OSError:
            pass
    def keyPressEvent(self, e):
        """On the Paste shortcut, attach an image from the clipboard if there
        is one; every other key press goes to the base class."""
        # NOTE(review): this only sees key events delivered to the bar itself.
        # If the child line edit handles Ctrl+V first, image paste may not fire
        # while the text box has focus — confirm on a real display.
        if not e.matches(QKeySequence.Paste):
            super().keyPressEvent(e)
            return
        img = QApplication.clipboard().image()
        if img.isNull():
            super().keyPressEvent(e)
            return
        self._save_clipboard_image(img)
    # ----- logging -----------------------------------------------------
    def _log(self, who: str, text: str) -> None:
        if not text:
@ -65,26 +145,34 @@ class CommandBar(QWidget):
    def _busy(self, on: bool, msg: str = "") -> None:
        """Disable (on=True) or re-enable the input, mic and attach controls,
        and show *msg* in the status label."""
        enabled = not on
        for widget in (self.input, self.mic, self.attach):
            widget.setEnabled(enabled)
        self.status.setText(msg)
    # ----- send typed/spoken command -----------------------------------
    def _send(self) -> None:
        text = self.input.text().strip()
-        if not text:
+        if not text and not self._pending_image:
            return
        self.input.clear()
-        self._run(text)
+        self._run(text or "build something like this")
    def submit(self, text: str) -> None:
        """Run a command programmatically (e.g. from a Build-menu template).

        *text* is passed straight to ``_run``; unlike ``_send`` it is not read
        from the input box.
        """
        self._run(text)
    def _run(self, text: str) -> None:
-        self._log("you", text)
+        image = self._pending_image
-        self._busy(True, "thinking…")
+        url = None if image else driver.find_image_url(text)
        note = "  📎 photo" if (image or url) else ""
        self._log("you", text + note)
        self._set_image(None)
        self._busy(True, "looking…" if (image or url) else "thinking…")
        def work():
-            return self.c.run_command(text)
+            path = image
            if path is None and url:
                path = driver.fetch_image(url)         # download the linked image
            return self.c.run_command(text, image_path=path)
        def done(summary):
            self._busy(False)
--- a/src/woodshop/gui/controller.py
+++ b/src/woodshop/gui/controller.py
@ -427,9 +427,9 @@ class Controller(QObject):
        except (SceneError, ValueError, KeyError) as exc:
            return str(exc).strip('"')
-    def run_command(self, text: str) -> str:
+    def run_command(self, text: str, image_path: str | None = None) -> str:
-        """Interpret a spoken/typed command and apply it. Returns a spoken summary.
+        """Interpret a spoken/typed command (optionally with a reference photo) and
-        (Slow — call from a worker thread.)"""
+        apply it. Returns a spoken summary. (Slow — call from a worker thread.)"""
        from ..scene import spatial_summary
        self.save()  # ensure disk reflects current state
        sel = ", ".join(self.selected) if self.selected else "none"
@ -437,7 +437,7 @@ class Controller(QObject):
                      + f"\nCurrently selected ('these' / 'them' / 'the selected'): {sel}"
                      + "\n" + spatial_summary(self.scene))
        calls = driver.interpret(text, self.schemas(), scene_text=scene_text,
-                                 history=self._history)
+                                 history=self._history, image_path=image_path)
        messages = driver.dispatch(calls, verbose=False, executor=self.execute_call)
        self._commit()
        spoken = driver.summarize(calls, messages)
--- a/tests/test_command_bar.py
+++ b/tests/test_command_bar.py
@ -0,0 +1,38 @@
 """Offscreen smoke tests for the command bar's image attachment."""
 import os
 import pytest
 os.environ.setdefault("QT_QPA_PLATFORM", "offscreen")
 pytest.importorskip("PySide6")
 from PySide6.QtCore import QThreadPool  # noqa: E402
 from PySide6.QtWidgets import QApplication  # noqa: E402
 from woodshop.gui.command_bar import CommandBar  # noqa: E402
 from woodshop.gui.controller import Controller  # noqa: E402
 _app = QApplication.instance() or QApplication([])
 def test_attach_sets_pending_and_chip(tmp_path):
    """_set_image records the path and names it on the chip; None clears both."""
    photo = tmp_path / "chair.png"
    photo.write_bytes(b"\x89PNG")
    bar = CommandBar(Controller(str(tmp_path / "s.json")),
                     QThreadPool.globalInstance())

    bar._set_image(str(photo))
    assert bar._pending_image == str(photo)
    assert "chair.png" in bar.image_chip.text()

    bar._set_image(None)
    assert bar._pending_image is None
    assert bar.image_chip.text() == ""
 def test_send_with_only_image_uses_default_text(tmp_path, monkeypatch):
    """Sending with a photo attached and no text falls back to the default command."""
    bar = CommandBar(Controller(str(tmp_path / "s.json")),
                     QThreadPool.globalInstance())
    sent = []
    monkeypatch.setattr(bar, "_run", lambda text: sent.append(text))
    bar._set_image(str(tmp_path / "x.png"))
    bar.input.clear()
    bar._send()
    assert sent[0] == "build something like this"
--- a/tests/test_driver.py
+++ b/tests/test_driver.py
@ -146,3 +146,44 @@ def test_woodshop_cmd_falls_back_to_module(monkeypatch):
    monkeypatch.setattr(driver.shutil, "which", lambda name: None)
    cmd = driver.woodshop_cmd()
    assert cmd[1:] == ["-m", "woodshop"] and cmd[0]   # python -m woodshop
 def test_find_image_url():
    """find_image_url returns an embedded image URL and None for non-image links."""
    found = [
        ("build like this https://x.com/chair.jpg please", "https://x.com/chair.jpg"),
        ("https://x.com/a.PNG", "https://x.com/a.PNG"),
    ]
    for text, expected in found:
        assert driver.find_image_url(text) == expected
    assert driver.find_image_url("no image here http://x.com/page") is None
 def test_interpret_includes_image_directive(monkeypatch, tmp_path):
    """With image_path set, the prompt sent to the model carries the photo
    directive and the image's path."""
    ref = tmp_path / "ref.jpg"
    ref.write_bytes(b"\xff\xd8\xff")               # placeholder bytes; only the path is used
    prompts = []
    def fake_run(cmd, stdin=""):
        prompts.append(stdin)
        return "[]"
    monkeypatch.setattr(driver, "_run", fake_run)
    driver.interpret("build something like this", schemas="[]", scene_text="empty",
                     image_path=str(ref))
    sent = prompts[-1]
    assert "REFERENCE PHOTO" in sent
    assert str(ref) in sent
 def test_fetch_image_writes_temp(monkeypatch):
    """fetch_image writes the response body to a temp file whose extension
    follows the Content-Type."""
    import os as _os
    class FakeResp:
        # Minimal stand-in for the urlopen() response used as a context manager.
        headers = {"Content-Type": "image/png"}
        def __enter__(self): return self
        def __exit__(self, *a): return False
        def read(self): return b"\x89PNG\r\n\x1a\n"
    monkeypatch.setattr(driver.urllib.request, "urlopen", lambda *a, **k: FakeResp())
    path = driver.fetch_image("https://x.com/chair.png")
    try:
        assert path.endswith(".png")
        with open(path, "rb") as f:
            assert f.read().startswith(b"\x89PNG")
    finally:
        _os.remove(path)           # clean up even when an assertion above fails
--- a/tests/test_gui_controller.py
+++ b/tests/test_gui_controller.py
@ -145,7 +145,7 @@ def test_run_command_threads_history(tmp_path, monkeypatch):
    c = _controller(tmp_path)
    seen = {}
-    def fake_interpret(text, schemas, scene_text=None, history=None):
+    def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
        seen["history"] = list(history or [])
        return [{"tool": "say", "args": {"text": "want me to add tenons?"}}]
@ -156,3 +156,16 @@ def test_run_command_threads_history(tmp_path, monkeypatch):
    c.run_command("yes")
    assert seen["history"] == [("build a table", "want me to add tenons?")]
 def test_run_command_forwards_image_path(tmp_path, monkeypatch):
    """run_command hands its image_path argument on to driver.interpret."""
    c = _controller(tmp_path)
    received = []
    def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
        received.append(image_path)
        return [{"tool": "say", "args": {"text": "ok"}}]
    monkeypatch.setattr(driver, "interpret", fake_interpret)
    c.run_command("build like this", image_path="/tmp/ref.jpg")
    assert received[-1] == "/tmp/ref.jpg"