Add reference-photo input: "build something like this"

Attach a photo (📎 button, drag-drop, paste, or an image URL) and the driver hands it to `claude -p`, which reads the image (its Read tool sees images) and emits the usual tool-call JSON to build a simplified, buildable interpretation in dimensional lumber — no API keys, same `claude -p` pipe.

- driver: interpret(image_path=) prepends a reference-photo directive with the image's absolute path; find_image_url() + fetch_image() download a linked image to a temp file; woodshop-talk --image (path or URL) for CLI/voice.
- controller: run_command(image_path=) passthrough.
- command bar: 📎 attach (file picker), drag-drop image, Ctrl+V paste image, and image-URL-in-text detection; downloads run off the UI thread; an image chip shows/clears the attachment.
- tests: URL detection, image directive in prompt, fetch_image temp write, controller passthrough, command-bar attach + default-text smoke. 216 pass.

Honest limit: the live image round-trip needs a real display and a real model call to verify. It is wired and unit-tested, but please confirm it sees the photo on your machine.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 22:20:57 -03:00 · 2026-05-30 22:20:57 -03:00 · c623ad2576
parent b9b0871ac3
commit c623ad2576
7 changed files with 252 additions and 18 deletions
--- a/README.md
+++ b/README.md
@ -8,6 +8,11 @@ Talk to it like the Star Trek holodeck and watch furniture build itself:

 > *"Build a coffee table: a four foot by two foot frame from 2x4s, with four legs 18 inches tall standing at the corners."*

+You can also **attach a reference photo** (📎, drag-drop, paste, or an image URL)
+and say *"build something like this"* — WoodShop hands the image to the model and
+builds a simplified, buildable interpretation in dimensional lumber that you then
+refine by voice/text. (It's an interpretation, not a measured replica.)
+
 Each board is real dimensional lumber (a 2x4 is modeled at its true 1.5″ × 3.5″),
 so the result is buildable — export to **STEP** (CAD/CNC) or **STL** (3D print),
 and get a **cut list with board-feet and a shopping estimate**.
--- a/src/woodshop/driver.py
+++ b/src/woodshop/driver.py
@ -17,15 +17,53 @@ from __future__ import annotations

 import argparse
 import json
+import os
 import re
 import shutil
 import subprocess
 import sys
+import tempfile
+import urllib.request

 TOOL_FILTER = "wood-*"  # auto-discover every wood-* tool, no hardcoded list
 REASON_PROVIDER = "claude -p"  # chosen for reliable structured tool-calling
 _MAX_HISTORY = 6  # turns of recent conversation fed back for reference-resolution

+# A reference photo can be attached to "build something like this". claude -p
+# reads the image file (via its Read tool), so we just hand it an absolute path.
+#
+# _IMG_URL is the loose pattern: it stops at the FIRST image extension in a link.
+_IMG_URL = re.compile(r'https?://\S+?\.(?:png|jpe?g|webp|gif|bmp)\b', re.I)
+# _IMG_URL_FULL is the preferred pattern: the image extension must END the link
+# (optionally followed by a ?query / #fragment), ignoring sentence punctuation
+# after it. This keeps signed CDN query strings and thumbnail paths such as
+# ".../Chair.jpg/320px-Chair.jpg" whole; cutting them at the first extension
+# would make fetch_image() request a different (often non-existent) URL.
+_IMG_URL_FULL = re.compile(
+    r'https?://\S+?\.(?:png|jpe?g|webp|gif|bmp)'   # shortest prefix ending in an image ext
+    r'(?:[?#]\S*?)??'                              # optional query/fragment, tried last
+    r'(?=[.,;:!?)\]>\'"]*(?:\s|$))',               # only punctuation may follow in this token
+    re.I)
+_IMAGE_DIRECTIVE = (
+    "A REFERENCE PHOTO of furniture is saved at this path:\n  {path}\n"
+    "Open and look at that image file. The user wants to build something LIKE it "
+    "from dimensional lumber and plywood. Infer the major parts, rough proportions, "
+    "and joinery, and emit the tool calls to build a SIMPLIFIED, buildable version "
+    "with reasonable real dimensions in inches. This is an interpretation, not an "
+    "exact replica — prefer standard stock sizes and right angles.\n\n"
+)
+
+
+def find_image_url(text: str) -> str | None:
+    """Return an http(s) image link found in *text*, or None if there is none.
+
+    A link whose whole whitespace-delimited token is an image URL (query string
+    included, trailing punctuation excluded) is preferred. Otherwise this falls
+    back to the loose _IMG_URL match, so every text that matched before still
+    yields a link. None or empty *text* gives None.
+    """
+    text = text or ""
+    m = _IMG_URL_FULL.search(text) or _IMG_URL.search(text)
+    return m.group(0) if m else None
+
+
+def fetch_image(url: str, timeout: int = 20) -> str:
+    """Download an image URL to a temp file and return its path. Raises on
+    failure (caller decides how to surface it).
+
+    The file suffix comes from the response Content-Type, else from the image
+    extension in *url*, else ".img". The caller owns the returned file; nothing
+    here deletes it after a successful download.
+
+    Raises ValueError when the server answers with a text/* body (typically an
+    HTML page whose address merely ends in ".jpg") or with an empty body, and
+    whatever urllib / the OS raises for network or write errors.
+    """
+    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        ctype = (resp.headers.get("Content-Type") or "").split(";")[0].strip().lower()
+        data = resp.read()
+    # A text/* response is a web page or an error message, not a picture; saving
+    # it under an image suffix would hand the model an unreadable "photo".
+    if ctype.startswith("text/"):
+        raise ValueError(f"{url} returned {ctype}, not an image")
+    if not data:
+        raise ValueError(f"{url} returned an empty response")
+    ext = {"image/png": ".png", "image/jpeg": ".jpg", "image/webp": ".webp",
+           "image/gif": ".gif", "image/bmp": ".bmp"}.get(ctype)
+    if ext is None:
+        # Unknown or missing Content-Type: fall back to the extension in the URL.
+        m = re.search(r'\.(png|jpe?g|webp|gif|bmp)\b', url, re.I)
+        ext = "." + m.group(1).lower() if m else ".img"
+    fd, path = tempfile.mkstemp(suffix=ext, prefix="woodshop-ref-")
+    try:
+        with os.fdopen(fd, "wb") as f:
+            f.write(data)
+    except OSError:
+        # Don't leave a truncated file behind if the write fails (e.g. disk full).
+        try:
+            os.remove(path)
+        except OSError:
+            pass
+        raise
+    return path
+
 # A board placed earlier in the SAME utterance is referenced as $1, $2, ...
 _SYMBOL = re.compile(r"\$(\d+)")

@ -143,10 +181,13 @@ def _render_history(history: list[tuple[str, str]] | None) -> str:


 def interpret(utterance: str, schemas: str, scene_text: str | None = None,
-              history: list[tuple[str, str]] | None = None) -> list[dict]:
+              history: list[tuple[str, str]] | None = None,
+              image_path: str | None = None) -> list[dict]:
    scene = scene_text if scene_text is not None else scene_summary()
    prompt = SYSTEM.format(schemas=schemas, scene=scene, utterance=utterance,
                           history=_render_history(history))
+    if image_path:
+        prompt = _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path)) + prompt
    raw = _run(REASON_PROVIDER.split(), stdin=prompt)
    calls = _extract_calls(raw)
    if calls is None:
@ -242,8 +283,9 @@ def summarize(calls: list[dict], messages: list[str]) -> str:


 def handle(utterance: str, schemas: str, voice: bool, verbose: bool,
-           history: list[tuple[str, str]] | None = None) -> None:
-    calls = interpret(utterance, schemas, history=history)
+           history: list[tuple[str, str]] | None = None,
+           image_path: str | None = None) -> None:
+    calls = interpret(utterance, schemas, history=history, image_path=image_path)
    messages = dispatch(calls, verbose=verbose)
    full = " ".join(m for m in messages if m).strip()
    spoken = summarize(calls, messages)
@ -271,9 +313,14 @@ def main(argv: list[str] | None = None) -> int:
    ap.add_argument("--voice", action="store_true", help="Listen on the mic instead of typing")
    ap.add_argument("--duration", type=int, default=6, help="Mic recording seconds (--voice)")
    ap.add_argument("--once", help="Run a single command (non-interactive) and exit")
+    ap.add_argument("--image", help="Reference photo (path or URL) for 'build something like this'")
    ap.add_argument("--quiet", action="store_true", help="Don't print per-call detail")
    args = ap.parse_args(argv)

+    image_path = None
+    if args.image:
+        image_path = fetch_image(args.image) if args.image.startswith("http") else args.image
+
    schemas = load_schemas()
    if not schemas:
        print("Could not load wood-* tool schemas (is CmdForge/pa-load-tools available?)",
@ -281,7 +328,8 @@ def main(argv: list[str] | None = None) -> int:
        return 1

    if args.once is not None:
-        handle(args.once, schemas, voice=args.voice, verbose=not args.quiet)
+        handle(args.once, schemas, voice=args.voice, verbose=not args.quiet,
+               image_path=image_path)
        return 0

    print("WoodShop ready. Say things like 'place a 6 foot 2x4'. Ctrl-C to quit.")
@ -295,7 +343,8 @@ def main(argv: list[str] | None = None) -> int:
            return 0
        try:
            handle(utterance, schemas, voice=args.voice, verbose=not args.quiet,
-                   history=history)
+                   history=history, image_path=image_path)
+            image_path = None        # the reference photo applies to the first turn only
        except Exception as exc:  # never let one bad command kill the session
            print(f"WoodShop: sorry, that command failed ({exc}).")

--- a/src/woodshop/gui/command_bar.py
+++ b/src/woodshop/gui/command_bar.py
@ -1,17 +1,23 @@
 """Command bar: type a command or push-to-talk, see the transcript, optionally
-hear the reply. Slow work (LLM, dictate, TTS) runs off the UI thread."""
+hear the reply. You can also attach a reference photo ("build something like
+this") by the 📎 button, drag-drop, paste, or a pasted image URL. Slow work
+(LLM, dictate, TTS, image download) runs off the UI thread."""
 from __future__ import annotations

+import os
 import subprocess

 from PySide6.QtCore import Qt, QThreadPool
-from PySide6.QtWidgets import (QCheckBox, QHBoxLayout, QLabel, QLineEdit,
-                               QPushButton, QTextEdit, QVBoxLayout, QWidget)
+from PySide6.QtGui import QKeySequence
+from PySide6.QtWidgets import (QApplication, QCheckBox, QFileDialog, QHBoxLayout, QLabel,
+                               QLineEdit, QPushButton, QTextEdit, QVBoxLayout, QWidget)

+from .. import driver
 from .controller import Controller
 from .workers import run_async

 _WHO_COLOR = {"you": "#9cdcfe", "ws": "#c8965a", "sys": "#e06c75"}
+_IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp")


 class CommandBar(QWidget):
@ -19,6 +25,8 @@ class CommandBar(QWidget):
        super().__init__(parent)
        self.c = controller
        self.pool = pool
+        self._pending_image: str | None = None     # attached reference photo path
+        self.setAcceptDrops(True)                   # drop an image onto the bar

        root = QVBoxLayout(self)
        self.transcript = QTextEdit(readOnly=True)
@ -32,6 +40,12 @@ class CommandBar(QWidget):
        self.mic.clicked.connect(self._listen)
        row.addWidget(self.mic)

+        self.attach = QPushButton("📎")
+        self.attach.setToolTip("Attach a reference photo — then say 'build something like this'")
+        self.attach.setFixedWidth(40)
+        self.attach.clicked.connect(self._attach_image)
+        row.addWidget(self.attach)
+
        self.input = QLineEdit()
        self.input.setPlaceholderText("Type a command, e.g. 'build a coffee table'  —  Enter to send")
        self.input.returnPressed.connect(self._send)
@ -45,6 +59,9 @@ class CommandBar(QWidget):
        bottom = QHBoxLayout()
        self.speak = QCheckBox("Speak replies")
        bottom.addWidget(self.speak)
+        self.image_chip = QLabel("")               # shows the attached photo name
+        self.image_chip.setStyleSheet("color:#c8965a")
+        bottom.addWidget(self.image_chip)
        bottom.addStretch()
        self.status = QLabel("")
        bottom.addWidget(self.status)
@ -52,6 +69,69 @@ class CommandBar(QWidget):

        self.c.logged.connect(self._log)

+    # ----- reference image ---------------------------------------------
+    def _set_image(self, path: str | None) -> None:
+        """Remember *path* as the pending reference photo and refresh the chip.
+
+        A falsy *path* blanks the chip's text and tooltip; otherwise the chip
+        shows the file's base name.
+        """
+        self._pending_image = path
+        if not path:
+            self.image_chip.setText("")
+            self.image_chip.setToolTip("")
+            return
+        self.image_chip.setText(f"📎 {os.path.basename(path)}  ✕")
+        self.image_chip.setToolTip("Click to remove the attached photo")
+
+    def mousePressEvent(self, e):
+        """Drop the attachment when a click lands on the chip, then defer to the base class."""
+        if self._pending_image:
+            # the chip's geometry is only consulted while a photo is attached
+            if self.image_chip.geometry().contains(e.pos()):
+                self._set_image(None)
+        super().mousePressEvent(e)
+
+    def _attach_image(self) -> None:
+        """Ask for an image file and attach it; pre-fill the prompt if the input is blank."""
+        chosen, _selected_filter = QFileDialog.getOpenFileName(
+            self, "Attach reference photo", "",
+            "Images (*.png *.jpg *.jpeg *.webp *.gif *.bmp)")
+        if not chosen:
+            return                      # dialog returned no path
+        self._set_image(chosen)
+        if self.input.text().strip():
+            return                      # keep whatever is already typed
+        self.input.setText("build something like this")
+
+    def dragEnterEvent(self, e):
+        """Accept a drag that carries image data or a local file with an image extension."""
+        mime = e.mimeData()
+        if mime.hasImage():
+            e.acceptProposedAction()
+            return
+        for url in mime.urls():
+            if url.toLocalFile().lower().endswith(_IMAGE_EXTS):
+                e.acceptProposedAction()
+                return
+
+    def dropEvent(self, e):
+        """Attach the dropped image: the first local image file, else raw image data.
+
+        When something ends up attached and the input is blank, the default
+        prompt is filled in.
+        """
+        mime = e.mimeData()
+        local_paths = (url.toLocalFile() for url in mime.urls())
+        first_image = next(
+            (p for p in local_paths if p.lower().endswith(_IMAGE_EXTS)), None)
+        if first_image is not None:
+            self._set_image(first_image)
+        elif mime.hasImage():
+            # no usable file URL, but the drag carries pixel data
+            self._save_clipboard_image(mime.imageData())
+        if not self._pending_image:
+            return
+        if not self.input.text().strip():
+            self.input.setText("build something like this")
+
+    def _save_clipboard_image(self, qimage) -> None:
+        """Write *qimage* to a temporary PNG and attach it as the reference photo.
+
+        Does nothing for a None or null image. If the PNG cannot be written,
+        the placeholder temp file is removed and the current attachment is
+        left unchanged.
+        """
+        import tempfile
+        if qimage is None or qimage.isNull():
+            return
+        fd, path = tempfile.mkstemp(suffix=".png", prefix="woodshop-paste-")
+        os.close(fd)                    # the image is saved by path, not through this fd
+        if qimage.save(path, "PNG"):
+            self._set_image(path)
+            return
+        # Save failed: mkstemp already created an empty file; don't leave it behind.
+        try:
+            os.remove(path)
+        except OSError:
+            pass
+
+    def keyPressEvent(self, e):
+        """On a Paste shortcut, attach a clipboard image if there is one; else defer to the base class."""
+        pasted = QApplication.clipboard().image() if e.matches(QKeySequence.Paste) else None
+        if pasted is not None and not pasted.isNull():
+            self._save_clipboard_image(pasted)
+            return                      # handled: the key event is not forwarded
+        super().keyPressEvent(e)
+
    # ----- logging -----------------------------------------------------
    def _log(self, who: str, text: str) -> None:
        if not text:
@ -65,26 +145,34 @@ class CommandBar(QWidget):
    def _busy(self, on: bool, msg: str = "") -> None:
        self.input.setEnabled(not on)
        self.mic.setEnabled(not on)
+        self.attach.setEnabled(not on)
        self.status.setText(msg)

    # ----- send typed/spoken command -----------------------------------
    def _send(self) -> None:
        text = self.input.text().strip()
-        if not text:
+        if not text and not self._pending_image:
            return
        self.input.clear()
-        self._run(text)
+        self._run(text or "build something like this")

    def submit(self, text: str) -> None:
        """Run a command programmatically (e.g. from a Build-menu template)."""
        self._run(text)

    def _run(self, text: str) -> None:
-        self._log("you", text)
-        self._busy(True, "thinking…")
+        image = self._pending_image
+        url = None if image else driver.find_image_url(text)
+        note = "  📎 photo" if (image or url) else ""
+        self._log("you", text + note)
+        self._set_image(None)
+        self._busy(True, "looking…" if (image or url) else "thinking…")

        def work():
-            return self.c.run_command(text)
+            path = image
+            if path is None and url:
+                path = driver.fetch_image(url)         # download the linked image
+            return self.c.run_command(text, image_path=path)

        def done(summary):
            self._busy(False)
--- a/src/woodshop/gui/controller.py
+++ b/src/woodshop/gui/controller.py
@ -427,9 +427,9 @@ class Controller(QObject):
        except (SceneError, ValueError, KeyError) as exc:
            return str(exc).strip('"')

-    def run_command(self, text: str) -> str:
-        """Interpret a spoken/typed command and apply it. Returns a spoken summary.
-        (Slow — call from a worker thread.)"""
+    def run_command(self, text: str, image_path: str | None = None) -> str:
+        """Interpret a spoken/typed command (optionally with a reference photo) and
+        apply it. Returns a spoken summary. (Slow — call from a worker thread.)"""
        from ..scene import spatial_summary
        self.save()  # ensure disk reflects current state
        sel = ", ".join(self.selected) if self.selected else "none"
@ -437,7 +437,7 @@ class Controller(QObject):
                      + f"\nCurrently selected ('these' / 'them' / 'the selected'): {sel}"
                      + "\n" + spatial_summary(self.scene))
        calls = driver.interpret(text, self.schemas(), scene_text=scene_text,
-                                 history=self._history)
+                                 history=self._history, image_path=image_path)
        messages = driver.dispatch(calls, verbose=False, executor=self.execute_call)
        self._commit()
        spoken = driver.summarize(calls, messages)
--- a/tests/test_command_bar.py
+++ b/tests/test_command_bar.py
@ -0,0 +1,38 @@
+"""Offscreen smoke tests for the command bar's image attachment."""
+import os
+
+import pytest
+
+os.environ.setdefault("QT_QPA_PLATFORM", "offscreen")
+pytest.importorskip("PySide6")
+
+from PySide6.QtCore import QThreadPool  # noqa: E402
+from PySide6.QtWidgets import QApplication  # noqa: E402
+
+from woodshop.gui.command_bar import CommandBar  # noqa: E402
+from woodshop.gui.controller import Controller  # noqa: E402
+
+_app = QApplication.instance() or QApplication([])
+
+
+def test_attach_sets_pending_and_chip(tmp_path):
+    """Attaching sets the pending path and chip text; clearing resets both."""
+    photo = tmp_path / "chair.png"
+    photo.write_bytes(b"\x89PNG")
+    controller = Controller(str(tmp_path / "s.json"))
+    bar = CommandBar(controller, QThreadPool.globalInstance())
+
+    bar._set_image(str(photo))
+    assert bar._pending_image == str(photo)
+    assert "chair.png" in bar.image_chip.text()
+
+    bar._set_image(None)
+    assert bar._pending_image is None
+    assert bar.image_chip.text() == ""
+
+
+def test_send_with_only_image_uses_default_text(tmp_path, monkeypatch):
+    """Sending with a photo attached and no text runs the default prompt."""
+    controller = Controller(str(tmp_path / "s.json"))
+    bar = CommandBar(controller, QThreadPool.globalInstance())
+    sent = []
+    monkeypatch.setattr(bar, "_run", sent.append)    # record instead of running
+    bar._set_image(str(tmp_path / "x.png"))
+    bar.input.clear()
+    bar._send()
+    assert sent[0] == "build something like this"
--- a/tests/test_driver.py
+++ b/tests/test_driver.py
@ -146,3 +146,44 @@ def test_woodshop_cmd_falls_back_to_module(monkeypatch):
    monkeypatch.setattr(driver.shutil, "which", lambda name: None)
    cmd = driver.woodshop_cmd()
    assert cmd[1:] == ["-m", "woodshop"] and cmd[0]   # python -m woodshop
+
+
+def test_find_image_url():
+    """find_image_url returns the image link in a sentence and None when there is none."""
+    expectations = [
+        ("build like this https://x.com/chair.jpg please", "https://x.com/chair.jpg"),
+        ("https://x.com/a.PNG", "https://x.com/a.PNG"),       # extension case is ignored
+        ("no image here http://x.com/page", None),
+    ]
+    for utterance, expected in expectations:
+        assert driver.find_image_url(utterance) == expected
+
+
+def test_interpret_includes_image_directive(monkeypatch, tmp_path):
+    """With image_path set, the prompt sent to the model names the photo and its path."""
+    prompts = []
+
+    def fake_run(cmd, stdin=""):
+        prompts.append(stdin)
+        return "[]"
+
+    ref = tmp_path / "ref.jpg"
+    ref.write_bytes(b"\xff\xd8\xff")               # not a real jpeg, just a path
+    monkeypatch.setattr(driver, "_run", fake_run)
+    driver.interpret("build something like this", schemas="[]", scene_text="empty",
+                     image_path=str(ref))
+    prompt = prompts[-1]
+    assert "REFERENCE PHOTO" in prompt
+    assert str(ref) in prompt
+
+
+def test_fetch_image_writes_temp(monkeypatch):
+    """fetch_image writes the response body to a temp file with a matching suffix."""
+    import os as _os
+
+    class FakeResp:
+        # minimal stand-in for the urlopen() response used by fetch_image
+        headers = {"Content-Type": "image/png"}
+        def __enter__(self): return self
+        def __exit__(self, *a): return False
+        def read(self): return b"\x89PNG\r\n\x1a\n"
+
+    monkeypatch.setattr(driver.urllib.request, "urlopen", lambda *a, **k: FakeResp())
+    path = driver.fetch_image("https://x.com/chair.png")
+    try:
+        assert path.endswith(".png")
+        with open(path, "rb") as f:
+            assert f.read().startswith(b"\x89PNG")
+    finally:
+        _os.remove(path)        # clean up even when an assertion above fails
--- a/tests/test_gui_controller.py
+++ b/tests/test_gui_controller.py
@ -145,7 +145,7 @@ def test_run_command_threads_history(tmp_path, monkeypatch):
    c = _controller(tmp_path)
    seen = {}

-    def fake_interpret(text, schemas, scene_text=None, history=None):
+    def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
        seen["history"] = list(history or [])
        return [{"tool": "say", "args": {"text": "want me to add tenons?"}}]

@ -156,3 +156,16 @@ def test_run_command_threads_history(tmp_path, monkeypatch):

    c.run_command("yes")
    assert seen["history"] == [("build a table", "want me to add tenons?")]
+
+
+def test_run_command_forwards_image_path(tmp_path, monkeypatch):
+    """run_command passes its image_path argument through to driver.interpret."""
+    received = []
+
+    def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
+        received.append(image_path)
+        return [{"tool": "say", "args": {"text": "ok"}}]
+
+    controller = _controller(tmp_path)
+    monkeypatch.setattr(driver, "interpret", fake_interpret)
+    controller.run_command("build like this", image_path="/tmp/ref.jpg")
+    assert received[-1] == "/tmp/ref.jpg"