From c623ad2576302a289579efcc010b6170b786e29d Mon Sep 17 00:00:00 2001 From: rob Date: Sat, 30 May 2026 22:20:57 -0300 Subject: [PATCH] Add reference-photo input: "build something like this" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Attach a photo (📎 button, drag-drop, paste, or an image URL) and the driver hands it to claude -p, which reads the image (its Read tool sees images) and emits the usual tool-call JSON to build a simplified, buildable interpretation in dimensional lumber — no API keys, same claude -p pipe. - driver: interpret(image_path=) prepends a reference-photo directive with the image's absolute path; find_image_url() + fetch_image() download a linked image to a temp file; woodshop-talk --image (path or URL) for CLI/voice. - controller.run_command(image_path=) passthrough. - command bar: 📎 attach (file picker), drag-drop image, Ctrl+V paste image, and image-URL-in-text detection; downloads run off the UI thread; an image chip shows/clears the attachment. - tests: URL detection, image directive in prompt, fetch_image temp write, controller passthrough, command-bar attach + default-text smoke. 216 pass. Honest limit: the live image round-trip needs a real display/model call to verify — wired + unit-tested, please confirm it sees the photo on your machine. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 5 ++ src/woodshop/driver.py | 59 ++++++++++++++++-- src/woodshop/gui/command_bar.py | 104 +++++++++++++++++++++++++++++--- src/woodshop/gui/controller.py | 8 +-- tests/test_command_bar.py | 38 ++++++++++++ tests/test_driver.py | 41 +++++++++++++ tests/test_gui_controller.py | 15 ++++- 7 files changed, 252 insertions(+), 18 deletions(-) create mode 100644 tests/test_command_bar.py diff --git a/README.md b/README.md index a330fba..c742220 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,11 @@ Talk to it like the Star Trek holodeck and watch furniture build itself: > *"Build a coffee table: a four foot by two foot frame from 2x4s, with four legs 18 inches tall standing at the corners."* +You can also **attach a reference photo** (📎, drag-drop, paste, or an image URL) +and say *"build something like this"* — WoodShop hands the image to the model and +builds a simplified, buildable interpretation in dimensional lumber that you then +refine by voice/text. (It's an interpretation, not a measured replica.) + Each board is real dimensional lumber (a 2x4 is modeled at its true 1.5″ × 3.5″), so the result is buildable — export to **STEP** (CAD/CNC) or **STL** (3D print), and get a **cut list with board-feet and a shopping estimate**. diff --git a/src/woodshop/driver.py b/src/woodshop/driver.py index edbae37..b1d3baa 100644 --- a/src/woodshop/driver.py +++ b/src/woodshop/driver.py @@ -17,15 +17,53 @@ from __future__ import annotations import argparse import json +import os import re import shutil import subprocess import sys +import tempfile +import urllib.request TOOL_FILTER = "wood-*" # auto-discover every wood-* tool, no hardcoded list REASON_PROVIDER = "claude -p" # chosen for reliable structured tool-calling _MAX_HISTORY = 6 # turns of recent conversation fed back for reference-resolution +# A reference photo can be attached to "build something like this". 
claude -p +# reads the image file (via its Read tool), so we just hand it an absolute path. +_IMG_URL = re.compile(r'https?://\S+?\.(?:png|jpe?g|webp|gif|bmp)\b', re.I) +_IMAGE_DIRECTIVE = ( + "A REFERENCE PHOTO of furniture is saved at this path:\n {path}\n" + "Open and look at that image file. The user wants to build something LIKE it " + "from dimensional lumber and plywood. Infer the major parts, rough proportions, " + "and joinery, and emit the tool calls to build a SIMPLIFIED, buildable version " + "with reasonable real dimensions in inches. This is an interpretation, not an " + "exact replica β€” prefer standard stock sizes and right angles.\n\n" +) + + +def find_image_url(text: str) -> str | None: + m = _IMG_URL.search(text or "") + return m.group(0) if m else None + + +def fetch_image(url: str, timeout: int = 20) -> str: + """Download an image URL to a temp file and return its path. Raises on + failure (caller decides how to surface it).""" + req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) + with urllib.request.urlopen(req, timeout=timeout) as resp: + ctype = (resp.headers.get("Content-Type") or "").split(";")[0].strip().lower() + data = resp.read() + ext = {"image/png": ".png", "image/jpeg": ".jpg", "image/webp": ".webp", + "image/gif": ".gif", "image/bmp": ".bmp"}.get(ctype) + if ext is None: + m = re.search(r'\.(png|jpe?g|webp|gif|bmp)\b', url, re.I) + ext = "." + m.group(1).lower() if m else ".img" + fd, path = tempfile.mkstemp(suffix=ext, prefix="woodshop-ref-") + with os.fdopen(fd, "wb") as f: + f.write(data) + return path + # A board placed earlier in the SAME utterance is referenced as $1, $2, ... 
_SYMBOL = re.compile(r"\$(\d+)") @@ -143,10 +181,13 @@ def _render_history(history: list[tuple[str, str]] | None) -> str: def interpret(utterance: str, schemas: str, scene_text: str | None = None, - history: list[tuple[str, str]] | None = None) -> list[dict]: + history: list[tuple[str, str]] | None = None, + image_path: str | None = None) -> list[dict]: scene = scene_text if scene_text is not None else scene_summary() prompt = SYSTEM.format(schemas=schemas, scene=scene, utterance=utterance, history=_render_history(history)) + if image_path: + prompt = _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path)) + prompt raw = _run(REASON_PROVIDER.split(), stdin=prompt) calls = _extract_calls(raw) if calls is None: @@ -242,8 +283,9 @@ def summarize(calls: list[dict], messages: list[str]) -> str: def handle(utterance: str, schemas: str, voice: bool, verbose: bool, - history: list[tuple[str, str]] | None = None) -> None: - calls = interpret(utterance, schemas, history=history) + history: list[tuple[str, str]] | None = None, + image_path: str | None = None) -> None: + calls = interpret(utterance, schemas, history=history, image_path=image_path) messages = dispatch(calls, verbose=verbose) full = " ".join(m for m in messages if m).strip() spoken = summarize(calls, messages) @@ -271,9 +313,14 @@ def main(argv: list[str] | None = None) -> int: ap.add_argument("--voice", action="store_true", help="Listen on the mic instead of typing") ap.add_argument("--duration", type=int, default=6, help="Mic recording seconds (--voice)") ap.add_argument("--once", help="Run a single command (non-interactive) and exit") + ap.add_argument("--image", help="Reference photo (path or URL) for 'build something like this'") ap.add_argument("--quiet", action="store_true", help="Don't print per-call detail") args = ap.parse_args(argv) + image_path = None + if args.image: + image_path = fetch_image(args.image) if args.image.startswith("http") else args.image + schemas = load_schemas() if not schemas: 
print("Could not load wood-* tool schemas (is CmdForge/pa-load-tools available?)", @@ -281,7 +328,8 @@ def main(argv: list[str] | None = None) -> int: return 1 if args.once is not None: - handle(args.once, schemas, voice=args.voice, verbose=not args.quiet) + handle(args.once, schemas, voice=args.voice, verbose=not args.quiet, + image_path=image_path) return 0 print("WoodShop ready. Say things like 'place a 6 foot 2x4'. Ctrl-C to quit.") @@ -295,7 +343,8 @@ def main(argv: list[str] | None = None) -> int: return 0 try: handle(utterance, schemas, voice=args.voice, verbose=not args.quiet, - history=history) + history=history, image_path=image_path) + image_path = None # the reference photo applies to the first turn only except Exception as exc: # never let one bad command kill the session print(f"WoodShop: sorry, that command failed ({exc}).") diff --git a/src/woodshop/gui/command_bar.py b/src/woodshop/gui/command_bar.py index 19ce91d..6a5123a 100644 --- a/src/woodshop/gui/command_bar.py +++ b/src/woodshop/gui/command_bar.py @@ -1,17 +1,23 @@ """Command bar: type a command or push-to-talk, see the transcript, optionally -hear the reply. Slow work (LLM, dictate, TTS) runs off the UI thread.""" +hear the reply. You can also attach a reference photo ("build something like +this") by the πŸ“Ž button, drag-drop, paste, or a pasted image URL. Slow work +(LLM, dictate, TTS, image download) runs off the UI thread.""" from __future__ import annotations +import os import subprocess from PySide6.QtCore import Qt, QThreadPool -from PySide6.QtWidgets import (QCheckBox, QHBoxLayout, QLabel, QLineEdit, - QPushButton, QTextEdit, QVBoxLayout, QWidget) +from PySide6.QtGui import QKeySequence +from PySide6.QtWidgets import (QApplication, QCheckBox, QFileDialog, QHBoxLayout, QLabel, + QLineEdit, QPushButton, QTextEdit, QVBoxLayout, QWidget) +from .. 
import driver from .controller import Controller from .workers import run_async _WHO_COLOR = {"you": "#9cdcfe", "ws": "#c8965a", "sys": "#e06c75"} +_IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp") class CommandBar(QWidget): @@ -19,6 +25,8 @@ class CommandBar(QWidget): super().__init__(parent) self.c = controller self.pool = pool + self._pending_image: str | None = None # attached reference photo path + self.setAcceptDrops(True) # drop an image onto the bar root = QVBoxLayout(self) self.transcript = QTextEdit(readOnly=True) @@ -32,6 +40,12 @@ class CommandBar(QWidget): self.mic.clicked.connect(self._listen) row.addWidget(self.mic) + self.attach = QPushButton("πŸ“Ž") + self.attach.setToolTip("Attach a reference photo β€” then say 'build something like this'") + self.attach.setFixedWidth(40) + self.attach.clicked.connect(self._attach_image) + row.addWidget(self.attach) + self.input = QLineEdit() self.input.setPlaceholderText("Type a command, e.g. 'build a coffee table' β€” Enter to send") self.input.returnPressed.connect(self._send) @@ -45,6 +59,9 @@ class CommandBar(QWidget): bottom = QHBoxLayout() self.speak = QCheckBox("Speak replies") bottom.addWidget(self.speak) + self.image_chip = QLabel("") # shows the attached photo name + self.image_chip.setStyleSheet("color:#c8965a") + bottom.addWidget(self.image_chip) bottom.addStretch() self.status = QLabel("") bottom.addWidget(self.status) @@ -52,6 +69,69 @@ class CommandBar(QWidget): self.c.logged.connect(self._log) + # ----- reference image --------------------------------------------- + def _set_image(self, path: str | None) -> None: + self._pending_image = path + if path: + name = os.path.basename(path) + self.image_chip.setText(f"πŸ“Ž {name} βœ•") + self.image_chip.setToolTip("Click to remove the attached photo") + else: + self.image_chip.setText("") + self.image_chip.setToolTip("") + + def mousePressEvent(self, e): + # click the chip text to clear the attachment + if self._pending_image and 
self.image_chip.geometry().contains(e.pos()): + self._set_image(None) + super().mousePressEvent(e) + + def _attach_image(self) -> None: + path, _ = QFileDialog.getOpenFileName( + self, "Attach reference photo", "", + "Images (*.png *.jpg *.jpeg *.webp *.gif *.bmp)") + if path: + self._set_image(path) + if not self.input.text().strip(): + self.input.setText("build something like this") + + def dragEnterEvent(self, e): + md = e.mimeData() + if md.hasImage() or any(u.toLocalFile().lower().endswith(_IMAGE_EXTS) + for u in md.urls()): + e.acceptProposedAction() + + def dropEvent(self, e): + md = e.mimeData() + for u in md.urls(): + p = u.toLocalFile() + if p.lower().endswith(_IMAGE_EXTS): + self._set_image(p) + break + else: + if md.hasImage(): + self._save_clipboard_image(md.imageData()) + if self._pending_image and not self.input.text().strip(): + self.input.setText("build something like this") + + def _save_clipboard_image(self, qimage) -> None: + import tempfile + if qimage is None or qimage.isNull(): + return + fd, path = tempfile.mkstemp(suffix=".png", prefix="woodshop-paste-") + os.close(fd) + if qimage.save(path, "PNG"): + self._set_image(path) + + def keyPressEvent(self, e): + # paste an image straight from the clipboard (Ctrl+V) when the bar has focus + if e.matches(QKeySequence.Paste): + img = QApplication.clipboard().image() + if not img.isNull(): + self._save_clipboard_image(img) + return + super().keyPressEvent(e) + # ----- logging ----------------------------------------------------- def _log(self, who: str, text: str) -> None: if not text: @@ -65,26 +145,34 @@ class CommandBar(QWidget): def _busy(self, on: bool, msg: str = "") -> None: self.input.setEnabled(not on) self.mic.setEnabled(not on) + self.attach.setEnabled(not on) self.status.setText(msg) # ----- send typed/spoken command ----------------------------------- def _send(self) -> None: text = self.input.text().strip() - if not text: + if not text and not self._pending_image: return 
self.input.clear() - self._run(text) + self._run(text or "build something like this") def submit(self, text: str) -> None: """Run a command programmatically (e.g. from a Build-menu template).""" self._run(text) def _run(self, text: str) -> None: - self._log("you", text) - self._busy(True, "thinking…") + image = self._pending_image + url = None if image else driver.find_image_url(text) + note = " πŸ“Ž photo" if (image or url) else "" + self._log("you", text + note) + self._set_image(None) + self._busy(True, "looking…" if (image or url) else "thinking…") def work(): - return self.c.run_command(text) + path = image + if path is None and url: + path = driver.fetch_image(url) # download the linked image + return self.c.run_command(text, image_path=path) def done(summary): self._busy(False) diff --git a/src/woodshop/gui/controller.py b/src/woodshop/gui/controller.py index ec166cc..b577bfb 100644 --- a/src/woodshop/gui/controller.py +++ b/src/woodshop/gui/controller.py @@ -427,9 +427,9 @@ class Controller(QObject): except (SceneError, ValueError, KeyError) as exc: return str(exc).strip('"') - def run_command(self, text: str) -> str: - """Interpret a spoken/typed command and apply it. Returns a spoken summary. - (Slow β€” call from a worker thread.)""" + def run_command(self, text: str, image_path: str | None = None) -> str: + """Interpret a spoken/typed command (optionally with a reference photo) and + apply it. Returns a spoken summary. 
(Slow β€” call from a worker thread.)""" from ..scene import spatial_summary self.save() # ensure disk reflects current state sel = ", ".join(self.selected) if self.selected else "none" @@ -437,7 +437,7 @@ class Controller(QObject): + f"\nCurrently selected ('these' / 'them' / 'the selected'): {sel}" + "\n" + spatial_summary(self.scene)) calls = driver.interpret(text, self.schemas(), scene_text=scene_text, - history=self._history) + history=self._history, image_path=image_path) messages = driver.dispatch(calls, verbose=False, executor=self.execute_call) self._commit() spoken = driver.summarize(calls, messages) diff --git a/tests/test_command_bar.py b/tests/test_command_bar.py new file mode 100644 index 0000000..3e6f251 --- /dev/null +++ b/tests/test_command_bar.py @@ -0,0 +1,38 @@ +"""Offscreen smoke tests for the command bar's image attachment.""" +import os + +import pytest + +os.environ.setdefault("QT_QPA_PLATFORM", "offscreen") +pytest.importorskip("PySide6") + +from PySide6.QtCore import QThreadPool # noqa: E402 +from PySide6.QtWidgets import QApplication # noqa: E402 + +from woodshop.gui.command_bar import CommandBar # noqa: E402 +from woodshop.gui.controller import Controller # noqa: E402 + +_app = QApplication.instance() or QApplication([]) + + +def test_attach_sets_pending_and_chip(tmp_path): + c = Controller(str(tmp_path / "s.json")) + bar = CommandBar(c, QThreadPool.globalInstance()) + img = tmp_path / "chair.png" + img.write_bytes(b"\x89PNG") + bar._set_image(str(img)) + assert bar._pending_image == str(img) + assert "chair.png" in bar.image_chip.text() + bar._set_image(None) + assert bar._pending_image is None and bar.image_chip.text() == "" + + +def test_send_with_only_image_uses_default_text(tmp_path, monkeypatch): + c = Controller(str(tmp_path / "s.json")) + bar = CommandBar(c, QThreadPool.globalInstance()) + calls = {} + monkeypatch.setattr(bar, "_run", lambda text: calls.setdefault("text", text)) + bar._set_image(str(tmp_path / "x.png")) + 
bar.input.clear() + bar._send() + assert calls["text"] == "build something like this" diff --git a/tests/test_driver.py b/tests/test_driver.py index 223d9f2..67122e1 100644 --- a/tests/test_driver.py +++ b/tests/test_driver.py @@ -146,3 +146,44 @@ def test_woodshop_cmd_falls_back_to_module(monkeypatch): monkeypatch.setattr(driver.shutil, "which", lambda name: None) cmd = driver.woodshop_cmd() assert cmd[1:] == ["-m", "woodshop"] and cmd[0] # python -m woodshop + + +def test_find_image_url(): + assert driver.find_image_url("build like this https://x.com/chair.jpg please") \ + == "https://x.com/chair.jpg" + assert driver.find_image_url("https://x.com/a.PNG") == "https://x.com/a.PNG" + assert driver.find_image_url("no image here http://x.com/page") is None + + +def test_interpret_includes_image_directive(monkeypatch, tmp_path): + captured = {} + + def fake_run(cmd, stdin=""): + captured["prompt"] = stdin + return "[]" + + img = tmp_path / "ref.jpg" + img.write_bytes(b"\xff\xd8\xff") # not a real jpeg, just a path + monkeypatch.setattr(driver, "_run", fake_run) + driver.interpret("build something like this", schemas="[]", scene_text="empty", + image_path=str(img)) + assert "REFERENCE PHOTO" in captured["prompt"] + assert str(img) in captured["prompt"] + + +def test_fetch_image_writes_temp(monkeypatch): + import io + + class FakeResp: + headers = {"Content-Type": "image/png"} + def __enter__(self): return self + def __exit__(self, *a): return False + def read(self): return b"\x89PNG\r\n\x1a\n" + + monkeypatch.setattr(driver.urllib.request, "urlopen", lambda *a, **k: FakeResp()) + path = driver.fetch_image("https://x.com/chair.png") + assert path.endswith(".png") + with open(path, "rb") as f: + assert f.read().startswith(b"\x89PNG") + import os as _os + _os.remove(path) diff --git a/tests/test_gui_controller.py b/tests/test_gui_controller.py index 093241a..106c10f 100644 --- a/tests/test_gui_controller.py +++ b/tests/test_gui_controller.py @@ -145,7 +145,7 @@ def 
test_run_command_threads_history(tmp_path, monkeypatch): c = _controller(tmp_path) seen = {} - def fake_interpret(text, schemas, scene_text=None, history=None): + def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None): seen["history"] = list(history or []) return [{"tool": "say", "args": {"text": "want me to add tenons?"}}] @@ -156,3 +156,16 @@ def test_run_command_threads_history(tmp_path, monkeypatch): c.run_command("yes") assert seen["history"] == [("build a table", "want me to add tenons?")] + + +def test_run_command_forwards_image_path(tmp_path, monkeypatch): + c = _controller(tmp_path) + seen = {} + + def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None): + seen["image_path"] = image_path + return [{"tool": "say", "args": {"text": "ok"}}] + + monkeypatch.setattr(driver, "interpret", fake_interpret) + c.run_command("build like this", image_path="/tmp/ref.jpg") + assert seen["image_path"] == "/tmp/ref.jpg"