Add reference-photo input: "build something like this"
Attach a photo (📎 button, drag-drop, paste, or an image URL) and the driver hands it to claude -p, which reads the image (its Read tool sees images) and emits the usual tool-call JSON to build a simplified, buildable interpretation in dimensional lumber — no API keys, same claude -p pipe.

- driver: interpret(image_path=) prepends a reference-photo directive with the image's absolute path; find_image_url() + fetch_image() download a linked image to a temp file; woodshop-talk --image (path or URL) for CLI/voice.
- controller.run_command(image_path=) passthrough.
- command bar: 📎 attach (file picker), drag-drop image, Ctrl+V paste image, and image-URL-in-text detection; downloads run off the UI thread; an image chip shows/clears the attachment.
- tests: URL detection, image directive in prompt, fetch_image temp write, controller passthrough, command-bar attach + default-text smoke. 216 pass.

Honest limit: the live image round-trip needs a real display/model call to verify — wired + unit-tested, please confirm it sees the photo on your machine.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b9b0871ac3
commit
c623ad2576
|
|
@ -8,6 +8,11 @@ Talk to it like the Star Trek holodeck and watch furniture build itself:
|
|||
|
||||
> *"Build a coffee table: a four foot by two foot frame from 2x4s, with four legs 18 inches tall standing at the corners."*
|
||||
|
||||
You can also **attach a reference photo** (📎, drag-drop, paste, or an image URL)
|
||||
and say *"build something like this"* — WoodShop hands the image to the model and
|
||||
builds a simplified, buildable interpretation in dimensional lumber that you then
|
||||
refine by voice/text. (It's an interpretation, not a measured replica.)
|
||||
|
||||
Each board is real dimensional lumber (a 2x4 is modeled at its true 1.5″ × 3.5″),
|
||||
so the result is buildable — export to **STEP** (CAD/CNC) or **STL** (3D print),
|
||||
and get a **cut list with board-feet and a shopping estimate**.
|
||||
|
|
|
|||
|
|
@ -17,15 +17,53 @@ from __future__ import annotations
|
|||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import urllib.request
|
||||
|
||||
TOOL_FILTER = "wood-*" # auto-discover every wood-* tool, no hardcoded list
|
||||
REASON_PROVIDER = "claude -p" # chosen for reliable structured tool-calling
|
||||
_MAX_HISTORY = 6 # turns of recent conversation fed back for reference-resolution
|
||||
|
||||
# A reference photo can be attached to "build something like this". claude -p
|
||||
# reads the image file (via its Read tool), so we just hand it an absolute path.
|
||||
_IMG_URL = re.compile(r'https?://\S+?\.(?:png|jpe?g|webp|gif|bmp)\b', re.I)
|
||||
_IMAGE_DIRECTIVE = (
|
||||
"A REFERENCE PHOTO of furniture is saved at this path:\n {path}\n"
|
||||
"Open and look at that image file. The user wants to build something LIKE it "
|
||||
"from dimensional lumber and plywood. Infer the major parts, rough proportions, "
|
||||
"and joinery, and emit the tool calls to build a SIMPLIFIED, buildable version "
|
||||
"with reasonable real dimensions in inches. This is an interpretation, not an "
|
||||
"exact replica — prefer standard stock sizes and right angles.\n\n"
|
||||
)
|
||||
|
||||
|
||||
def find_image_url(text: str) -> str | None:
|
||||
m = _IMG_URL.search(text or "")
|
||||
return m.group(0) if m else None
|
||||
|
||||
|
||||
def fetch_image(url: str, timeout: int = 20) -> str:
    """Download an image URL to a temp file and return the file's path.

    Args:
        url: An ``http://`` or ``https://`` URL of the image.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        Path of a newly created temp file (prefix ``woodshop-ref-``) holding
        the downloaded bytes. The caller owns the file.

    Raises:
        ValueError: if *url* is not an http(s) URL.
        Exception: anything raised by the download or the write is propagated
            (caller decides how to surface it).
    """
    from urllib.parse import urlsplit

    # urlopen also understands file:, ftp: and data: URLs; this helper is only
    # meant to fetch web images, so refuse everything else before any I/O.
    if urlsplit(url).scheme.lower() not in ("http", "https"):
        raise ValueError(f"unsupported image URL (need http/https): {url!r}")

    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        ctype = (resp.headers.get("Content-Type") or "").split(";")[0].strip().lower()
        data = resp.read()

    # Prefer the extension implied by the Content-Type header; otherwise take
    # one from the URL, and fall back to a generic ".img" suffix.
    ext = {"image/png": ".png", "image/jpeg": ".jpg", "image/webp": ".webp",
           "image/gif": ".gif", "image/bmp": ".bmp"}.get(ctype)
    if ext is None:
        m = re.search(r'\.(png|jpe?g|webp|gif|bmp)\b', url, re.I)
        ext = "." + m.group(1).lower() if m else ".img"

    fd, path = tempfile.mkstemp(suffix=ext, prefix="woodshop-ref-")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
    except BaseException:
        # Don't leave a partial/empty temp file behind when the write fails.
        try:
            os.remove(path)
        except OSError:
            pass
        raise
    return path
|
||||
|
||||
# A board placed earlier in the SAME utterance is referenced as $1, $2, ...
|
||||
_SYMBOL = re.compile(r"\$(\d+)")
|
||||
|
||||
|
|
@ -143,10 +181,13 @@ def _render_history(history: list[tuple[str, str]] | None) -> str:
|
|||
|
||||
|
||||
def interpret(utterance: str, schemas: str, scene_text: str | None = None,
|
||||
history: list[tuple[str, str]] | None = None) -> list[dict]:
|
||||
history: list[tuple[str, str]] | None = None,
|
||||
image_path: str | None = None) -> list[dict]:
|
||||
scene = scene_text if scene_text is not None else scene_summary()
|
||||
prompt = SYSTEM.format(schemas=schemas, scene=scene, utterance=utterance,
|
||||
history=_render_history(history))
|
||||
if image_path:
|
||||
prompt = _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path)) + prompt
|
||||
raw = _run(REASON_PROVIDER.split(), stdin=prompt)
|
||||
calls = _extract_calls(raw)
|
||||
if calls is None:
|
||||
|
|
@ -242,8 +283,9 @@ def summarize(calls: list[dict], messages: list[str]) -> str:
|
|||
|
||||
|
||||
def handle(utterance: str, schemas: str, voice: bool, verbose: bool,
|
||||
history: list[tuple[str, str]] | None = None) -> None:
|
||||
calls = interpret(utterance, schemas, history=history)
|
||||
history: list[tuple[str, str]] | None = None,
|
||||
image_path: str | None = None) -> None:
|
||||
calls = interpret(utterance, schemas, history=history, image_path=image_path)
|
||||
messages = dispatch(calls, verbose=verbose)
|
||||
full = " ".join(m for m in messages if m).strip()
|
||||
spoken = summarize(calls, messages)
|
||||
|
|
@ -271,9 +313,14 @@ def main(argv: list[str] | None = None) -> int:
|
|||
ap.add_argument("--voice", action="store_true", help="Listen on the mic instead of typing")
|
||||
ap.add_argument("--duration", type=int, default=6, help="Mic recording seconds (--voice)")
|
||||
ap.add_argument("--once", help="Run a single command (non-interactive) and exit")
|
||||
ap.add_argument("--image", help="Reference photo (path or URL) for 'build something like this'")
|
||||
ap.add_argument("--quiet", action="store_true", help="Don't print per-call detail")
|
||||
args = ap.parse_args(argv)
|
||||
|
||||
image_path = None
|
||||
if args.image:
|
||||
image_path = fetch_image(args.image) if args.image.startswith("http") else args.image
|
||||
|
||||
schemas = load_schemas()
|
||||
if not schemas:
|
||||
print("Could not load wood-* tool schemas (is CmdForge/pa-load-tools available?)",
|
||||
|
|
@ -281,7 +328,8 @@ def main(argv: list[str] | None = None) -> int:
|
|||
return 1
|
||||
|
||||
if args.once is not None:
|
||||
handle(args.once, schemas, voice=args.voice, verbose=not args.quiet)
|
||||
handle(args.once, schemas, voice=args.voice, verbose=not args.quiet,
|
||||
image_path=image_path)
|
||||
return 0
|
||||
|
||||
print("WoodShop ready. Say things like 'place a 6 foot 2x4'. Ctrl-C to quit.")
|
||||
|
|
@ -295,7 +343,8 @@ def main(argv: list[str] | None = None) -> int:
|
|||
return 0
|
||||
try:
|
||||
handle(utterance, schemas, voice=args.voice, verbose=not args.quiet,
|
||||
history=history)
|
||||
history=history, image_path=image_path)
|
||||
image_path = None # the reference photo applies to the first turn only
|
||||
except Exception as exc: # never let one bad command kill the session
|
||||
print(f"WoodShop: sorry, that command failed ({exc}).")
|
||||
|
||||
|
|
|
|||
|
|
@ -1,17 +1,23 @@
|
|||
"""Command bar: type a command or push-to-talk, see the transcript, optionally
|
||||
hear the reply. Slow work (LLM, dictate, TTS) runs off the UI thread."""
|
||||
hear the reply. You can also attach a reference photo ("build something like
|
||||
this") by the 📎 button, drag-drop, paste, or a pasted image URL. Slow work
|
||||
(LLM, dictate, TTS, image download) runs off the UI thread."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from PySide6.QtCore import Qt, QThreadPool
|
||||
from PySide6.QtWidgets import (QCheckBox, QHBoxLayout, QLabel, QLineEdit,
|
||||
QPushButton, QTextEdit, QVBoxLayout, QWidget)
|
||||
from PySide6.QtGui import QKeySequence
|
||||
from PySide6.QtWidgets import (QApplication, QCheckBox, QFileDialog, QHBoxLayout, QLabel,
|
||||
QLineEdit, QPushButton, QTextEdit, QVBoxLayout, QWidget)
|
||||
|
||||
from .. import driver
|
||||
from .controller import Controller
|
||||
from .workers import run_async
|
||||
|
||||
_WHO_COLOR = {"you": "#9cdcfe", "ws": "#c8965a", "sys": "#e06c75"}
|
||||
_IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp")
|
||||
|
||||
|
||||
class CommandBar(QWidget):
|
||||
|
|
@ -19,6 +25,8 @@ class CommandBar(QWidget):
|
|||
super().__init__(parent)
|
||||
self.c = controller
|
||||
self.pool = pool
|
||||
self._pending_image: str | None = None # attached reference photo path
|
||||
self.setAcceptDrops(True) # drop an image onto the bar
|
||||
|
||||
root = QVBoxLayout(self)
|
||||
self.transcript = QTextEdit(readOnly=True)
|
||||
|
|
@ -32,6 +40,12 @@ class CommandBar(QWidget):
|
|||
self.mic.clicked.connect(self._listen)
|
||||
row.addWidget(self.mic)
|
||||
|
||||
self.attach = QPushButton("📎")
|
||||
self.attach.setToolTip("Attach a reference photo — then say 'build something like this'")
|
||||
self.attach.setFixedWidth(40)
|
||||
self.attach.clicked.connect(self._attach_image)
|
||||
row.addWidget(self.attach)
|
||||
|
||||
self.input = QLineEdit()
|
||||
self.input.setPlaceholderText("Type a command, e.g. 'build a coffee table' — Enter to send")
|
||||
self.input.returnPressed.connect(self._send)
|
||||
|
|
@ -45,6 +59,9 @@ class CommandBar(QWidget):
|
|||
bottom = QHBoxLayout()
|
||||
self.speak = QCheckBox("Speak replies")
|
||||
bottom.addWidget(self.speak)
|
||||
self.image_chip = QLabel("") # shows the attached photo name
|
||||
self.image_chip.setStyleSheet("color:#c8965a")
|
||||
bottom.addWidget(self.image_chip)
|
||||
bottom.addStretch()
|
||||
self.status = QLabel("")
|
||||
bottom.addWidget(self.status)
|
||||
|
|
@ -52,6 +69,69 @@ class CommandBar(QWidget):
|
|||
|
||||
self.c.logged.connect(self._log)
|
||||
|
||||
# ----- reference image ---------------------------------------------
|
||||
def _set_image(self, path: str | None) -> None:
    """Record *path* as the pending reference photo and refresh the chip.

    A falsy *path* clears both the chip text and its tooltip.
    """
    self._pending_image = path
    if not path:
        chip_text, chip_tip = "", ""
    else:
        chip_text = f"📎 {os.path.basename(path)} ✕"
        chip_tip = "Click to remove the attached photo"
    self.image_chip.setText(chip_text)
    self.image_chip.setToolTip(chip_tip)
|
||||
|
||||
def mousePressEvent(self, e):
    """Clear the attachment when the press lands on the image chip, then
    hand the event to the base class."""
    has_photo = bool(self._pending_image)
    # geometry() is only consulted while a photo is attached
    if has_photo and self.image_chip.geometry().contains(e.pos()):
        self._set_image(None)
    super().mousePressEvent(e)
|
||||
|
||||
def _attach_image(self) -> None:
    """Open a file picker for a reference photo and attach the chosen file.

    If nothing is chosen, nothing changes. When the input box is blank it is
    pre-filled with the default prompt.
    """
    chosen, _filter = QFileDialog.getOpenFileName(
        self, "Attach reference photo", "",
        "Images (*.png *.jpg *.jpeg *.webp *.gif *.bmp)")
    if not chosen:
        return
    self._set_image(chosen)
    typed = self.input.text().strip()
    if not typed:
        self.input.setText("build something like this")
|
||||
|
||||
def dragEnterEvent(self, e):
    """Accept the drag when it carries image data or a URL whose local file
    name ends in one of the known image extensions."""
    md = e.mimeData()
    wanted = md.hasImage()
    if not wanted:
        wanted = any(url.toLocalFile().lower().endswith(_IMAGE_EXTS)
                     for url in md.urls())
    if wanted:
        e.acceptProposedAction()
|
||||
|
||||
def dropEvent(self, e):
    """Attach the dropped image.

    The first dropped URL whose local file has an image extension wins. If
    there is none, raw image data in the drop (if any) is saved to a temp
    file instead. A blank input box is pre-filled with the default prompt
    once a photo is attached.
    """
    md = e.mimeData()
    local_files = (url.toLocalFile() for url in md.urls())
    dropped = next((p for p in local_files if p.lower().endswith(_IMAGE_EXTS)), None)
    if dropped is not None:
        self._set_image(dropped)
    elif md.hasImage():
        self._save_clipboard_image(md.imageData())
    if self._pending_image and not self.input.text().strip():
        self.input.setText("build something like this")
|
||||
|
||||
def _save_clipboard_image(self, qimage) -> None:
    """Save *qimage* as a PNG temp file and attach it as the reference photo.

    Does nothing when *qimage* is ``None`` or reports itself as null. If the
    save fails, the temp file is removed and no attachment is set.
    """
    import tempfile
    if qimage is None or qimage.isNull():
        return
    fd, path = tempfile.mkstemp(suffix=".png", prefix="woodshop-paste-")
    os.close(fd)  # only the path is needed; the image is written via qimage.save()
    if qimage.save(path, "PNG"):
        self._set_image(path)
        return
    # Save failed: mkstemp already created the (empty) file, so don't leak it.
    try:
        os.remove(path)
    except OSError:
        pass
|
||||
|
||||
def keyPressEvent(self, e):
    """On the Paste shortcut, attach the clipboard image if there is one;
    every other key press (or a paste without an image) goes to the base
    class."""
    pasted = QApplication.clipboard().image() if e.matches(QKeySequence.Paste) else None
    if pasted is None or pasted.isNull():
        super().keyPressEvent(e)
        return
    self._save_clipboard_image(pasted)
|
||||
|
||||
# ----- logging -----------------------------------------------------
|
||||
def _log(self, who: str, text: str) -> None:
|
||||
if not text:
|
||||
|
|
@ -65,26 +145,34 @@ class CommandBar(QWidget):
|
|||
def _busy(self, on: bool, msg: str = "") -> None:
    """Enable (``on=False``) or disable (``on=True``) the input box, mic and
    attach buttons, and show *msg* in the status label."""
    enabled = not on
    for widget in (self.input, self.mic, self.attach):
        widget.setEnabled(enabled)
    self.status.setText(msg)
|
||||
|
||||
# ----- send typed/spoken command -----------------------------------
|
||||
def _send(self) -> None:
|
||||
text = self.input.text().strip()
|
||||
if not text:
|
||||
if not text and not self._pending_image:
|
||||
return
|
||||
self.input.clear()
|
||||
self._run(text)
|
||||
self._run(text or "build something like this")
|
||||
|
||||
def submit(self, text: str) -> None:
|
||||
"""Run a command programmatically (e.g. from a Build-menu template)."""
|
||||
self._run(text)
|
||||
|
||||
def _run(self, text: str) -> None:
|
||||
self._log("you", text)
|
||||
self._busy(True, "thinking…")
|
||||
image = self._pending_image
|
||||
url = None if image else driver.find_image_url(text)
|
||||
note = " 📎 photo" if (image or url) else ""
|
||||
self._log("you", text + note)
|
||||
self._set_image(None)
|
||||
self._busy(True, "looking…" if (image or url) else "thinking…")
|
||||
|
||||
def work():
|
||||
return self.c.run_command(text)
|
||||
path = image
|
||||
if path is None and url:
|
||||
path = driver.fetch_image(url) # download the linked image
|
||||
return self.c.run_command(text, image_path=path)
|
||||
|
||||
def done(summary):
|
||||
self._busy(False)
|
||||
|
|
|
|||
|
|
@ -427,9 +427,9 @@ class Controller(QObject):
|
|||
except (SceneError, ValueError, KeyError) as exc:
|
||||
return str(exc).strip('"')
|
||||
|
||||
def run_command(self, text: str) -> str:
|
||||
"""Interpret a spoken/typed command and apply it. Returns a spoken summary.
|
||||
(Slow — call from a worker thread.)"""
|
||||
def run_command(self, text: str, image_path: str | None = None) -> str:
|
||||
"""Interpret a spoken/typed command (optionally with a reference photo) and
|
||||
apply it. Returns a spoken summary. (Slow — call from a worker thread.)"""
|
||||
from ..scene import spatial_summary
|
||||
self.save() # ensure disk reflects current state
|
||||
sel = ", ".join(self.selected) if self.selected else "none"
|
||||
|
|
@ -437,7 +437,7 @@ class Controller(QObject):
|
|||
+ f"\nCurrently selected ('these' / 'them' / 'the selected'): {sel}"
|
||||
+ "\n" + spatial_summary(self.scene))
|
||||
calls = driver.interpret(text, self.schemas(), scene_text=scene_text,
|
||||
history=self._history)
|
||||
history=self._history, image_path=image_path)
|
||||
messages = driver.dispatch(calls, verbose=False, executor=self.execute_call)
|
||||
self._commit()
|
||||
spoken = driver.summarize(calls, messages)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,38 @@
|
|||
"""Offscreen smoke tests for the command bar's image attachment."""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
os.environ.setdefault("QT_QPA_PLATFORM", "offscreen")
|
||||
pytest.importorskip("PySide6")
|
||||
|
||||
from PySide6.QtCore import QThreadPool # noqa: E402
|
||||
from PySide6.QtWidgets import QApplication # noqa: E402
|
||||
|
||||
from woodshop.gui.command_bar import CommandBar # noqa: E402
|
||||
from woodshop.gui.controller import Controller # noqa: E402
|
||||
|
||||
_app = QApplication.instance() or QApplication([])
|
||||
|
||||
|
||||
def test_attach_sets_pending_and_chip(tmp_path):
    """Setting an image records it and names it in the chip; clearing resets both."""
    photo = tmp_path / "chair.png"
    photo.write_bytes(b"\x89PNG")
    controller = Controller(str(tmp_path / "s.json"))
    bar = CommandBar(controller, QThreadPool.globalInstance())

    bar._set_image(str(photo))
    assert bar._pending_image == str(photo)
    assert "chair.png" in bar.image_chip.text()

    bar._set_image(None)
    assert bar._pending_image is None
    assert bar.image_chip.text() == ""
|
||||
|
||||
|
||||
def test_send_with_only_image_uses_default_text(tmp_path, monkeypatch):
    """Sending with an attached photo and an empty input runs the default prompt."""
    controller = Controller(str(tmp_path / "s.json"))
    bar = CommandBar(controller, QThreadPool.globalInstance())
    sent = []
    monkeypatch.setattr(bar, "_run", sent.append)  # capture instead of running

    bar._set_image(str(tmp_path / "x.png"))
    bar.input.clear()
    bar._send()

    assert sent[0] == "build something like this"
|
||||
|
|
@ -146,3 +146,44 @@ def test_woodshop_cmd_falls_back_to_module(monkeypatch):
|
|||
monkeypatch.setattr(driver.shutil, "which", lambda name: None)
|
||||
cmd = driver.woodshop_cmd()
|
||||
assert cmd[1:] == ["-m", "woodshop"] and cmd[0] # python -m woodshop
|
||||
|
||||
|
||||
def test_find_image_url():
    """An image link is found inside text (case-insensitively); a plain page URL is not."""
    embedded = driver.find_image_url("build like this https://x.com/chair.jpg please")
    assert embedded == "https://x.com/chair.jpg"

    upper_ext = driver.find_image_url("https://x.com/a.PNG")
    assert upper_ext == "https://x.com/a.PNG"

    not_an_image = driver.find_image_url("no image here http://x.com/page")
    assert not_an_image is None
|
||||
|
||||
|
||||
def test_interpret_includes_image_directive(monkeypatch, tmp_path):
    """interpret() with image_path puts the directive and the path in the prompt."""
    prompts = {}

    def fake_run(cmd, stdin=""):
        # stand-in for driver._run: record the prompt, reply with no tool calls
        prompts["prompt"] = stdin
        return "[]"

    monkeypatch.setattr(driver, "_run", fake_run)
    ref = tmp_path / "ref.jpg"
    ref.write_bytes(b"\xff\xd8\xff")  # not a real jpeg, just a path

    driver.interpret("build something like this", schemas="[]", scene_text="empty",
                     image_path=str(ref))

    sent = prompts["prompt"]
    assert "REFERENCE PHOTO" in sent
    assert str(ref) in sent
|
||||
|
||||
|
||||
def test_fetch_image_writes_temp(monkeypatch):
    """fetch_image() writes the response body to a temp file with a .png suffix."""
    import os as _os

    class FakeResp:
        # minimal stand-in for the object urlopen() returns
        headers = {"Content-Type": "image/png"}
        def __enter__(self): return self
        def __exit__(self, *a): return False
        def read(self): return b"\x89PNG\r\n\x1a\n"

    monkeypatch.setattr(driver.urllib.request, "urlopen", lambda *a, **k: FakeResp())
    path = driver.fetch_image("https://x.com/chair.png")
    try:
        assert path.endswith(".png")
        with open(path, "rb") as f:
            assert f.read().startswith(b"\x89PNG")
    finally:
        # remove the temp file even when an assertion above fails
        _os.remove(path)
|
||||
|
|
|
|||
|
|
@ -145,7 +145,7 @@ def test_run_command_threads_history(tmp_path, monkeypatch):
|
|||
c = _controller(tmp_path)
|
||||
seen = {}
|
||||
|
||||
def fake_interpret(text, schemas, scene_text=None, history=None):
|
||||
def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
|
||||
seen["history"] = list(history or [])
|
||||
return [{"tool": "say", "args": {"text": "want me to add tenons?"}}]
|
||||
|
||||
|
|
@ -156,3 +156,16 @@ def test_run_command_threads_history(tmp_path, monkeypatch):
|
|||
|
||||
c.run_command("yes")
|
||||
assert seen["history"] == [("build a table", "want me to add tenons?")]
|
||||
|
||||
|
||||
def test_run_command_forwards_image_path(tmp_path, monkeypatch):
    """run_command(image_path=...) hands the same path to driver.interpret."""
    received = {}

    def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
        # record what the controller forwarded; reply with a single "say" call
        received["image_path"] = image_path
        return [{"tool": "say", "args": {"text": "ok"}}]

    controller = _controller(tmp_path)
    monkeypatch.setattr(driver, "interpret", fake_interpret)

    controller.run_command("build like this", image_path="/tmp/ref.jpg")

    assert received["image_path"] == "/tmp/ref.jpg"
|
||||
|
|
|
|||
Loading…
Reference in New Issue