Reference input now accepts PDF plans, 3D models, and web links

Extends "build something like this" beyond photos:

- driver.resolve_reference(source) routes any path/URL: image/PDF → a path
  claude -p reads directly; STL/STEP/OBJ → render_mesh() renders an isometric
  PNG (pyvista; STEP via build123d→STL) and reports the bounding box; a normal
  web URL → fetch_web_text() pulls the page's visible text.
- interpret(reference_text=) injects guide/render-dims text alongside any image
  directive; handle() + controller.run_command() + woodshop-talk --ref pass it.
- command bar: picker/drag-drop accept images + .pdf + 3D files; any pasted URL
  is resolved; resolution (download/render/fetch) runs off the UI thread.
- find_image_url→find_reference_url (any URL); fetch_image→fetch_url (generic).
- tests: URL detect, image+reference-text directives, fetch_url, web-text strip,
  resolve_reference routing per kind, real STL render (skips without GL). 220 pass.

A 3D render gives the language model EXACT proportions (+ bbox) instead of a 2D guess.
Honest limit: render needs the viewer stack + working off-screen GL on your box;
the live model round-trip still wants your eyes to confirm.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
rob 2026-05-30 22:37:38 -03:00
parent c623ad2576
commit 84ae6d8756
6 changed files with 199 additions and 72 deletions

View File

@ -8,10 +8,12 @@ Talk to it like the Star Trek holodeck and watch furniture build itself:
> *"Build a coffee table: a four foot by two foot frame from 2x4s, with four legs 18 inches tall standing at the corners."*
You can also **attach a reference photo** (📎, drag-drop, paste, or an image URL)
and say *"build something like this"* — WoodShop hands the image to the model and
builds a simplified, buildable interpretation in dimensional lumber that you then
refine by voice/text. (It's an interpretation, not a measured replica.)
You can also **attach a reference** (📎, drag-drop, paste, or a URL) and say
*"build something like this"*: a **photo**, a **PDF plan**, a **3D model**
(STL/STEP/OBJ — rendered to an image, with its bounding box measured), or a
**web-page guide** (its text is pulled). WoodShop builds a simplified, buildable
interpretation in dimensional lumber that you then refine by voice/text. (It's an
interpretation, not a measured replica.)
Each board is real dimensional lumber (a 2x4 is modeled at its true 1.5″ × 3.5″),
so the result is buildable — export to **STEP** (CAD/CNC) or **STL** (3D print),

View File

@ -29,41 +29,107 @@ TOOL_FILTER = "wood-*" # auto-discover every wood-* tool, no hardcoded list
REASON_PROVIDER = "claude -p" # chosen for reliable structured tool-calling
_MAX_HISTORY = 6 # turns of recent conversation fed back for reference-resolution
# A reference photo can be attached to "build something like this". claude -p
# reads the image file (via its Read tool), so we just hand it an absolute path.
_IMG_URL = re.compile(r'https?://\S+?\.(?:png|jpe?g|webp|gif|bmp)\b', re.I)
# A reference can be attached to "build something like this": a photo, a PDF
# plan, a 3D model (STL/STEP/OBJ — rendered to an image first), or a web page
# (its text is pulled). claude -p reads images & PDFs directly via its Read tool.
IMG_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp"}
DOC_EXTS = {".pdf"} # claude -p reads PDFs too
MESH_EXTS = {".stl", ".obj", ".ply", ".step", ".stp"}
_REF_EXTS = IMG_EXTS | DOC_EXTS | MESH_EXTS
_URL = re.compile(r'https?://\S+', re.I)
_IMAGE_DIRECTIVE = (
"A REFERENCE PHOTO of furniture is saved at this path:\n {path}\n"
"Open and look at that image file. The user wants to build something LIKE it "
"from dimensional lumber and plywood. Infer the major parts, rough proportions, "
"and joinery, and emit the tool calls to build a SIMPLIFIED, buildable version "
"with reasonable real dimensions in inches. This is an interpretation, not an "
"exact replica — prefer standard stock sizes and right angles.\n\n"
)
"A REFERENCE (photo / plan drawing / 3D render) is saved at this path:\n {path}\n"
"Open and look at that file. The user wants to build something LIKE it from "
"dimensional lumber and plywood. Infer the major parts, proportions, and "
"joinery, and emit the tool calls to build a SIMPLIFIED, buildable version with "
"reasonable real dimensions in inches. An interpretation, not an exact replica "
"— prefer standard stock sizes and right angles.\n\n")
_TEXT_DIRECTIVE = (
"A build GUIDE / plan was provided as text (below). Use it to build a "
"simplified, buildable version in dimensional lumber.\n--- REFERENCE ---\n"
"{text}\n--- END REFERENCE ---\n\n")
def find_image_url(text: str) -> str | None:
m = _IMG_URL.search(text or "")
def find_reference_url(text: str) -> str | None:
    """Return the first http(s) URL found in *text*, or None if there is none.

    URLs pasted into a sentence usually drag punctuation along ("...chair.jpg."
    or "(https://x.com/a.png)"), which would make the download fail. Trailing
    sentence punctuation is therefore trimmed, and a trailing closing bracket
    is trimmed only when the URL holds no matching opening bracket, so links
    such as ".../Table_(furniture)" stay intact.
    """
    match = _URL.search(text or "")
    if match is None:
        return None
    url = match.group(0)
    openers = {")": "(", "]": "[", "}": "{", ">": "<"}
    while url:
        tail = url[-1]
        if tail in ".,;:!?'\"":
            url = url[:-1]
        elif tail in openers and url.count(tail) > url.count(openers[tail]):
            # Unbalanced closer: it belongs to the surrounding prose.
            url = url[:-1]
        else:
            break
    return url
def fetch_image(url: str, timeout: int = 20) -> str:
"""Download an image URL to a temp file and return its path. Raises on
failure (caller decides how to surface it)."""
def _ext(name: str) -> str:
    """Return the lower-cased file extension of a path or URL ('' if none).

    Anything after the first "?" is ignored. For an http(s) URL, a "#fragment"
    is ignored as well and only the part after the host is examined, so
    "https://x.com/model.stl#view" yields ".stl" and a bare
    "https://example.com" yields "" rather than ".com". A "#" in a local path
    is kept, because it is an ordinary filename character there.
    """
    name = name.partition("?")[0]
    if name.lower().startswith(("http://", "https://")):
        name = name.partition("#")[0]
        # Drop scheme and host: "https://x.com/a/b.stl" -> "a/b.stl".
        name = name.partition("://")[2].partition("/")[2]
    return os.path.splitext(name)[1].lower()
def fetch_url(url: str, timeout: int = 20) -> str:
"""Download a URL (image / PDF / 3D file) to a temp file; return its path."""
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
with urllib.request.urlopen(req, timeout=timeout) as resp:
ctype = (resp.headers.get("Content-Type") or "").split(";")[0].strip().lower()
data = resp.read()
ext = {"image/png": ".png", "image/jpeg": ".jpg", "image/webp": ".webp",
"image/gif": ".gif", "image/bmp": ".bmp"}.get(ctype)
if ext is None:
m = re.search(r'\.(png|jpe?g|webp|gif|bmp)\b', url, re.I)
ext = "." + m.group(1).lower() if m else ".img"
"image/gif": ".gif", "image/bmp": ".bmp", "application/pdf": ".pdf",
"model/stl": ".stl", "application/sla": ".stl"}.get(ctype) or _ext(url) or ".bin"
fd, path = tempfile.mkstemp(suffix=ext, prefix="woodshop-ref-")
with os.fdopen(fd, "wb") as f:
f.write(data)
return path
def fetch_web_text(url: str, limit: int = 8000, timeout: int = 20) -> str:
    """Fetch a web page and return its visible text, cut to *limit* characters.

    The body is decoded with the charset declared in the response's
    Content-Type header when there is one (UTF-8 otherwise, and also when the
    declared charset is unknown to Python); undecodable bytes are replaced.
    <script>/<style> elements and all remaining tags are removed, HTML
    entities such as &quot; or &amp; are decoded, and runs of whitespace are
    collapsed to single spaces.

    Raises whatever urllib raises when the request fails.
    """
    from html import unescape

    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
        headers = getattr(resp, "headers", None)

    # Not every response object carries a message-style header block, so probe
    # for the accessor instead of assuming it.
    get_charset = getattr(headers, "get_content_charset", None)
    charset = (get_charset() if callable(get_charset) else None) or "utf-8"
    try:
        page = raw.decode(charset, "replace")
    except LookupError:  # declared charset is not a codec Python knows
        page = raw.decode("utf-8", "replace")

    page = re.sub(r'(?is)<(script|style)[^>]*>.*?</\1>', ' ', page)
    text = re.sub(r'(?s)<[^>]+>', ' ', page)
    # Decode entities only after the tags are gone, so an escaped "&lt;b&gt;"
    # survives as literal text instead of being stripped as markup.
    text = unescape(text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:limit]
def render_mesh(path: str) -> tuple[str, str]:
    """Render a 3D model to an isometric PNG and describe its bounding box.

    *path* is an STL/OBJ/PLY file that pyvista reads directly, or a STEP file
    (.step/.stp), which is first converted to a temporary STL with build123d.
    Returns ``(png_path, dims_text)``; the caller owns the PNG. Needs the
    viewer stack (pyvista); STEP also needs build123d.

    Raises whatever the import, read or render step raises. The intermediate
    STL is always removed, and the PNG is removed if rendering fails.
    """
    from pathlib import Path

    import pyvista as pv

    if Path(path).suffix.lower() in (".step", ".stp"):
        from build123d import export_stl, import_step

        fd, stl = tempfile.mkstemp(suffix=".stl", prefix="woodshop-step-")
        os.close(fd)
        try:
            export_stl(import_step(path), stl)
            mesh = pv.read(stl)
        finally:
            # The STL is only a stepping stone; never leave it behind.
            try:
                os.remove(stl)
            except OSError:
                pass
    else:
        mesh = pv.read(path)

    b = mesh.bounds  # (xmin, xmax, ymin, ymax, zmin, zmax)
    dx, dy, dz = b[1] - b[0], b[3] - b[2], b[5] - b[4]

    fd, png = tempfile.mkstemp(suffix=".png", prefix="woodshop-render-")
    os.close(fd)
    try:
        pl = pv.Plotter(off_screen=True, window_size=(900, 700))
        try:
            pl.add_mesh(mesh, color="#c8965a", show_edges=True)
            pl.view_isometric()
            pl.screenshot(png)
        finally:
            pl.close()  # release the render window even when a step fails
    except Exception:
        # A failed render must not leave an empty PNG in the temp directory.
        try:
            os.remove(png)
        except OSError:
            pass
        raise

    dims = (f"This is a render of a 3D model; its bounding box is about "
            f"{dx:.1f} x {dy:.1f} x {dz:.1f} in the file's units (proportions are "
            f"exact — treat units as inches unless that's implausible).")
    return png, dims
def resolve_reference(source: str) -> tuple[str | None, str | None]:
    """Turn a reference (local path or URL) into ``(image_path, reference_text)``.

    The pair is what interpret() takes as ``image_path`` / ``reference_text``:

    * image or PDF (local or downloaded) -> ``(path, None)``
    * 3D file (local or downloaded)      -> ``(rendered_png, dims_text)``
    * any other http(s) URL              -> ``(None, page_text)``

    The scheme is matched case-insensitively, in line with the URL detector,
    so "HTTPS://..." is downloaded rather than mistaken for a local path.
    Raises on download/render failure.
    """
    is_url = source.lower().startswith(("http://", "https://"))
    if is_url and _ext(source) not in _REF_EXTS:
        return None, fetch_web_text(source)  # a web-page guide

    local = fetch_url(source) if is_url else source
    if _ext(local) not in MESH_EXTS:
        return local, None  # image or PDF — read directly

    try:
        return render_mesh(local)  # (png, dims)
    finally:
        # Only the render is handed on; a mesh that was downloaded into a temp
        # file is no longer needed. A user-supplied local file is never touched.
        if is_url:
            try:
                os.remove(local)
            except OSError:
                pass
# A board placed earlier in the SAME utterance is referenced as $1, $2, ...
_SYMBOL = re.compile(r"\$(\d+)")
@ -182,12 +248,16 @@ def _render_history(history: list[tuple[str, str]] | None) -> str:
def interpret(utterance: str, schemas: str, scene_text: str | None = None,
history: list[tuple[str, str]] | None = None,
image_path: str | None = None) -> list[dict]:
image_path: str | None = None, reference_text: str | None = None) -> list[dict]:
scene = scene_text if scene_text is not None else scene_summary()
prompt = SYSTEM.format(schemas=schemas, scene=scene, utterance=utterance,
history=_render_history(history))
prefix = ""
if image_path:
prompt = _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path)) + prompt
prefix += _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path))
if reference_text:
prefix += _TEXT_DIRECTIVE.format(text=reference_text[:8000])
prompt = prefix + prompt
raw = _run(REASON_PROVIDER.split(), stdin=prompt)
calls = _extract_calls(raw)
if calls is None:
@ -284,8 +354,9 @@ def summarize(calls: list[dict], messages: list[str]) -> str:
def handle(utterance: str, schemas: str, voice: bool, verbose: bool,
history: list[tuple[str, str]] | None = None,
image_path: str | None = None) -> None:
calls = interpret(utterance, schemas, history=history, image_path=image_path)
image_path: str | None = None, reference_text: str | None = None) -> None:
calls = interpret(utterance, schemas, history=history, image_path=image_path,
reference_text=reference_text)
messages = dispatch(calls, verbose=verbose)
full = " ".join(m for m in messages if m).strip()
spoken = summarize(calls, messages)
@ -313,13 +384,15 @@ def main(argv: list[str] | None = None) -> int:
ap.add_argument("--voice", action="store_true", help="Listen on the mic instead of typing")
ap.add_argument("--duration", type=int, default=6, help="Mic recording seconds (--voice)")
ap.add_argument("--once", help="Run a single command (non-interactive) and exit")
ap.add_argument("--image", help="Reference photo (path or URL) for 'build something like this'")
ap.add_argument("--image", "--ref", dest="ref",
help="Reference for 'build something like this': a photo, PDF, 3D "
"model (stl/step/obj), or web-page URL (path or URL)")
ap.add_argument("--quiet", action="store_true", help="Don't print per-call detail")
args = ap.parse_args(argv)
image_path = None
if args.image:
image_path = fetch_image(args.image) if args.image.startswith("http") else args.image
image_path = reference_text = None
if args.ref:
image_path, reference_text = resolve_reference(args.ref)
schemas = load_schemas()
if not schemas:
@ -329,7 +402,7 @@ def main(argv: list[str] | None = None) -> int:
if args.once is not None:
handle(args.once, schemas, voice=args.voice, verbose=not args.quiet,
image_path=image_path)
image_path=image_path, reference_text=reference_text)
return 0
print("WoodShop ready. Say things like 'place a 6 foot 2x4'. Ctrl-C to quit.")
@ -343,8 +416,8 @@ def main(argv: list[str] | None = None) -> int:
return 0
try:
handle(utterance, schemas, voice=args.voice, verbose=not args.quiet,
history=history, image_path=image_path)
image_path = None # the reference photo applies to the first turn only
history=history, image_path=image_path, reference_text=reference_text)
image_path = reference_text = None # the reference applies to the first turn only
except Exception as exc: # never let one bad command kill the session
print(f"WoodShop: sorry, that command failed ({exc}).")

View File

@ -17,7 +17,8 @@ from .controller import Controller
from .workers import run_async
_WHO_COLOR = {"you": "#9cdcfe", "ws": "#c8965a", "sys": "#e06c75"}
_IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp")
# Reference files we accept by drag-drop / picker (images, PDF plans, 3D models).
_REF_EXTS = tuple(sorted(driver.IMG_EXTS | driver.DOC_EXTS | driver.MESH_EXTS))
class CommandBar(QWidget):
@ -41,7 +42,8 @@ class CommandBar(QWidget):
row.addWidget(self.mic)
self.attach = QPushButton("📎")
self.attach.setToolTip("Attach a reference photo — then say 'build something like this'")
self.attach.setToolTip("Attach a reference (photo, PDF plan, or 3D model) — "
"then say 'build something like this'")
self.attach.setFixedWidth(40)
self.attach.clicked.connect(self._attach_image)
row.addWidget(self.attach)
@ -87,9 +89,10 @@ class CommandBar(QWidget):
super().mousePressEvent(e)
def _attach_image(self) -> None:
patterns = " ".join("*" + e for e in _REF_EXTS)
path, _ = QFileDialog.getOpenFileName(
self, "Attach reference photo", "",
"Images (*.png *.jpg *.jpeg *.webp *.gif *.bmp)")
self, "Attach reference (photo / PDF plan / 3D model)", "",
f"References ({patterns});;All files (*)")
if path:
self._set_image(path)
if not self.input.text().strip():
@ -97,7 +100,7 @@ class CommandBar(QWidget):
def dragEnterEvent(self, e):
md = e.mimeData()
if md.hasImage() or any(u.toLocalFile().lower().endswith(_IMAGE_EXTS)
if md.hasImage() or any(u.toLocalFile().lower().endswith(_REF_EXTS)
for u in md.urls()):
e.acceptProposedAction()
@ -105,7 +108,7 @@ class CommandBar(QWidget):
md = e.mimeData()
for u in md.urls():
p = u.toLocalFile()
if p.lower().endswith(_IMAGE_EXTS):
if p.lower().endswith(_REF_EXTS):
self._set_image(p)
break
else:
@ -161,18 +164,18 @@ class CommandBar(QWidget):
self._run(text)
def _run(self, text: str) -> None:
image = self._pending_image
url = None if image else driver.find_image_url(text)
note = " 📎 photo" if (image or url) else ""
source = self._pending_image or driver.find_reference_url(text)
note = " 📎 reference" if source else ""
self._log("you", text + note)
self._set_image(None)
self._busy(True, "looking…" if (image or url) else "thinking…")
self._busy(True, "studying reference…" if source else "thinking…")
def work():
path = image
if path is None and url:
path = driver.fetch_image(url) # download the linked image
return self.c.run_command(text, image_path=path)
image_path = reference_text = None
if source:
image_path, reference_text = driver.resolve_reference(source)
return self.c.run_command(text, image_path=image_path,
reference_text=reference_text)
def done(summary):
self._busy(False)

View File

@ -427,9 +427,11 @@ class Controller(QObject):
except (SceneError, ValueError, KeyError) as exc:
return str(exc).strip('"')
def run_command(self, text: str, image_path: str | None = None) -> str:
"""Interpret a spoken/typed command (optionally with a reference photo) and
apply it. Returns a spoken summary. (Slow call from a worker thread.)"""
def run_command(self, text: str, image_path: str | None = None,
reference_text: str | None = None) -> str:
"""Interpret a spoken/typed command (optionally with a reference photo,
plan, 3D render, or guide text) and apply it. Returns a spoken summary.
(Slow call from a worker thread.)"""
from ..scene import spatial_summary
self.save() # ensure disk reflects current state
sel = ", ".join(self.selected) if self.selected else "none"
@ -437,7 +439,8 @@ class Controller(QObject):
+ f"\nCurrently selected ('these' / 'them' / 'the selected'): {sel}"
+ "\n" + spatial_summary(self.scene))
calls = driver.interpret(text, self.schemas(), scene_text=scene_text,
history=self._history, image_path=image_path)
history=self._history, image_path=image_path,
reference_text=reference_text)
messages = driver.dispatch(calls, verbose=False, executor=self.execute_call)
self._commit()
spoken = driver.summarize(calls, messages)

View File

@ -1,6 +1,8 @@
"""Tests for the driver's orchestration logic (external tools are mocked)."""
import json
import pytest
from woodshop import driver
from woodshop.cli import normalize_anchor
@ -148,32 +150,33 @@ def test_woodshop_cmd_falls_back_to_module(monkeypatch):
assert cmd[1:] == ["-m", "woodshop"] and cmd[0] # python -m woodshop
def test_find_image_url():
assert driver.find_image_url("build like this https://x.com/chair.jpg please") \
def test_find_reference_url():
assert driver.find_reference_url("build like this https://x.com/chair.jpg please") \
== "https://x.com/chair.jpg"
assert driver.find_image_url("https://x.com/a.PNG") == "https://x.com/a.PNG"
assert driver.find_image_url("no image here http://x.com/page") is None
assert driver.find_reference_url("see https://x.com/how-to") == "https://x.com/how-to"
assert driver.find_reference_url("no url here") is None
def test_interpret_includes_image_directive(monkeypatch, tmp_path):
captured = {}
def fake_run(cmd, stdin=""):
captured["prompt"] = stdin
return "[]"
monkeypatch.setattr(driver, "_run", lambda cmd, stdin="": captured.update(prompt=stdin) or "[]")
img = tmp_path / "ref.jpg"
img.write_bytes(b"\xff\xd8\xff") # not a real jpeg, just a path
monkeypatch.setattr(driver, "_run", fake_run)
img.write_bytes(b"\xff\xd8\xff")
driver.interpret("build something like this", schemas="[]", scene_text="empty",
image_path=str(img))
assert "REFERENCE PHOTO" in captured["prompt"]
assert str(img) in captured["prompt"]
assert "REFERENCE" in captured["prompt"] and str(img) in captured["prompt"]
def test_fetch_image_writes_temp(monkeypatch):
import io
def test_interpret_includes_reference_text(monkeypatch):
    """reference_text must show up, wrapped in the guide directive, in the
    prompt that interpret() hands to _run."""
    seen = {}

    def fake_run(cmd, stdin=""):
        seen["prompt"] = stdin
        return "[]"

    monkeypatch.setattr(driver, "_run", fake_run)
    guide = "Step 1: cut four legs 28 inches long."
    driver.interpret("build it", schemas="[]", scene_text="empty",
                     reference_text=guide)

    prompt = seen["prompt"]
    assert "build GUIDE" in prompt
    assert "cut four legs 28 inches" in prompt
def test_fetch_url_writes_temp(monkeypatch):
class FakeResp:
headers = {"Content-Type": "image/png"}
def __enter__(self): return self
@ -181,9 +184,52 @@ def test_fetch_image_writes_temp(monkeypatch):
def read(self): return b"\x89PNG\r\n\x1a\n"
monkeypatch.setattr(driver.urllib.request, "urlopen", lambda *a, **k: FakeResp())
path = driver.fetch_image("https://x.com/chair.png")
path = driver.fetch_url("https://x.com/chair.png")
assert path.endswith(".png")
with open(path, "rb") as f:
assert f.read().startswith(b"\x89PNG")
import os as _os
_os.remove(path)
def test_fetch_web_text_strips_tags(monkeypatch):
    """fetch_web_text keeps only the visible words: tags and <style> bodies
    are gone and whitespace is collapsed."""
    page = b"<html><head><style>x{}</style></head><body><h1>Build</h1> a <b>shelf</b></body></html>"

    class FakeResp:
        # Minimal stand-in for the urlopen() response used as a context manager.
        def __enter__(self):
            return self

        def __exit__(self, *exc_info):
            return False

        def read(self):
            return page

    def fake_urlopen(*args, **kwargs):
        return FakeResp()

    monkeypatch.setattr(driver.urllib.request, "urlopen", fake_urlopen)
    text = driver.fetch_web_text("https://x.com/guide")

    assert "Build a shelf" in text
    assert "<" not in text
    assert "x{}" not in text
def test_resolve_reference_routes_by_kind(monkeypatch, tmp_path):
    """resolve_reference picks the right route for each kind of reference."""
    # A local image is handed over untouched: (path, None).
    image = tmp_path / "a.png"
    image.write_bytes(b"x")
    assert driver.resolve_reference(str(image)) == (str(image), None)

    # A local PDF is read directly as well: (path, None).
    plan = tmp_path / "plan.pdf"
    plan.write_bytes(b"%PDF")
    assert driver.resolve_reference(str(plan)) == (str(plan), None)

    # A URL without a known extension is a web page: (None, text).
    monkeypatch.setattr(driver, "fetch_web_text", lambda u, **k: "guide text")
    resolved = driver.resolve_reference("https://x.com/how-to-build")
    assert resolved == (None, "guide text")

    # A 3D file goes through the (mocked) renderer: (png, dims).
    monkeypatch.setattr(driver, "render_mesh", lambda p: ("/tmp/r.png", "bbox 10x10x10"))
    model = tmp_path / "m.stl"
    model.write_bytes(b"solid")
    assert driver.resolve_reference(str(model)) == ("/tmp/r.png", "bbox 10x10x10")
def test_render_mesh_real_if_possible(tmp_path):
    """Render a real STL cube when pyvista and off-screen GL are usable.

    Skips when pyvista is missing, and also when saving the cube or rendering
    it raises (headless boxes often lack GL).
    """
    pv = pytest.importorskip("pyvista")
    cube_file = tmp_path / "box.stl"
    try:
        pv.Cube().save(str(cube_file))
        rendered = driver.render_mesh(str(cube_file))
    except Exception as exc:  # no GL / off-screen unsupported here
        pytest.skip(f"offscreen render unavailable: {exc}")
    png, dims = rendered

    import os as _os
    assert _os.path.exists(png) and png.endswith(".png")
    assert "bounding box" in dims
    _os.remove(png)

View File

@ -145,7 +145,7 @@ def test_run_command_threads_history(tmp_path, monkeypatch):
c = _controller(tmp_path)
seen = {}
def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None, reference_text=None):
seen["history"] = list(history or [])
return [{"tool": "say", "args": {"text": "want me to add tenons?"}}]
@ -162,7 +162,7 @@ def test_run_command_forwards_image_path(tmp_path, monkeypatch):
c = _controller(tmp_path)
seen = {}
def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None, reference_text=None):
seen["image_path"] = image_path
return [{"tool": "say", "args": {"text": "ok"}}]