Reference input now accepts PDF plans, 3D models, and web links

Extends "build something like this" beyond photos:
- driver.resolve_reference(source) routes any path/URL: image/PDF → a path claude -p reads directly; STL/STEP/OBJ → render_mesh() renders an isometric PNG (pyvista; STEP via build123d→STL) and reports the bounding box; a normal web URL → fetch_web_text() pulls the page's visible text.
- interpret(reference_text=) injects guide/render-dims text alongside any image directive; handle() + controller.run_command() + woodshop-talk --ref pass it.
- command bar: picker/drag-drop accept images + .pdf + 3D files; any pasted URL is resolved; resolution (download/render/fetch) runs off the UI thread.
- find_image_url→find_reference_url (any URL); fetch_image→fetch_url (generic).
- tests: URL detect, image+reference-text directives, fetch_url, web-text strip, resolve_reference routing per kind, real STL render (skips without GL). 220 pass.

3D render gives the model EXACT proportions (+ bbox) instead of a 2D guess. Honest limit: render needs the viewer stack + working off-screen GL on your box; the live model round-trip still wants your eyes to confirm.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 22:37:38 -03:00 · 2026-05-30 22:37:38 -03:00 · 84ae6d8756
parent c623ad2576
commit 84ae6d8756
6 changed files with 199 additions and 72 deletions
--- a/README.md
+++ b/README.md
@ -8,10 +8,12 @@ Talk to it like the Star Trek holodeck and watch furniture build itself:
 > *"Build a coffee table: a four foot by two foot frame from 2x4s, with four legs 18 inches tall standing at the corners."*
-You can also **attach a reference photo** (📎, drag-drop, paste, or an image URL)
+You can also **attach a reference** (📎, drag-drop, paste, or a URL) and say
-and say *"build something like this"* — WoodShop hands the image to the model and
+*"build something like this"*: a **photo**, a **PDF plan**, a **3D model**
-builds a simplified, buildable interpretation in dimensional lumber that you then
+(STL/STEP/OBJ — rendered to an image, with its bounding box measured), or a
-refine by voice/text. (It's an interpretation, not a measured replica.)
+**web-page guide** (its text is pulled). WoodShop builds a simplified, buildable
 interpretation in dimensional lumber that you then refine by voice/text. (It's an
 interpretation, not a measured replica.)
 Each board is real dimensional lumber (a 2x4 is modeled at its true 1.5″ × 3.5″),
 so the result is buildable — export to **STEP** (CAD/CNC) or **STL** (3D print),
--- a/src/woodshop/driver.py
+++ b/src/woodshop/driver.py
@ -29,41 +29,107 @@ TOOL_FILTER = "wood-*"  # auto-discover every wood-* tool, no hardcoded list
 REASON_PROVIDER = "claude -p"  # chosen for reliable structured tool-calling
 _MAX_HISTORY = 6  # turns of recent conversation fed back for reference-resolution
-# A reference photo can be attached to "build something like this". claude -p
+# A reference can be attached to "build something like this": a photo, a PDF
-# reads the image file (via its Read tool), so we just hand it an absolute path.
+# plan, a 3D model (STL/STEP/OBJ — rendered to an image first), or a web page
-_IMG_URL = re.compile(r'https?://\S+?\.(?:png|jpe?g|webp|gif|bmp)\b', re.I)
+# (its text is pulled). claude -p reads images & PDFs directly via its Read tool.
 IMG_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp"}
 DOC_EXTS = {".pdf"}                                   # claude -p reads PDFs too
 MESH_EXTS = {".stl", ".obj", ".ply", ".step", ".stp"}
 _REF_EXTS = IMG_EXTS | DOC_EXTS | MESH_EXTS
 _URL = re.compile(r'https?://\S+', re.I)
 _IMAGE_DIRECTIVE = (
-    "A REFERENCE PHOTO of furniture is saved at this path:\n  {path}\n"
+    "A REFERENCE (photo / plan drawing / 3D render) is saved at this path:\n  {path}\n"
-    "Open and look at that image file. The user wants to build something LIKE it "
+    "Open and look at that file. The user wants to build something LIKE it from "
-    "from dimensional lumber and plywood. Infer the major parts, rough proportions, "
+    "dimensional lumber and plywood. Infer the major parts, proportions, and "
-    "and joinery, and emit the tool calls to build a SIMPLIFIED, buildable version "
+    "joinery, and emit the tool calls to build a SIMPLIFIED, buildable version with "
-    "with reasonable real dimensions in inches. This is an interpretation, not an "
+    "reasonable real dimensions in inches. An interpretation, not an exact replica "
-    "exact replica — prefer standard stock sizes and right angles.\n\n"
+    "— prefer standard stock sizes and right angles.\n\n")
-)
+_TEXT_DIRECTIVE = (
    "A build GUIDE / plan was provided as text (below). Use it to build a "
    "simplified, buildable version in dimensional lumber.\n--- REFERENCE ---\n"
    "{text}\n--- END REFERENCE ---\n\n")
-def find_image_url(text: str) -> str | None:
+def find_reference_url(text: str) -> str | None:
-    m = _IMG_URL.search(text or "")
+    m = _URL.search(text or "")
    return m.group(0) if m else None
-def fetch_image(url: str, timeout: int = 20) -> str:
+def _ext(name: str) -> str:
-    """Download an image URL to a temp file and return its path. Raises on
+    return os.path.splitext(name.split("?")[0])[1].lower()
-    failure (caller decides how to surface it)."""
+
 def fetch_url(url: str, timeout: int = 20) -> str:
    """Download a URL (image / PDF / 3D file) to a temp file; return its path."""
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        ctype = (resp.headers.get("Content-Type") or "").split(";")[0].strip().lower()
        data = resp.read()
    ext = {"image/png": ".png", "image/jpeg": ".jpg", "image/webp": ".webp",
-           "image/gif": ".gif", "image/bmp": ".bmp"}.get(ctype)
+           "image/gif": ".gif", "image/bmp": ".bmp", "application/pdf": ".pdf",
-    if ext is None:
+           "model/stl": ".stl", "application/sla": ".stl"}.get(ctype) or _ext(url) or ".bin"
        m = re.search(r'\.(png|jpe?g|webp|gif|bmp)\b', url, re.I)
        ext = "." + m.group(1).lower() if m else ".img"
    fd, path = tempfile.mkstemp(suffix=ext, prefix="woodshop-ref-")
    with os.fdopen(fd, "wb") as f:
        f.write(data)
    return path
 def fetch_web_text(url: str, limit: int = 8000, timeout: int = 20) -> str:
    """Fetch a web page and return its visible text (tags/scripts stripped).

    Args:
        url: Address of the page; passed to ``urllib.request.urlopen``.
        limit: Maximum number of characters of extracted text to return.
        timeout: Timeout in seconds for the request.

    Returns:
        The page text with ``<script>``/``<style>`` blocks and every other tag
        removed, character references (``&amp;``, ``&quot;``, ``&nbsp;`` ...)
        decoded, whitespace runs collapsed to one space, cut to ``limit`` chars.

    Raises:
        Whatever ``urlopen`` raises for a failed request; nothing is caught here.
    """
    from html import unescape as _unescape
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
        # Use the charset the server declares when there is one. The lookup is
        # defensive because stand-in responses may carry no headers object.
        get_charset = getattr(getattr(resp, "headers", None), "get_content_charset", None)
        declared = get_charset() if callable(get_charset) else None
    try:
        markup = raw.decode(declared or "utf-8", "replace")
    except LookupError:                  # server named a codec Python does not know
        markup = raw.decode("utf-8", "replace")
    markup = re.sub(r'(?is)<(script|style)[^>]*>.*?</\1>', ' ', markup)
    text = re.sub(r'(?s)<[^>]+>', ' ', markup)
    # Decode entities only AFTER tags are gone, so an escaped "&lt;b&gt;" is kept
    # as literal text instead of being mistaken for markup and stripped.
    text = _unescape(text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:limit]
 def render_mesh(path: str) -> tuple[str, str]:
    """Render a 3D model (STL/OBJ/PLY/STEP) to a PNG and describe its bounding
    box.

    STEP/STP input is first converted to a temporary STL with build123d; every
    other extension is handed straight to ``pyvista.read``.

    Args:
        path: Path to the model file on disk.

    Returns:
        ``(png_path, dims_text)``: the path of a temporary PNG holding an
        isometric render, and a sentence giving the bounding-box extents.
        The caller owns the PNG and is responsible for deleting it.

    Raises:
        ImportError: if pyvista (or build123d, for STEP) is not installed.
        Anything the reader or off-screen renderer raises is propagated.
    """
    import tempfile as _tf
    from contextlib import suppress as _suppress
    from pathlib import Path as _P
    import pyvista as pv
    ext = _P(path).suffix.lower()
    if ext in (".step", ".stp"):
        from build123d import export_stl, import_step
        shape = import_step(path)
        fd, stl = _tf.mkstemp(suffix=".stl")
        os.close(fd)
        try:
            export_stl(shape, stl)
            mesh = pv.read(stl)
        finally:
            # The intermediate STL is only a conversion artefact: once the mesh
            # has been read it is no longer needed, so don't leave it in /tmp.
            with _suppress(OSError):
                os.remove(stl)
    else:
        mesh = pv.read(path)
    b = mesh.bounds
    dx, dy, dz = b[1] - b[0], b[3] - b[2], b[5] - b[4]
    pl = pv.Plotter(off_screen=True, window_size=(900, 700))
    png = None
    try:
        pl.add_mesh(mesh, color="#c8965a", show_edges=True)
        pl.view_isometric()
        fd, png = _tf.mkstemp(suffix=".png", prefix="woodshop-render-")
        os.close(fd)
        pl.screenshot(png)
    except Exception:
        # A failed render must not leave an empty placeholder PNG behind.
        if png is not None:
            with _suppress(OSError):
                os.remove(png)
        raise
    finally:
        pl.close()                       # release the plotter even on failure
    dims = (f"This is a render of a 3D model; its bounding box is about "
            f"{dx:.1f} x {dy:.1f} x {dz:.1f} in the file's units (proportions are "
            f"exact — treat units as inches unless that's implausible).")
    return png, dims
 def resolve_reference(source: str) -> tuple[str | None, str | None]:
    """Turn a reference (local path or URL) into (image_path, reference_text) for
    interpret().

    Routing:
      * URL whose extension is not a known reference type -> (None, page text)
      * image / PDF, local or downloaded                  -> (path, None)
      * 3D file, local or downloaded                      -> (rendered PNG, dims text)

    Raises on download/render failure.
    """
    # Compare the scheme case-insensitively: the module's _URL pattern is compiled
    # with re.I, so a pasted "HTTPS://..." link reaches this function unchanged and
    # must not be mistaken for a local path.
    is_url = source.lower().startswith(("http://", "https://"))
    if is_url and _ext(source) not in _REF_EXTS:
        return None, fetch_web_text(source)            # a web-page guide
    local = fetch_url(source) if is_url else source
    if _ext(local) in MESH_EXTS:
        return render_mesh(local)                      # (png, dims)
    return local, None                                 # image or PDF — read directly
 # A board placed earlier in the SAME utterance is referenced as $1, $2, ...
 _SYMBOL = re.compile(r"\$(\d+)")
@ -182,12 +248,16 @@ def _render_history(history: list[tuple[str, str]] | None) -> str:
 def interpret(utterance: str, schemas: str, scene_text: str | None = None,
              history: list[tuple[str, str]] | None = None,
-              image_path: str | None = None) -> list[dict]:
+              image_path: str | None = None, reference_text: str | None = None) -> list[dict]:
    scene = scene_text if scene_text is not None else scene_summary()
    prompt = SYSTEM.format(schemas=schemas, scene=scene, utterance=utterance,
                           history=_render_history(history))
    prefix = ""
    if image_path:
-        prompt = _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path)) + prompt
+        prefix += _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path))
    if reference_text:
        prefix += _TEXT_DIRECTIVE.format(text=reference_text[:8000])
    prompt = prefix + prompt
    raw = _run(REASON_PROVIDER.split(), stdin=prompt)
    calls = _extract_calls(raw)
    if calls is None:
@ -284,8 +354,9 @@ def summarize(calls: list[dict], messages: list[str]) -> str:
 def handle(utterance: str, schemas: str, voice: bool, verbose: bool,
           history: list[tuple[str, str]] | None = None,
-           image_path: str | None = None) -> None:
+           image_path: str | None = None, reference_text: str | None = None) -> None:
-    calls = interpret(utterance, schemas, history=history, image_path=image_path)
+    calls = interpret(utterance, schemas, history=history, image_path=image_path,
                      reference_text=reference_text)
    messages = dispatch(calls, verbose=verbose)
    full = " ".join(m for m in messages if m).strip()
    spoken = summarize(calls, messages)
@ -313,13 +384,15 @@ def main(argv: list[str] | None = None) -> int:
    ap.add_argument("--voice", action="store_true", help="Listen on the mic instead of typing")
    ap.add_argument("--duration", type=int, default=6, help="Mic recording seconds (--voice)")
    ap.add_argument("--once", help="Run a single command (non-interactive) and exit")
-    ap.add_argument("--image", help="Reference photo (path or URL) for 'build something like this'")
+    ap.add_argument("--image", "--ref", dest="ref",
                    help="Reference for 'build something like this': a photo, PDF, 3D "
                         "model (stl/step/obj), or web-page URL (path or URL)")
    ap.add_argument("--quiet", action="store_true", help="Don't print per-call detail")
    args = ap.parse_args(argv)
-    image_path = None
+    image_path = reference_text = None
-    if args.image:
+    if args.ref:
-        image_path = fetch_image(args.image) if args.image.startswith("http") else args.image
+        image_path, reference_text = resolve_reference(args.ref)
    schemas = load_schemas()
    if not schemas:
@ -329,7 +402,7 @@ def main(argv: list[str] | None = None) -> int:
    if args.once is not None:
        handle(args.once, schemas, voice=args.voice, verbose=not args.quiet,
-               image_path=image_path)
+               image_path=image_path, reference_text=reference_text)
        return 0
    print("WoodShop ready. Say things like 'place a 6 foot 2x4'. Ctrl-C to quit.")
@ -343,8 +416,8 @@ def main(argv: list[str] | None = None) -> int:
            return 0
        try:
            handle(utterance, schemas, voice=args.voice, verbose=not args.quiet,
-                   history=history, image_path=image_path)
+                   history=history, image_path=image_path, reference_text=reference_text)
-            image_path = None        # the reference photo applies to the first turn only
+            image_path = reference_text = None   # the reference applies to the first turn only
        except Exception as exc:  # never let one bad command kill the session
            print(f"WoodShop: sorry, that command failed ({exc}).")
--- a/src/woodshop/gui/command_bar.py
+++ b/src/woodshop/gui/command_bar.py
@ -17,7 +17,8 @@ from .controller import Controller
 from .workers import run_async
 _WHO_COLOR = {"you": "#9cdcfe", "ws": "#c8965a", "sys": "#e06c75"}
-_IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp")
+# Reference files we accept by drag-drop / picker (images, PDF plans, 3D models).
 _REF_EXTS = tuple(sorted(driver.IMG_EXTS | driver.DOC_EXTS | driver.MESH_EXTS))
 class CommandBar(QWidget):
@ -41,7 +42,8 @@ class CommandBar(QWidget):
        row.addWidget(self.mic)
        self.attach = QPushButton("📎")
-        self.attach.setToolTip("Attach a reference photo — then say 'build something like this'")
+        self.attach.setToolTip("Attach a reference (photo, PDF plan, or 3D model) — "
                               "then say 'build something like this'")
        self.attach.setFixedWidth(40)
        self.attach.clicked.connect(self._attach_image)
        row.addWidget(self.attach)
@ -87,9 +89,10 @@ class CommandBar(QWidget):
        super().mousePressEvent(e)
    def _attach_image(self) -> None:
        patterns = " ".join("*" + e for e in _REF_EXTS)
        path, _ = QFileDialog.getOpenFileName(
-            self, "Attach reference photo", "",
+            self, "Attach reference (photo / PDF plan / 3D model)", "",
-            "Images (*.png *.jpg *.jpeg *.webp *.gif *.bmp)")
+            f"References ({patterns});;All files (*)")
        if path:
            self._set_image(path)
            if not self.input.text().strip():
@ -97,7 +100,7 @@ class CommandBar(QWidget):
    def dragEnterEvent(self, e):
        md = e.mimeData()
-        if md.hasImage() or any(u.toLocalFile().lower().endswith(_IMAGE_EXTS)
+        if md.hasImage() or any(u.toLocalFile().lower().endswith(_REF_EXTS)
                                for u in md.urls()):
            e.acceptProposedAction()
@ -105,7 +108,7 @@ class CommandBar(QWidget):
        md = e.mimeData()
        for u in md.urls():
            p = u.toLocalFile()
-            if p.lower().endswith(_IMAGE_EXTS):
+            if p.lower().endswith(_REF_EXTS):
                self._set_image(p)
                break
        else:
@ -161,18 +164,18 @@ class CommandBar(QWidget):
        self._run(text)
    def _run(self, text: str) -> None:
-        image = self._pending_image
+        source = self._pending_image or driver.find_reference_url(text)
-        url = None if image else driver.find_image_url(text)
+        note = "  📎 reference" if source else ""
        note = "  📎 photo" if (image or url) else ""
        self._log("you", text + note)
        self._set_image(None)
-        self._busy(True, "looking…" if (image or url) else "thinking…")
+        self._busy(True, "studying reference…" if source else "thinking…")
        def work():
-            path = image
+            image_path = reference_text = None
-            if path is None and url:
+            if source:
-                path = driver.fetch_image(url)         # download the linked image
+                image_path, reference_text = driver.resolve_reference(source)
-            return self.c.run_command(text, image_path=path)
+            return self.c.run_command(text, image_path=image_path,
                                      reference_text=reference_text)
        def done(summary):
            self._busy(False)
--- a/src/woodshop/gui/controller.py
+++ b/src/woodshop/gui/controller.py
@ -427,9 +427,11 @@ class Controller(QObject):
        except (SceneError, ValueError, KeyError) as exc:
            return str(exc).strip('"')
-    def run_command(self, text: str, image_path: str | None = None) -> str:
+    def run_command(self, text: str, image_path: str | None = None,
-        """Interpret a spoken/typed command (optionally with a reference photo) and
+                    reference_text: str | None = None) -> str:
-        apply it. Returns a spoken summary. (Slow — call from a worker thread.)"""
+        """Interpret a spoken/typed command (optionally with a reference photo,
        plan, 3D render, or guide text) and apply it. Returns a spoken summary.
        (Slow — call from a worker thread.)"""
        from ..scene import spatial_summary
        self.save()  # ensure disk reflects current state
        sel = ", ".join(self.selected) if self.selected else "none"
@ -437,7 +439,8 @@ class Controller(QObject):
                      + f"\nCurrently selected ('these' / 'them' / 'the selected'): {sel}"
                      + "\n" + spatial_summary(self.scene))
        calls = driver.interpret(text, self.schemas(), scene_text=scene_text,
-                                 history=self._history, image_path=image_path)
+                                 history=self._history, image_path=image_path,
                                 reference_text=reference_text)
        messages = driver.dispatch(calls, verbose=False, executor=self.execute_call)
        self._commit()
        spoken = driver.summarize(calls, messages)
--- a/tests/test_driver.py
+++ b/tests/test_driver.py
@ -1,6 +1,8 @@
 """Tests for the driver's orchestration logic (external tools are mocked)."""
 import json
 import pytest
 from woodshop import driver
 from woodshop.cli import normalize_anchor
@ -148,32 +150,33 @@ def test_woodshop_cmd_falls_back_to_module(monkeypatch):
    assert cmd[1:] == ["-m", "woodshop"] and cmd[0]   # python -m woodshop
-def test_find_image_url():
+def test_find_reference_url():
-    assert driver.find_image_url("build like this https://x.com/chair.jpg please") \
+    assert driver.find_reference_url("build like this https://x.com/chair.jpg please") \
        == "https://x.com/chair.jpg"
-    assert driver.find_image_url("https://x.com/a.PNG") == "https://x.com/a.PNG"
+    assert driver.find_reference_url("see https://x.com/how-to") == "https://x.com/how-to"
-    assert driver.find_image_url("no image here http://x.com/page") is None
+    assert driver.find_reference_url("no url here") is None
 def test_interpret_includes_image_directive(monkeypatch, tmp_path):
    captured = {}
-
+    monkeypatch.setattr(driver, "_run", lambda cmd, stdin="": captured.update(prompt=stdin) or "[]")
    def fake_run(cmd, stdin=""):
        captured["prompt"] = stdin
        return "[]"
    img = tmp_path / "ref.jpg"
-    img.write_bytes(b"\xff\xd8\xff")               # not a real jpeg, just a path
+    img.write_bytes(b"\xff\xd8\xff")
    monkeypatch.setattr(driver, "_run", fake_run)
    driver.interpret("build something like this", schemas="[]", scene_text="empty",
                     image_path=str(img))
-    assert "REFERENCE PHOTO" in captured["prompt"]
+    assert "REFERENCE" in captured["prompt"] and str(img) in captured["prompt"]
    assert str(img) in captured["prompt"]
-def test_fetch_image_writes_temp(monkeypatch):
+def test_interpret_includes_reference_text(monkeypatch):
-    import io
+    captured = {}
    monkeypatch.setattr(driver, "_run", lambda cmd, stdin="": captured.update(prompt=stdin) or "[]")
    driver.interpret("build it", schemas="[]", scene_text="empty",
                     reference_text="Step 1: cut four legs 28 inches long.")
    assert "build GUIDE" in captured["prompt"]
    assert "cut four legs 28 inches" in captured["prompt"]
 def test_fetch_url_writes_temp(monkeypatch):
    class FakeResp:
        headers = {"Content-Type": "image/png"}
        def __enter__(self): return self
@ -181,9 +184,52 @@ def test_fetch_image_writes_temp(monkeypatch):
        def read(self): return b"\x89PNG\r\n\x1a\n"
    monkeypatch.setattr(driver.urllib.request, "urlopen", lambda *a, **k: FakeResp())
-    path = driver.fetch_image("https://x.com/chair.png")
+    path = driver.fetch_url("https://x.com/chair.png")
    assert path.endswith(".png")
    with open(path, "rb") as f:
        assert f.read().startswith(b"\x89PNG")
    import os as _os
    _os.remove(path)
 def test_fetch_web_text_strips_tags(monkeypatch):
    """fetch_web_text keeps the visible words and drops tags and <style> content."""
    page = b"<html><head><style>x{}</style></head><body><h1>Build</h1> a <b>shelf</b></body></html>"
    class _StubResponse:
        # Bare-minimum context manager standing in for urlopen's return value.
        def __enter__(self):
            return self
        def __exit__(self, *exc_info):
            return False
        def read(self):
            return page
    def _fake_urlopen(*args, **kwargs):
        return _StubResponse()
    monkeypatch.setattr(driver.urllib.request, "urlopen", _fake_urlopen)
    extracted = driver.fetch_web_text("https://x.com/guide")
    assert "Build a shelf" in extracted
    assert "<" not in extracted
    assert "x{}" not in extracted
 def test_resolve_reference_routes_by_kind(monkeypatch, tmp_path):
    """Each kind of reference takes its own route through resolve_reference."""
    # Local images and PDFs are handed back untouched as (path, None).
    for filename, payload in (("a.png", b"x"), ("plan.pdf", b"%PDF")):
        on_disk = tmp_path / filename
        on_disk.write_bytes(payload)
        assert driver.resolve_reference(str(on_disk)) == (str(on_disk), None)
    # A URL with no reference extension is read as a web page -> (None, text).
    monkeypatch.setattr(driver, "fetch_web_text", lambda u, **k: "guide text")
    routed = driver.resolve_reference("https://x.com/how-to-build")
    assert routed == (None, "guide text")
    # A 3D file is passed to render_mesh (stubbed here) -> (png, dims).
    monkeypatch.setattr(driver, "render_mesh", lambda p: ("/tmp/r.png", "bbox 10x10x10"))
    model = tmp_path / "m.stl"
    model.write_bytes(b"solid")
    routed = driver.resolve_reference(str(model))
    assert routed == ("/tmp/r.png", "bbox 10x10x10")
 def test_render_mesh_real_if_possible(tmp_path):
    """Exercise render_mesh on a real STL cube when the environment allows it.

    Needs pyvista and a working off-screen GL context; when either is missing
    the test is skipped rather than failed (headless boxes often lack GL).
    """
    import os as _os
    pv = pytest.importorskip("pyvista")
    cube_file = tmp_path / "box.stl"
    try:
        pv.Cube().save(str(cube_file))
        rendered, description = driver.render_mesh(str(cube_file))
    except Exception as exc:                      # no GL / off-screen unsupported here
        pytest.skip(f"offscreen render unavailable: {exc}")
    assert _os.path.exists(rendered)
    assert rendered.endswith(".png")
    assert "bounding box" in description
    _os.remove(rendered)
--- a/tests/test_gui_controller.py
+++ b/tests/test_gui_controller.py
@ -145,7 +145,7 @@ def test_run_command_threads_history(tmp_path, monkeypatch):
    c = _controller(tmp_path)
    seen = {}
-    def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
+    def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None, reference_text=None):
        seen["history"] = list(history or [])
        return [{"tool": "say", "args": {"text": "want me to add tenons?"}}]
@ -162,7 +162,7 @@ def test_run_command_forwards_image_path(tmp_path, monkeypatch):
    c = _controller(tmp_path)
    seen = {}
-    def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None):
+    def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None, reference_text=None):
        seen["image_path"] = image_path
        return [{"tool": "say", "args": {"text": "ok"}}]