From 84ae6d8756f4a201732784712bf5d74e344bfb6b Mon Sep 17 00:00:00 2001 From: rob Date: Sat, 30 May 2026 22:37:38 -0300 Subject: [PATCH] Reference input now accepts PDF plans, 3D models, and web links MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends "build something like this" beyond photos: - driver.resolve_reference(source) routes any path/URL: image/PDF → a path claude -p reads directly; STL/STEP/OBJ → render_mesh() renders an isometric PNG (pyvista; STEP via build123d→STL) and reports the bounding box; a normal web URL → fetch_web_text() pulls the page's visible text. - interpret(reference_text=) injects guide/render-dims text alongside any image directive; handle() + controller.run_command() + woodshop-talk --ref pass it. - command bar: picker/drag-drop accept images + .pdf + 3D files; any pasted URL is resolved; resolution (download/render/fetch) runs off the UI thread. - find_image_url→find_reference_url (any URL); fetch_image→fetch_url (generic). - tests: URL detect, image+reference-text directives, fetch_url, web-text strip, resolve_reference routing per kind, real STL render (skips without GL). 220 pass. 3D render gives the model EXACT proportions (+ bbox) instead of a 2D guess. Honest limit: render needs the viewer stack + working off-screen GL on your box; the live model round-trip still wants your eyes to confirm. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 10 ++- src/woodshop/driver.py | 133 +++++++++++++++++++++++++------- src/woodshop/gui/command_bar.py | 31 ++++---- src/woodshop/gui/controller.py | 11 ++- tests/test_driver.py | 82 +++++++++++++++----- tests/test_gui_controller.py | 4 +- 6 files changed, 199 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index c742220..e843bbf 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,12 @@ Talk to it like the Star Trek holodeck and watch furniture build itself: > *"Build a coffee table: a four foot by two foot frame from 2x4s, with four legs 18 inches tall standing at the corners."* -You can also **attach a reference photo** (📎, drag-drop, paste, or an image URL) -and say *"build something like this"* — WoodShop hands the image to the model and -builds a simplified, buildable interpretation in dimensional lumber that you then -refine by voice/text. (It's an interpretation, not a measured replica.) +You can also **attach a reference** (📎, drag-drop, paste, or a URL) and say +*"build something like this"*: a **photo**, a **PDF plan**, a **3D model** +(STL/STEP/OBJ — rendered to an image, with its bounding box measured), or a +**web-page guide** (its text is pulled). WoodShop builds a simplified, buildable +interpretation in dimensional lumber that you then refine by voice/text. (It's an +interpretation, not a measured replica.) 
Each board is real dimensional lumber (a 2x4 is modeled at its true 1.5″ × 3.5″), so the result is buildable — export to **STEP** (CAD/CNC) or **STL** (3D print), diff --git a/src/woodshop/driver.py b/src/woodshop/driver.py index b1d3baa..2276acd 100644 --- a/src/woodshop/driver.py +++ b/src/woodshop/driver.py @@ -29,41 +29,107 @@ TOOL_FILTER = "wood-*" # auto-discover every wood-* tool, no hardcoded list REASON_PROVIDER = "claude -p" # chosen for reliable structured tool-calling _MAX_HISTORY = 6 # turns of recent conversation fed back for reference-resolution -# A reference photo can be attached to "build something like this". claude -p -# reads the image file (via its Read tool), so we just hand it an absolute path. -_IMG_URL = re.compile(r'https?://\S+?\.(?:png|jpe?g|webp|gif|bmp)\b', re.I) +# A reference can be attached to "build something like this": a photo, a PDF +# plan, a 3D model (STL/STEP/OBJ — rendered to an image first), or a web page +# (its text is pulled). claude -p reads images & PDFs directly via its Read tool. +IMG_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp"} +DOC_EXTS = {".pdf"} # claude -p reads PDFs too +MESH_EXTS = {".stl", ".obj", ".ply", ".step", ".stp"} +_REF_EXTS = IMG_EXTS | DOC_EXTS | MESH_EXTS +_URL = re.compile(r'https?://\S+', re.I) + _IMAGE_DIRECTIVE = ( - "A REFERENCE PHOTO of furniture is saved at this path:\n {path}\n" - "Open and look at that image file. The user wants to build something LIKE it " - "from dimensional lumber and plywood. Infer the major parts, rough proportions, " - "and joinery, and emit the tool calls to build a SIMPLIFIED, buildable version " - "with reasonable real dimensions in inches. This is an interpretation, not an " - "exact replica — prefer standard stock sizes and right angles.\n\n" -) + "A REFERENCE (photo / plan drawing / 3D render) is saved at this path:\n {path}\n" + "Open and look at that file. The user wants to build something LIKE it from " + "dimensional lumber and plywood. 
Infer the major parts, proportions, and " + "joinery, and emit the tool calls to build a SIMPLIFIED, buildable version with " + "reasonable real dimensions in inches. An interpretation, not an exact replica " + "— prefer standard stock sizes and right angles.\n\n") +_TEXT_DIRECTIVE = ( + "A build GUIDE / plan was provided as text (below). Use it to build a " + "simplified, buildable version in dimensional lumber.\n--- REFERENCE ---\n" + "{text}\n--- END REFERENCE ---\n\n") -def find_image_url(text: str) -> str | None: - m = _IMG_URL.search(text or "") +def find_reference_url(text: str) -> str | None: + m = _URL.search(text or "") return m.group(0) if m else None -def fetch_image(url: str, timeout: int = 20) -> str: - """Download an image URL to a temp file and return its path. Raises on - failure (caller decides how to surface it).""" +def _ext(name: str) -> str: + return os.path.splitext(name.split("?")[0])[1].lower() + + +def fetch_url(url: str, timeout: int = 20) -> str: + """Download a URL (image / PDF / 3D file) to a temp file; return its path.""" req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) with urllib.request.urlopen(req, timeout=timeout) as resp: ctype = (resp.headers.get("Content-Type") or "").split(";")[0].strip().lower() data = resp.read() ext = {"image/png": ".png", "image/jpeg": ".jpg", "image/webp": ".webp", - "image/gif": ".gif", "image/bmp": ".bmp"}.get(ctype) - if ext is None: - m = re.search(r'\.(png|jpe?g|webp|gif|bmp)\b', url, re.I) - ext = "." 
+ m.group(1).lower() if m else ".img" + "image/gif": ".gif", "image/bmp": ".bmp", "application/pdf": ".pdf", + "model/stl": ".stl", "application/sla": ".stl"}.get(ctype) or _ext(url) or ".bin" fd, path = tempfile.mkstemp(suffix=ext, prefix="woodshop-ref-") with os.fdopen(fd, "wb") as f: f.write(data) return path + +def fetch_web_text(url: str, limit: int = 8000, timeout: int = 20) -> str: + """Fetch a web page and return its visible text (tags/scripts stripped).""" + req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) + with urllib.request.urlopen(req, timeout=timeout) as resp: + html = resp.read().decode("utf-8", "replace") + html = re.sub(r'(?is)<(script|style)[^>]*>.*?</\1>', ' ', html) + text = re.sub(r'(?s)<[^>]+>', ' ', html) + text = re.sub(r'\s+', ' ', text).strip() + return text[:limit] + + +def render_mesh(path: str) -> tuple[str, str]: + """Render a 3D model (STL/OBJ/PLY/STEP) to a PNG and describe its bounding + box. Returns (png_path, dims_text). Needs the viewer stack (pyvista); STEP + also needs build123d.""" + import tempfile as _tf + from pathlib import Path as _P + + import pyvista as pv + ext = _P(path).suffix.lower() + if ext in (".step", ".stp"): + from build123d import export_stl, import_step + shape = import_step(path) + fd, stl = _tf.mkstemp(suffix=".stl"); os.close(fd) + export_stl(shape, stl) + mesh = pv.read(stl) + else: + mesh = pv.read(path) + b = mesh.bounds + dx, dy, dz = b[1] - b[0], b[3] - b[2], b[5] - b[4] + pl = pv.Plotter(off_screen=True, window_size=(900, 700)) + pl.add_mesh(mesh, color="#c8965a", show_edges=True) + pl.view_isometric() + fd, png = _tf.mkstemp(suffix=".png", prefix="woodshop-render-"); os.close(fd) + pl.screenshot(png) + pl.close() + dims = (f"This is a render of a 3D model; its bounding box is about " + f"{dx:.1f} x {dy:.1f} x {dz:.1f} in the file's units (proportions are " + f"exact — treat units as inches unless that's implausible).") + return png, dims + + +def resolve_reference(source: str) 
-> tuple[str | None, str | None]: + """Turn a reference (local path or URL) into (image_path, reference_text) for + interpret(). Image/PDF -> a path claude reads; 3D file -> rendered PNG + dims + text; web page -> page text. Raises on download/render failure.""" + is_url = source.startswith(("http://", "https://")) + ext = _ext(source) + if is_url and ext not in _REF_EXTS: + return None, fetch_web_text(source) # a web-page guide + local = fetch_url(source) if is_url else source + if _ext(local) in MESH_EXTS: + return render_mesh(local) # (png, dims) + return local, None # image or PDF — read directly + # A board placed earlier in the SAME utterance is referenced as $1, $2, ... _SYMBOL = re.compile(r"\$(\d+)") @@ -182,12 +248,16 @@ def _render_history(history: list[tuple[str, str]] | None) -> str: def interpret(utterance: str, schemas: str, scene_text: str | None = None, history: list[tuple[str, str]] | None = None, - image_path: str | None = None) -> list[dict]: + image_path: str | None = None, reference_text: str | None = None) -> list[dict]: scene = scene_text if scene_text is not None else scene_summary() prompt = SYSTEM.format(schemas=schemas, scene=scene, utterance=utterance, history=_render_history(history)) + prefix = "" if image_path: - prompt = _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path)) + prompt + prefix += _IMAGE_DIRECTIVE.format(path=os.path.abspath(image_path)) + if reference_text: + prefix += _TEXT_DIRECTIVE.format(text=reference_text[:8000]) + prompt = prefix + prompt raw = _run(REASON_PROVIDER.split(), stdin=prompt) calls = _extract_calls(raw) if calls is None: @@ -284,8 +354,9 @@ def summarize(calls: list[dict], messages: list[str]) -> str: def handle(utterance: str, schemas: str, voice: bool, verbose: bool, history: list[tuple[str, str]] | None = None, - image_path: str | None = None) -> None: - calls = interpret(utterance, schemas, history=history, image_path=image_path) + image_path: str | None = None, reference_text: str | None = 
None) -> None: + calls = interpret(utterance, schemas, history=history, image_path=image_path, + reference_text=reference_text) messages = dispatch(calls, verbose=verbose) full = " ".join(m for m in messages if m).strip() spoken = summarize(calls, messages) @@ -313,13 +384,15 @@ def main(argv: list[str] | None = None) -> int: ap.add_argument("--voice", action="store_true", help="Listen on the mic instead of typing") ap.add_argument("--duration", type=int, default=6, help="Mic recording seconds (--voice)") ap.add_argument("--once", help="Run a single command (non-interactive) and exit") - ap.add_argument("--image", help="Reference photo (path or URL) for 'build something like this'") + ap.add_argument("--image", "--ref", dest="ref", + help="Reference for 'build something like this': a photo, PDF, 3D " + "model (stl/step/obj), or web-page URL (path or URL)") ap.add_argument("--quiet", action="store_true", help="Don't print per-call detail") args = ap.parse_args(argv) - image_path = None - if args.image: - image_path = fetch_image(args.image) if args.image.startswith("http") else args.image + image_path = reference_text = None + if args.ref: + image_path, reference_text = resolve_reference(args.ref) schemas = load_schemas() if not schemas: @@ -329,7 +402,7 @@ def main(argv: list[str] | None = None) -> int: if args.once is not None: handle(args.once, schemas, voice=args.voice, verbose=not args.quiet, - image_path=image_path) + image_path=image_path, reference_text=reference_text) return 0 print("WoodShop ready. Say things like 'place a 6 foot 2x4'. 
Ctrl-C to quit.") @@ -343,8 +416,8 @@ def main(argv: list[str] | None = None) -> int: return 0 try: handle(utterance, schemas, voice=args.voice, verbose=not args.quiet, - history=history, image_path=image_path) - image_path = None # the reference photo applies to the first turn only + history=history, image_path=image_path, reference_text=reference_text) + image_path = reference_text = None # the reference applies to the first turn only except Exception as exc: # never let one bad command kill the session print(f"WoodShop: sorry, that command failed ({exc}).") diff --git a/src/woodshop/gui/command_bar.py b/src/woodshop/gui/command_bar.py index 6a5123a..c7cc156 100644 --- a/src/woodshop/gui/command_bar.py +++ b/src/woodshop/gui/command_bar.py @@ -17,7 +17,8 @@ from .controller import Controller from .workers import run_async _WHO_COLOR = {"you": "#9cdcfe", "ws": "#c8965a", "sys": "#e06c75"} -_IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp") +# Reference files we accept by drag-drop / picker (images, PDF plans, 3D models). 
+_REF_EXTS = tuple(sorted(driver.IMG_EXTS | driver.DOC_EXTS | driver.MESH_EXTS)) class CommandBar(QWidget): @@ -41,7 +42,8 @@ class CommandBar(QWidget): row.addWidget(self.mic) self.attach = QPushButton("📎") - self.attach.setToolTip("Attach a reference photo — then say 'build something like this'") + self.attach.setToolTip("Attach a reference (photo, PDF plan, or 3D model) — " + "then say 'build something like this'") self.attach.setFixedWidth(40) self.attach.clicked.connect(self._attach_image) row.addWidget(self.attach) @@ -87,9 +89,10 @@ class CommandBar(QWidget): super().mousePressEvent(e) def _attach_image(self) -> None: + patterns = " ".join("*" + e for e in _REF_EXTS) path, _ = QFileDialog.getOpenFileName( - self, "Attach reference photo", "", - "Images (*.png *.jpg *.jpeg *.webp *.gif *.bmp)") + self, "Attach reference (photo / PDF plan / 3D model)", "", + f"References ({patterns});;All files (*)") if path: self._set_image(path) if not self.input.text().strip(): @@ -97,7 +100,7 @@ class CommandBar(QWidget): def dragEnterEvent(self, e): md = e.mimeData() - if md.hasImage() or any(u.toLocalFile().lower().endswith(_IMAGE_EXTS) + if md.hasImage() or any(u.toLocalFile().lower().endswith(_REF_EXTS) for u in md.urls()): e.acceptProposedAction() @@ -105,7 +108,7 @@ class CommandBar(QWidget): md = e.mimeData() for u in md.urls(): p = u.toLocalFile() - if p.lower().endswith(_IMAGE_EXTS): + if p.lower().endswith(_REF_EXTS): self._set_image(p) break else: @@ -161,18 +164,18 @@ class CommandBar(QWidget): self._run(text) def _run(self, text: str) -> None: - image = self._pending_image - url = None if image else driver.find_image_url(text) - note = " 📎 photo" if (image or url) else "" + source = self._pending_image or driver.find_reference_url(text) + note = " 📎 reference" if source else "" self._log("you", text + note) self._set_image(None) - self._busy(True, "looking…" if (image or url) else "thinking…") + self._busy(True, "studying reference…" if source else 
"thinking…") def work(): - path = image - if path is None and url: - path = driver.fetch_image(url) # download the linked image - return self.c.run_command(text, image_path=path) + image_path = reference_text = None + if source: + image_path, reference_text = driver.resolve_reference(source) + return self.c.run_command(text, image_path=image_path, + reference_text=reference_text) def done(summary): self._busy(False) diff --git a/src/woodshop/gui/controller.py b/src/woodshop/gui/controller.py index b577bfb..3b3a9b4 100644 --- a/src/woodshop/gui/controller.py +++ b/src/woodshop/gui/controller.py @@ -427,9 +427,11 @@ class Controller(QObject): except (SceneError, ValueError, KeyError) as exc: return str(exc).strip('"') - def run_command(self, text: str, image_path: str | None = None) -> str: - """Interpret a spoken/typed command (optionally with a reference photo) and - apply it. Returns a spoken summary. (Slow — call from a worker thread.)""" + def run_command(self, text: str, image_path: str | None = None, + reference_text: str | None = None) -> str: + """Interpret a spoken/typed command (optionally with a reference photo, + plan, 3D render, or guide text) and apply it. Returns a spoken summary. 
+ (Slow — call from a worker thread.)""" from ..scene import spatial_summary self.save() # ensure disk reflects current state sel = ", ".join(self.selected) if self.selected else "none" @@ -437,7 +439,8 @@ class Controller(QObject): + f"\nCurrently selected ('these' / 'them' / 'the selected'): {sel}" + "\n" + spatial_summary(self.scene)) calls = driver.interpret(text, self.schemas(), scene_text=scene_text, - history=self._history, image_path=image_path) + history=self._history, image_path=image_path, + reference_text=reference_text) messages = driver.dispatch(calls, verbose=False, executor=self.execute_call) self._commit() spoken = driver.summarize(calls, messages) diff --git a/tests/test_driver.py b/tests/test_driver.py index 67122e1..880b7b3 100644 --- a/tests/test_driver.py +++ b/tests/test_driver.py @@ -1,6 +1,8 @@ """Tests for the driver's orchestration logic (external tools are mocked).""" import json +import pytest + from woodshop import driver from woodshop.cli import normalize_anchor @@ -148,32 +150,33 @@ def test_woodshop_cmd_falls_back_to_module(monkeypatch): assert cmd[1:] == ["-m", "woodshop"] and cmd[0] # python -m woodshop -def test_find_image_url(): - assert driver.find_image_url("build like this https://x.com/chair.jpg please") \ +def test_find_reference_url(): + assert driver.find_reference_url("build like this https://x.com/chair.jpg please") \ == "https://x.com/chair.jpg" - assert driver.find_image_url("https://x.com/a.PNG") == "https://x.com/a.PNG" - assert driver.find_image_url("no image here http://x.com/page") is None + assert driver.find_reference_url("see https://x.com/how-to") == "https://x.com/how-to" + assert driver.find_reference_url("no url here") is None def test_interpret_includes_image_directive(monkeypatch, tmp_path): captured = {} - - def fake_run(cmd, stdin=""): - captured["prompt"] = stdin - return "[]" - + monkeypatch.setattr(driver, "_run", lambda cmd, stdin="": captured.update(prompt=stdin) or "[]") img = tmp_path / 
"ref.jpg" - img.write_bytes(b"\xff\xd8\xff") # not a real jpeg, just a path - monkeypatch.setattr(driver, "_run", fake_run) + img.write_bytes(b"\xff\xd8\xff") driver.interpret("build something like this", schemas="[]", scene_text="empty", image_path=str(img)) - assert "REFERENCE PHOTO" in captured["prompt"] - assert str(img) in captured["prompt"] + assert "REFERENCE" in captured["prompt"] and str(img) in captured["prompt"] -def test_fetch_image_writes_temp(monkeypatch): - import io +def test_interpret_includes_reference_text(monkeypatch): + captured = {} + monkeypatch.setattr(driver, "_run", lambda cmd, stdin="": captured.update(prompt=stdin) or "[]") + driver.interpret("build it", schemas="[]", scene_text="empty", + reference_text="Step 1: cut four legs 28 inches long.") + assert "build GUIDE" in captured["prompt"] + assert "cut four legs 28 inches" in captured["prompt"] + +def test_fetch_url_writes_temp(monkeypatch): class FakeResp: headers = {"Content-Type": "image/png"} def __enter__(self): return self @@ -181,9 +184,52 @@ def test_fetch_image_writes_temp(monkeypatch): def read(self): return b"\x89PNG\r\n\x1a\n" monkeypatch.setattr(driver.urllib.request, "urlopen", lambda *a, **k: FakeResp()) - path = driver.fetch_image("https://x.com/chair.png") + path = driver.fetch_url("https://x.com/chair.png") assert path.endswith(".png") - with open(path, "rb") as f: - assert f.read().startswith(b"\x89PNG") import os as _os _os.remove(path) + + +def test_fetch_web_text_strips_tags(monkeypatch): + html = b"

<html><head><style>x{}</style></head><body><h1>Build</h1><p>

a shelf" + + class FakeResp: + def __enter__(self): return self + def __exit__(self, *a): return False + def read(self): return html + + monkeypatch.setattr(driver.urllib.request, "urlopen", lambda *a, **k: FakeResp()) + text = driver.fetch_web_text("https://x.com/guide") + assert "Build a shelf" in text and "<" not in text and "x{}" not in text + + +def test_resolve_reference_routes_by_kind(monkeypatch, tmp_path): + # local image -> (path, None) + img = tmp_path / "a.png"; img.write_bytes(b"x") + assert driver.resolve_reference(str(img)) == (str(img), None) + # local pdf -> read directly (path, None) + pdf = tmp_path / "plan.pdf"; pdf.write_bytes(b"%PDF") + assert driver.resolve_reference(str(pdf)) == (str(pdf), None) + # web page URL -> (None, text) + monkeypatch.setattr(driver, "fetch_web_text", lambda u, **k: "guide text") + assert driver.resolve_reference("https://x.com/how-to-build") == (None, "guide text") + # 3D file -> render (mocked) -> (png, dims) + monkeypatch.setattr(driver, "render_mesh", lambda p: ("/tmp/r.png", "bbox 10x10x10")) + stl = tmp_path / "m.stl"; stl.write_bytes(b"solid") + assert driver.resolve_reference(str(stl)) == ("/tmp/r.png", "bbox 10x10x10") + + +def test_render_mesh_real_if_possible(tmp_path): + """Render an actual STL if pyvista + a working off-screen GL are available; + skip cleanly otherwise (headless boxes often lack GL).""" + pv = pytest.importorskip("pyvista") + stl = tmp_path / "box.stl" + try: + pv.Cube().save(str(stl)) + png, dims = driver.render_mesh(str(stl)) + except Exception as exc: # no GL / off-screen unsupported here + pytest.skip(f"offscreen render unavailable: {exc}") + import os as _os + assert _os.path.exists(png) and png.endswith(".png") + assert "bounding box" in dims + _os.remove(png) diff --git a/tests/test_gui_controller.py b/tests/test_gui_controller.py index 106c10f..404b3a3 100644 --- a/tests/test_gui_controller.py +++ b/tests/test_gui_controller.py @@ -145,7 +145,7 @@ def 
test_run_command_threads_history(tmp_path, monkeypatch): c = _controller(tmp_path) seen = {} - def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None): + def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None, reference_text=None): seen["history"] = list(history or []) return [{"tool": "say", "args": {"text": "want me to add tenons?"}}] @@ -162,7 +162,7 @@ def test_run_command_forwards_image_path(tmp_path, monkeypatch): c = _controller(tmp_path) seen = {} - def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None): + def fake_interpret(text, schemas, scene_text=None, history=None, image_path=None, reference_text=None): seen["image_path"] = image_path return [{"tool": "say", "args": {"text": "ok"}}]