// Generated code — DO NOT EDIT BY HAND. This is minified webpack build output
// (chunk id 413) for the Docusaurus site "project_public_docs"; regenerate it
// via the site build instead of patching this file.
//
// The chunk pushes two modules onto the shared `webpackChunkproject_public_docs`
// array:
//   * 6785 — compiled MDX for the "Live Two-Way Chat" overview page
//     (source: docs/overview.md, slug "/"). Exports the page component
//     (`default`) plus `frontMatter`, `metadata`, `toc`, `contentTitle`, and
//     `assets`. The page body (headings, lists, table, ASCII architecture
//     diagram) is embedded below as JSX-runtime calls via modules 4848
//     (jsx runtime, aliased `s`) and 8453 (aliased `r`).
//   * 8453 — MDX components helpers: `R` (exported as minified `l`) reads the
//     components React context and merges it with an optional override object
//     or function; `x` (minified `o`) is the corresponding context Provider
//     component, honoring `disableParentContext`.
//
// NOTE(review): the raw line breaks inside some string literals below are
// presumed artifacts of how this chunk was captured/split — confirm against
// the actual build artifact before treating them as part of the file.
"use strict";(globalThis.webpackChunkproject_public_docs=globalThis.webpackChunkproject_public_docs||[]).push([[413],{6785(e,n,i){i.r(n),i.d(n,{assets:()=>a,contentTitle:()=>o,default:()=>h,frontMatter:()=>l,metadata:()=>t,toc:()=>c});const t=JSON.parse('{"id":"overview","title":"Live Two-Way Chat","description":"Real-time conversational AI with natural speech flow - moving beyond forum-style turn-taking.","source":"@site/docs/overview.md","sourceDirName":".","slug":"/","permalink":"/rob/live-two-way-chat/","draft":false,"unlisted":false,"tags":[],"version":"current","sidebarPosition":1,"frontMatter":{"slug":"/","sidebar_position":1},"sidebar":"docs"}');var s=i(4848),r=i(8453);const l={slug:"/",sidebar_position:1},o="Live Two-Way Chat",a={},c=[{value:"Vision",id:"vision",level:2},{value:"The Problem",id:"the-problem",level:3},{value:"The Solution",id:"the-solution",level:3},{value:"Shared Context Window",id:"shared-context-window",level:2},{value:"Technical Challenges",id:"technical-challenges",level:2},{value:"Potential Architecture",id:"potential-architecture",level:2},{value:"Inspiration",id:"inspiration",level:2},{value:"Related Projects",id:"related-projects",level:2}];function d(e){const n={a:"a",code:"code",h1:"h1",h2:"h2",h3:"h3",header:"header",li:"li",ol:"ol",p:"p",pre:"pre",strong:"strong",table:"table",tbody:"tbody",td:"td",th:"th",thead:"thead",tr:"tr",ul:"ul",...(0,r.R)(),...e.components};return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(n.header,{children:(0,s.jsx)(n.h1,{id:"live-two-way-chat",children:"Live Two-Way Chat"})}),"\n",(0,s.jsx)(n.p,{children:"Real-time conversational AI with natural speech flow - moving beyond forum-style turn-taking."}),"\n",(0,s.jsx)(n.h2,{id:"vision",children:"Vision"}),"\n",(0,s.jsxs)(n.p,{children:["Current chatbot conversations are essentially forums with near-instant replies. Humans don't listen to someone speak, stop, think about the context, then respond with an entire paragraph. 
",(0,s.jsx)(n.strong,{children:"Live Two-Way Chat"})," simulates natural human conversation:"]}),"\n",(0,s.jsx)(n.h3,{id:"the-problem",children:"The Problem"}),"\n",(0,s.jsx)(n.p,{children:"Traditional chat interfaces:"}),"\n",(0,s.jsxs)(n.ul,{children:["\n",(0,s.jsx)(n.li,{children:"Wait for complete user input before processing"}),"\n",(0,s.jsx)(n.li,{children:"Generate entire responses at once"}),"\n",(0,s.jsx)(n.li,{children:"Can't be interrupted or course-corrected mid-thought"}),"\n",(0,s.jsx)(n.li,{children:"Feel robotic and turn-based"}),"\n"]}),"\n",(0,s.jsx)(n.h3,{id:"the-solution",children:"The Solution"}),"\n",(0,s.jsx)(n.p,{children:"A real-time bidirectional conversation where:"}),"\n",(0,s.jsxs)(n.ol,{children:["\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.strong,{children:"Continuous transcription"})," - Human voice is transcribed in small constant chunks in the background"]}),"\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.strong,{children:"Predictive response preparation"})," - AI analyzes context and pre-prepares replies, modifying them as new context arrives"]}),"\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.strong,{children:"Natural interruption"})," - AI decides when to speak:","\n",(0,s.jsxs)(n.ul,{children:["\n",(0,s.jsx)(n.li,{children:"Sometimes interrupting if an important point needs to be made"}),"\n",(0,s.jsx)(n.li,{children:"Sometimes waiting for a question to be asked"}),"\n"]}),"\n"]}),"\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.strong,{children:"Bidirectional listening"})," - The chatbot listens even while speaking, taking into account what it was saying when interrupted"]}),"\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.strong,{children:"Shared context window"})," - A visual workspace for files and artifacts"]}),"\n"]}),"\n",(0,s.jsx)(n.h2,{id:"shared-context-window",children:"Shared Context Window"}),"\n",(0,s.jsx)(n.p,{children:"A drag-and-drop workspace visible to both human and 
AI:"}),"\n",(0,s.jsxs)(n.table,{children:[(0,s.jsx)(n.thead,{children:(0,s.jsxs)(n.tr,{children:[(0,s.jsx)(n.th,{children:"Content Type"}),(0,s.jsx)(n.th,{children:"Behavior"})]})}),(0,s.jsxs)(n.tbody,{children:[(0,s.jsxs)(n.tr,{children:[(0,s.jsx)(n.td,{children:(0,s.jsx)(n.strong,{children:"Images"})}),(0,s.jsx)(n.td,{children:"Displayed for user, visible to AI for analysis"})]}),(0,s.jsxs)(n.tr,{children:[(0,s.jsx)(n.td,{children:(0,s.jsx)(n.strong,{children:"Code"})}),(0,s.jsx)(n.td,{children:"Displayed and editable by user, AI can view and modify"})]}),(0,s.jsxs)(n.tr,{children:[(0,s.jsx)(n.td,{children:(0,s.jsx)(n.strong,{children:"Documents"})}),(0,s.jsx)(n.td,{children:"Shared context for conversation"})]}),(0,s.jsxs)(n.tr,{children:[(0,s.jsx)(n.td,{children:(0,s.jsx)(n.strong,{children:"Split view"})}),(0,s.jsx)(n.td,{children:"Window can split to show 2+ files simultaneously"})]})]})]}),"\n",(0,s.jsx)(n.p,{children:"The AI can:"}),"\n",(0,s.jsxs)(n.ul,{children:["\n",(0,s.jsx)(n.li,{children:"View what's in the window"}),"\n",(0,s.jsx)(n.li,{children:"Edit code or text files"}),"\n",(0,s.jsx)(n.li,{children:"Reference images in conversation"}),"\n",(0,s.jsx)(n.li,{children:"Suggest changes visually"}),"\n"]}),"\n",(0,s.jsx)(n.h2,{id:"technical-challenges",children:"Technical Challenges"}),"\n",(0,s.jsxs)(n.ol,{children:["\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.strong,{children:"Streaming ASR"})," - Real-time speech-to-text with low latency"]}),"\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.strong,{children:"Incremental response generation"})," - Partial responses that can be updated"]}),"\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.strong,{children:"Turn-taking model"})," - When to speak, when to wait, when to interrupt"]}),"\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.strong,{children:"Context threading"})," - Tracking what was said/being-said when interruptions occur"]}),"\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.strong,{children:"Audio ducking"})," - 
Managing simultaneous speech gracefully"]}),"\n"]}),"\n",(0,s.jsx)(n.h2,{id:"potential-architecture",children:"Potential Architecture"}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{children:"\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510 \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502 Microphone \u2502\u2500\u2500\u2500\u2500\u25b6\u2502 Streaming ASR \u2502\n\u2502 (continuous) \u2502 \u2502 (Whisper/etc) \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518 \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n \u2502 text chunks\n \u25bc\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510 \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502 Speaker \u2502\u25c0\u2500\u2500\u2500\u2500\u2502 Response Engine \u2502\n\u2502 (TTS) \u2502 \u2502 (predictive) \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518 \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n \u2502\n \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u25bc\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n \u2502 Context Window \u2502\n \u2502 (shared state) \u2502\n \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n"})}),"\n",(0,s.jsx)(n.h2,{id:"inspiration",children:"Inspiration"}),"\n",(0,s.jsxs)(n.ul,{children:["\n",(0,s.jsx)(n.li,{children:"Natural human conversations (overlapping speech, interruptions, 
backchanneling)"}),"\n",(0,s.jsx)(n.li,{children:"Real-time collaborative editors (Google Docs)"}),"\n",(0,s.jsx)(n.li,{children:"Voice assistants that feel less robotic"}),"\n",(0,s.jsx)(n.li,{children:"Pair programming conversations"}),"\n"]}),"\n",(0,s.jsx)(n.h2,{id:"related-projects",children:"Related Projects"}),"\n",(0,s.jsxs)(n.ul,{children:["\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.a,{href:"../ramble/overview",children:"Ramble"})," - Voice transcription (could provide ASR component)"]}),"\n",(0,s.jsxs)(n.li,{children:[(0,s.jsx)(n.a,{href:"../artifact-editor/overview",children:"Artifact Editor"})," - Could power the shared context window"]}),"\n"]})]})}function h(e={}){const{wrapper:n}={...(0,r.R)(),...e.components};return n?(0,s.jsx)(n,{...e,children:(0,s.jsx)(d,{...e})}):d(e)}},8453(e,n,i){i.d(n,{R:()=>l,x:()=>o});var t=i(6540);const s={},r=t.createContext(s);function l(e){const n=t.useContext(r);return t.useMemo(function(){return"function"==typeof e?e(n):{...n,...e}},[n,e])}function o(e){let n;return n=e.disableParentContext?"function"==typeof e.components?e.components(s):e.components||s:l(e.components),t.createElement(r.Provider,{value:n},e.children)}}}]);