diff --git a/src/chatbot/chatbot_core.py b/src/chatbot/chatbot_core.py
index 24b8eef09..946d0f39a 100644
--- a/src/chatbot/chatbot_core.py
+++ b/src/chatbot/chatbot_core.py
@@ -3,12 +3,11 @@
 import os
 import re
 import json
-from typing import Dict, Any, Tuple, List
+from typing import Dict, Any, Tuple, List, Generator, Optional, Callable
 from .error_solutions import get_error_solution
 from .image_handler import analyze_and_extract
-from .ollama_runner import run_ollama
+from .ollama_runner import run_ollama, run_ollama_stream, get_embedding, CONFIG
 from .knowledge_base import search_knowledge
-from .ollama_runner import get_embedding
 
 # ==================== ESIM WORKFLOW KNOWLEDGE ====================
 
@@ -115,10 +114,12 @@ def clear_history() -> None:
 
 # ==================== ESIM ERROR LOGIC ====================
 
-def answer_with_rag_fallback(user_input: str) -> str:
+def answer_with_rag_fallback(user_input: str,
+                             on_chunk: Optional[Callable[[str], None]] = None) -> str:
     """
     Try to answer using eSim manuals (RAG).
     If nothing relevant is found, fallback to Ollama.
+    Streaming is enabled when CONFIG["streaming"]["enabled"] is true (default).
     """
 
     rag_context = search_knowledge(user_input)
@@ -137,7 +138,7 @@ def answer_with_rag_fallback(user_input: str) -> str:
 
 Answer clearly and step-by-step.
 """
-        return run_ollama(prompt)
+        return run_ollama(prompt, on_chunk=on_chunk)
 
     # Fallback: general LLM answer
     prompt = f"""
@@ -145,7 +146,7 @@ def answer_with_rag_fallback(user_input: str) -> str:
 
 {user_input}
 """
-    return run_ollama(prompt)
+    return run_ollama(prompt, on_chunk=on_chunk)
 
 def detect_esim_errors(image_context: Dict[str, Any], user_input: str) -> str:
     """
@@ -257,10 +258,12 @@ def clean_response_raw(raw: str) -> str:
     return cleaned.strip()
 
 
-def _history_to_text(history: List[Dict[str, str]] | None, max_turns: int = 6) -> str:
-    """Convert history to readable text with MORE context (6 turns)."""
+def _history_to_text(history: List[Dict[str, str]] | None, max_turns: int | None = None) -> str:
+    """Convert history to readable text. max_turns defaults to CONFIG.history.context_turns."""
     if not history:
         return ""
+    if max_turns is None:
+        max_turns = int(CONFIG.get("history", {}).get("context_turns", 6))
     recent = history[-max_turns:]
     lines: List[str] = []
     for i, t in enumerate(recent, 1):
@@ -282,30 +285,30 @@ def _is_follow_up_question(user_input: str, history: List[Dict[str, str]] | None
     """
     if not history:
         return False
-    
+
     user_lower = user_input.lower().strip()
     words = user_lower.split()
-    
-    
+
     if len(words) <= 7:
         return True
-    
+
     pronouns = ["it", "that", "this", "those", "these", "they", "them"]
     if any(pronoun in words for pronoun in pronouns):
         return True
-    
+
     continuations = [
         "what next", "next step", "after that", "and then", "then what",
         "what about", "how about", "what if", "but why", "why not"
     ]
     if any(phrase in user_lower for phrase in continuations):
         return True
-    
+
     question_starters = ["why", "how", "where", "when", "what", "which"]
     if words[0] in question_starters and len(words) <= 5:
         return True
-    
+
     return False
+
 import numpy as np
 
 def is_semantic_topic_switch(
@@ -358,7 +361,7 @@ def classify_question_type(user_input: str, has_image_context: bool,
                            history: List[Dict[str, str]] | None = None) -> str:
     """
     Classify question type for smart routing.
-    Returns: 'greeting', 'simple', 'esim', 'image_query', 'follow_up_image', 
+    Returns: 'greeting', 'simple', 'esim', 'image_query', 'follow_up_image',
              'follow_up', 'netlist'
     """
     user_lower = user_input.lower()
@@ -373,7 +376,7 @@ def classify_question_type(user_input: str, has_image_context: bool,
         follow_phrases = [
             "this circuit", "that circuit", "in this schematic",
             "components here", "what is the value", "how many",
-            "the circuit", "this schematic","what","can","how"
+            "the circuit", "this schematic", "what", "can", "how"
         ]
         if any(p in user_lower for p in follow_phrases):
             return "follow_up_image"
@@ -388,9 +391,8 @@ def classify_question_type(user_input: str, has_image_context: bool,
         print("[COPILOT] Topic switch detected (semantic)")
         is_followup = False
 
-    if not is_followup:
+    if not is_followup and history is not None:
         history.clear()
-        LAST_IMAGE_CONTEXT = None
 
     esim_keywords = [
         "esim", "kicad", "ngspice", "spice", "simulation", "netlist",
@@ -422,50 +424,55 @@ def handle_greeting() -> str:
     )
 
 
-def handle_simple_question(user_input: str) -> str:
+def handle_simple_question(user_input: str,
+                           on_chunk: Optional[Callable[[str], None]] = None) -> str:
     """
     Handles standalone questions.
-    Uses RAG first, then falls back to Ollama.
-    keep in mind that your a copilot of eSim an EDA tool
+    Uses RAG first, then falls back to Ollama. Streaming-aware.
     """
-    return answer_with_rag_fallback(user_input)
+    return answer_with_rag_fallback(user_input, on_chunk=on_chunk)
 
 
 def handle_follow_up(user_input: str,
                      image_context: Dict[str, Any],
-                     history: List[Dict[str, str]] | None = None) -> str:
+                     history: List[Dict[str, str]] | None = None,
+                     on_chunk: Optional[Callable[[str], None]] = None) -> str:
     """
     Handle follow-up questions that depend on conversation history.
     This handler PRIORITIZES history over RAG.
     """
-    history_text = _history_to_text(history, max_turns=6)
-    
+    history_text = _history_to_text(history)
+
     if not history_text:
         return "I need more context. Could you provide more details about your question?"
-    
+
     rag_context = ""
     user_lower = user_input.lower()
     if any(kw in user_lower for kw in ["model", "spice", "ground", "error", "netlist"]):
-        rag_context = search_knowledge(user_input, n_results=2)
-    
+        n = int(CONFIG.get("rag", {}).get("follow_up_n_results", 2))
+        rag_context = search_knowledge(user_input, n_results=n)
+
+    follow_up_rule = CONFIG["system_rules"].get("follow_up_rule", "")
+
     prompt = (
         "You are an eSim expert assistant. The user is asking a follow-up question.\n\n"
+        f"{follow_up_rule}\n\n"
         "=== CONVERSATION HISTORY (MOST IMPORTANT) ===\n"
         f"{history_text}\n"
         "=============================================\n\n"
         f"=== CURRENT USER QUESTION (FOLLOW-UP) ===\n{user_input}\n\n"
     )
-    
+
     if rag_context:
         prompt += f"=== REFERENCE MANUAL (if needed) ===\n{rag_context}\n\n"
-    
+
     if image_context:
         prompt += (
             f"=== CURRENT CIRCUIT CONTEXT ===\n"
             f"Type: {image_context.get('circuit_analysis', {}).get('circuit_type', 'Unknown')}\n"
             f"Components: {image_context.get('components', [])}\n\n"
         )
-    
+
     prompt += (
         "CRITICAL INSTRUCTIONS:\n"
         "1. The user's question refers to the CONVERSATION HISTORY above.\n"
@@ -477,13 +484,14 @@ def handle_follow_up(user_input: str,
         "7. Keep answer concise (max 150 words).\n\n"
         "Answer:"
     )
-    
-    return run_ollama(prompt, mode="default")
+
+    return run_ollama(prompt, mode="follow_up", on_chunk=on_chunk)
 
 
 def handle_esim_question(user_input: str,
                          image_context: Dict[str, Any],
-                         history: List[Dict[str, str]] | None = None) -> str:
+                         history: List[Dict[str, str]] | None = None,
+                         on_chunk: Optional[Callable[[str], None]] = None) -> str:
     """
     Handle eSim-specific questions with RAG + conversation history.
     """
@@ -500,11 +508,12 @@ def handle_esim_question(user_input: str,
         )
         if cmd:
             answer += f"**eSim action:** {cmd}\n"
-        return answer_with_rag_fallback(user_input)
+        return answer_with_rag_fallback(user_input, on_chunk=on_chunk)
 
-    history_text = _history_to_text(history, max_turns=6)
+    history_text = _history_to_text(history)
 
-    rag_context = search_knowledge(user_input, n_results=5)
+    n_results = int(CONFIG.get("rag", {}).get("default_n_results", 5))
+    rag_context = search_knowledge(user_input, n_results=n_results)
 
     image_context_str = ""
     if image_context:
@@ -515,16 +524,19 @@ def handle_esim_question(user_input: str,
             f"Values: {image_context.get('values', {})}\n"
         )
 
+    esim_rule = CONFIG["system_rules"].get("esim_rule", "")
+
     prompt = (
         "You are an eSim expert. Answer using the workflows, manual, and conversation history.\n\n"
+        f"{esim_rule}\n\n"
         f"{ESIM_WORKFLOWS}\n\n"
         f"=== MANUAL CONTEXT ===\n{rag_context}\n"
         f"{image_context_str}\n"
     )
-    
+
     if history_text:
         prompt += f"=== CONVERSATION HISTORY ===\n{history_text}\n\n"
-    
+
     prompt += (
         f"USER QUESTION: {user_input}\n\n"
         "INSTRUCTIONS:\n"
@@ -535,7 +547,7 @@ def handle_esim_question(user_input: str,
         "Answer:"
     )
 
-    return run_ollama(prompt, mode="default")
+    return run_ollama(prompt, mode="default", on_chunk=on_chunk)
 
 
 def handle_image_query(user_input: str) -> Tuple[str, Dict[str, Any]]:
@@ -585,7 +597,8 @@ def handle_image_query(user_input: str) -> Tuple[str, Dict[str, Any]]:
 
 
 def handle_follow_up_image_question(user_input: str,
-                                    image_context: Dict[str, Any]) -> str:
+                                    image_context: Dict[str, Any],
+                                    on_chunk: Optional[Callable[[str], None]] = None) -> str:
     """
     Answer questions about an analyzed image using ONLY extracted data.
     """
@@ -613,23 +626,27 @@ def handle_follow_up_image_question(user_input: str,
         "Answer:"
     )
 
-    return run_ollama(prompt, mode="default")
+    return run_ollama(prompt, mode="default", on_chunk=on_chunk)
 
 
-def handle_netlist_analysis(user_input: str) -> str:
+def handle_netlist_analysis(user_input: str,
+                            on_chunk: Optional[Callable[[str], None]] = None) -> str:
     """
     Handle netlist analysis prompts (FACT-based prompt from GUI).
     """
-    raw_reply = run_ollama(user_input)
+    raw_reply = run_ollama(user_input, on_chunk=on_chunk)
     return clean_response_raw(raw_reply)
 
 
 # ==================== MAIN ROUTER ====================
 
 def handle_input(user_input: str,
-                 history: List[Dict[str, str]] | None = None) -> str:
+                 history: List[Dict[str, str]] | None = None,
+                 on_chunk: Optional[Callable[[str], None]] = None) -> str:
     """
-    Main router. Accepts optional conversation history for follow-up understanding.
+    Main router. Accepts optional conversation history for follow-up understanding,
+    and an optional on_chunk(text) callback that's invoked for each streamed token.
+    Returns the full assembled response.
     """
     global LAST_IMAGE_CONTEXT, LAST_BOT_REPLY
 
@@ -638,7 +655,7 @@ def handle_input(user_input: str,
         return "Please enter a query."
 
     if "[ESIM_NETLIST_START]" in user_input:
-        raw_reply = run_ollama(user_input)
+        raw_reply = run_ollama(user_input, on_chunk=on_chunk)
         cleaned = clean_response_raw(raw_reply)
         LAST_BOT_REPLY = cleaned
         return cleaned
@@ -650,24 +667,36 @@ def handle_input(user_input: str,
 
     try:
         if question_type == "netlist":
-            response = handle_netlist_analysis(user_input)
+            response = handle_netlist_analysis(user_input, on_chunk=on_chunk)
 
         elif question_type == "greeting":
             response = handle_greeting()
+            if on_chunk:
+                on_chunk(response)  # send the greeting through the stream once
 
         elif question_type == "image_query":
             response, LAST_IMAGE_CONTEXT = handle_image_query(user_input)
+            if on_chunk:
+                on_chunk(response)
 
         elif question_type == "follow_up_image":
-            response = handle_follow_up_image_question(user_input, LAST_IMAGE_CONTEXT)
+            response = handle_follow_up_image_question(
+                user_input, LAST_IMAGE_CONTEXT, on_chunk=on_chunk
+            )
 
         elif question_type == "simple":
-            response = handle_simple_question(user_input)
+            response = handle_simple_question(user_input, on_chunk=on_chunk)
 
         elif question_type == "follow_up" and history:
-            response = handle_follow_up(user_input, LAST_IMAGE_CONTEXT, history)
+            response = handle_follow_up(
+                user_input, LAST_IMAGE_CONTEXT, history, on_chunk=on_chunk
+            )
+        elif question_type == "esim":
+            response = handle_esim_question(
+                user_input, LAST_IMAGE_CONTEXT, history, on_chunk=on_chunk
+            )
         else:
-            response = handle_simple_question(user_input)
+            response = handle_simple_question(user_input, on_chunk=on_chunk)
 
         LAST_BOT_REPLY = response
         return response
@@ -678,24 +707,82 @@ def handle_input(user_input: str,
         return error_msg
 
 
+def handle_input_stream(user_input: str,
+                        history: List[Dict[str, str]] | None = None
+                        ) -> Generator[str, None, None]:
+    """
+    Streaming variant of handle_input. Runs handle_input on a worker thread and
+    yields every chunk it passes to on_chunk, in arrival order.
+
+    If handle_input returns a non-empty reply without ever calling on_chunk
+    (e.g. the empty-query or error-message paths), that reply is yielded once,
+    so the consumer never gets an empty stream for a non-empty answer.
+    """
+    import queue
+    import threading
+
+    q: "queue.Queue[Optional[str]]" = queue.Queue()
+    streamed = False
+
+    def _on_chunk(text: str) -> None:
+        nonlocal streamed
+        streamed = True
+        q.put(text)
+
+    def _worker() -> None:
+        try:
+            reply = handle_input(user_input, history, on_chunk=_on_chunk)
+            if reply and not streamed:
+                q.put(reply)  # nothing was streamed: deliver the whole reply once
+        finally:
+            q.put(None)  # sentinel: always sent so the consumer loop terminates
+
+    threading.Thread(target=_worker, daemon=True).start()
+    # iter(callable, sentinel) keeps calling q.get() until it returns None.
+    yield from iter(q.get, None)
+
+
 # ==================== WRAPPER ====================
 
 class ESIMCopilotWrapper:
     def __init__(self) -> None:
         self.history: List[Dict[str, str]] = []
+        self._max_history = int(CONFIG.get("history", {}).get("max_turns", 12))
+
+    def _trim(self) -> None:
+        # Delete in place: a [-n:] slice keeps the whole list when n == 0; n <= 0 keeps nothing.
+        del self.history[:max(0, len(self.history) - self._max_history)]
+        # History now holds at most the newest _max_history turns.
 
-    def handle_input(self, user_input: str) -> str:
-        reply = handle_input(user_input, self.history)
+    def handle_input(self, user_input: str,
+                     on_chunk: Optional[Callable[[str], None]] = None) -> str:
+        reply = handle_input(user_input, self.history, on_chunk=on_chunk)
         self.history.append({"user": user_input, "bot": reply})
-        if len(self.history) > 12:
-            self.history = self.history[-12:]
+        self._trim()
         return reply
 
+    def handle_input_stream(self, user_input: str) -> Generator[str, None, None]:
+        """
+        Yield chunks as they arrive; once the stream ends, record the joined reply in history.
+        """
+        collected: List[str] = []
+        for chunk in handle_input_stream(user_input, self.history):
+            collected.append(chunk)
+            yield chunk
+        full = "".join(collected).strip()
+        self.history.append({"user": user_input, "bot": full})
+        self._trim()
+
     def analyze_schematic(self, query: str) -> str:
         return self.handle_input(query)
 
+
 _GLOBAL_WRAPPER = ESIMCopilotWrapper()
 
 
 def analyze_schematic(query: str) -> str:
     return _GLOBAL_WRAPPER.handle_input(query)
+
+
+def analyze_schematic_stream(query: str) -> Generator[str, None, None]:
+    """Streaming alternative to analyze_schematic."""
+    return _GLOBAL_WRAPPER.handle_input_stream(query)
\ No newline at end of file
diff --git a/src/chatbot/chatbot_thread.py b/src/chatbot/chatbot_thread.py
index f134d4f57..5008830cd 100644
--- a/src/chatbot/chatbot_thread.py
+++ b/src/chatbot/chatbot_thread.py
@@ -1,5 +1,6 @@
 import os
 import re
+import json
 import socket
 import subprocess
 import time
@@ -23,38 +24,119 @@
     _PIL_AVAILABLE = False
 
 
+# ── Built-in default prompts (used when config.json is missing, unreadable, or omits them) ──
+
+_DEFAULT_SYSTEM_PROMPT = """You are an expert electronics engineer and the AI assistant embedded inside eSim, an open-source EDA tool developed by FOSSEE at IIT Bombay.
+
+Your expertise includes:
+- KiCad schematic capture, symbols, labels, ERC issues, footprints
+- NgSpice simulations and SPICE netlists
+- Circuit debugging and simulation troubleshooting
+- eSim workflow: KiCad → netlist → NgSpice → analysis
+
+Rules:
+- Be practical, direct, and technically useful.
+- Match response length to question complexity.
+- For debugging, explain both WHY and HOW to fix the issue.
+- When code or SPICE is needed, use a fenced code block.
+- If uncertain, say likely / appears to be, but still provide analysis.
+"""
+
+_DEFAULT_VISION_SYSTEM_PROMPT = """You are an expert electronics engineer and the AI assistant inside eSim, an open-source EDA tool by FOSSEE at IIT Bombay.
+
+You are given one or more schematic images from eSim or KiCad. Read every visible label, net name, component reference, value, and pin number, and answer the user's question accurately and helpfully. Never refuse to analyse. Be concise and use the visible reference designators (R1, C3, U2, etc.).
+"""
+
+
+# ── Configuration layer (config.json) ─────────────────────────────────────────
+#
+# config.json sits next to this file (src/chatbot/config.json). Editing it lets
+# you change the assistant's system rules and model parameters WITHOUT touching
+# code — restart eSim and the new behaviour takes effect. If the file is missing
+# or malformed, the built-in defaults below are used so the app still runs.
+
+_CONFIG_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.json")
+
+_DEFAULT_CONFIG = {
+    "system_rules": {
+        "text_system_prompt": _DEFAULT_SYSTEM_PROMPT,
+        "vision_system_prompt": _DEFAULT_VISION_SYSTEM_PROMPT,
+    },
+    "context_window": {
+        "text_num_ctx": 1024,
+        "vision_num_ctx": 1024,
+        "vision_num_predict": 512,
+    },
+    "sampling": {
+        "repeat_penalty": 1.08,
+        "vision_temperature": 0.15,
+        "vision_repeat_penalty": 1.05,
+    },
+    "runtime": {
+        "keep_alive": "-1m",
+    },
+    "history": {
+        "max_lines": 6,
+    },
+}
+
+
+def _deep_merge(base: dict, override: dict) -> dict:
+    """Return a new dict with `override` merged over `base`; nested dicts from `base` are copied, not shared."""
+    out = {k: _deep_merge(v, {}) if isinstance(v, dict) else v for k, v in base.items()}
+    for k, v in (override or {}).items():
+        both_dicts = isinstance(out.get(k), dict) and isinstance(v, dict)
+        # Dict-over-dict merges key by key; any other pairing replaces the value outright.
+        out[k] = _deep_merge(out[k], v) if both_dicts else v
+    return out
+
+
+def load_config() -> dict:
+    """Load config.json merged over a private deep copy of the built-in defaults."""
+    import copy  # function-scope import keeps this block self-contained
+    cfg = copy.deepcopy(_DEFAULT_CONFIG)  # callers must never receive the shared defaults
+    try:
+        with open(_CONFIG_PATH, "r", encoding="utf-8") as f:
+            cfg = _deep_merge(cfg, json.load(f))
+        print(f"[CONFIG] Loaded {_CONFIG_PATH}")
+    except FileNotFoundError:
+        print(f"[CONFIG] No config.json found at {_CONFIG_PATH} — using defaults.")
+    except Exception as e:
+        print(f"[CONFIG] Failed to read config.json ({e}) — using defaults.")
+    return cfg
+
+
+CONFIG = load_config()
+
+# Resolve the active prompts from config (with fallback to the constants).
+_SYSTEM_PROMPT = CONFIG["system_rules"].get("text_system_prompt", _DEFAULT_SYSTEM_PROMPT)
+_VISION_SYSTEM_PROMPT = CONFIG["system_rules"].get("vision_system_prompt", _DEFAULT_VISION_SYSTEM_PROMPT)
+
+
 # ── Image preprocessing ───────────────────────────────────────────────────────
 
-# llava internally resizes images to 336×336 anyway.
-# Downscaling large images before sending saves encoding time and reduces
-# the number of tokens the model spends on the image.
-_MAX_IMAGE_DIM = 512   # pixels on longest side — fast, good quality
+_MAX_IMAGE_DIM = 512
 
 
 def _downscale_image_bytes(raw_bytes: bytes) -> bytes:
-    """
-    Downscale image to _MAX_IMAGE_DIM on the longest side using PIL.
-    Returns original bytes if PIL is unavailable or image is already small.
-    """
     if not _PIL_AVAILABLE:
         return raw_bytes
     try:
         img = _PilImage.open(_io.BytesIO(raw_bytes))
         w, h = img.size
         if max(w, h) <= _MAX_IMAGE_DIM:
-            return raw_bytes          # already small enough
+            return raw_bytes
         scale  = _MAX_IMAGE_DIM / max(w, h)
         new_w  = max(1, int(w * scale))
         new_h  = max(1, int(h * scale))
         img    = img.resize((new_w, new_h), _PilImage.LANCZOS)
-        # Convert to RGB (handles RGBA/P mode PNGs)
         if img.mode not in ("RGB", "L"):
             img = img.convert("RGB")
         buf = _io.BytesIO()
         img.save(buf, format="JPEG", quality=85)
         return buf.getvalue()
     except Exception:
-        return raw_bytes              # fall back to original on any error
+        return raw_bytes
 
 
 # ── Connectivity / runtime helpers ───────────────────────────────────────────
@@ -71,7 +153,6 @@ def _check_internet(host="8.8.8.8", port=53, timeout=2):
 
 
 def get_stt_backend() -> str:
-    """Returns 'google' if SpeechRecognition is installed, else 'none'."""
     if _SR_AVAILABLE:
         return "google"
     return "none"
@@ -87,12 +168,6 @@ def is_ollama_running():
 
 
 def start_ollama(stop_flag=None):
-    """
-    Start Ollama server if needed.
-
-    Accepts an optional stop_flag callable so the caller can cancel startup.
-    The polling loop checks stop_flag() each second and exits early if cancelled.
-    """
     if os.name == 'nt':
         subprocess.Popen('start cmd /k "ollama serve"', shell=True)
     else:
@@ -102,7 +177,6 @@ def start_ollama(stop_flag=None):
              'gnome-terminal -- ollama serve || '
              'xterm -e "ollama serve"']
         )
-
     for _ in range(30):
         if stop_flag is not None and stop_flag():
             return False
@@ -164,9 +238,7 @@ def run(self):
                 if name:
                     names.append(name)
 
-            # Keep the vision model cache warm so image sends don't block
             _refresh_model_cache()
-
             self.result_signal.emit(names if names else ['qwen2.5-coder:3b'])
         except Exception:
             self.result_signal.emit(['qwen2.5-coder:3b'])
@@ -175,13 +247,10 @@ def run(self):
 # ── Smart token budget ───────────────────────────────────────────────────────
 
 _COMPLEX_KEYWORDS = {
-    # netlist / SPICE
     'netlist', 'spice', 'ngspice', '.tran', '.ac', '.dc', '.model',
     'subcircuit', 'convergence', 'singular', 'timestep',
-    # debugging
     'error', 'debug', 'fix', 'wrong', 'issue', 'problem', 'fail',
     'simulate', 'simulation', 'analyse', 'analyze',
-    # circuit design
     'schematic', 'kicad', 'footprint', 'component', 'resistor',
     'capacitor', 'mosfet', 'transistor', 'opamp', 'voltage', 'current',
 }
@@ -193,87 +262,30 @@ def run(self):
 
 
 def _smart_num_predict(user_messages: list, user_override: int = 1024) -> int:
-    """
-    Choose a token budget based on message complexity rather than a flat cap.
-    This is the single biggest speed improvement: most answers need far fewer
-    tokens than the hard 1024 limit.
-
-    Tiers (all well within correct/complete answer range for each type):
-      simple question  →  256 tokens  (~1-2 min on slow CPU, ~20s on fast)
-      technical        →  512 tokens  (~2-4 min on slow CPU, ~40s on fast)
-      netlist/debug    →  768 tokens  (~3-6 min on slow CPU, ~60s on fast)
-
-    If the user has manually set a lower budget via the settings slider,
-    we always respect that as an upper bound.
-
-    Combines up to 4 recent history lines (2 user turns + their bot replies)
-    for a better complexity signal than looking at user messages alone.
-    """
-    # Combine the last 4 history lines (up to 2 user turns + their bot replies)
     combined = " ".join(
         line[5:].lower() for line in user_messages[-4:]
         if line.startswith("User:")
     )
-
-    # Check for complex technical content
     is_complex = any(kw in combined for kw in _COMPLEX_KEYWORDS)
-    # Check for simple definitional questions
     is_simple  = any(kw in combined for kw in _SIMPLE_KEYWORDS) and not is_complex
-    # Long message = detailed question = needs detailed answer
     is_long    = len(combined) > 300
 
     if is_simple and not is_long:
-        budget = 256
+        budget = 128
     elif is_complex or is_long:
-        budget = 768
+        budget = 512
     else:
-        budget = 384
+        budget = 256
 
-    # Respect the user's slider setting as a ceiling
     return min(budget, user_override)
 
 
-# ── System prompts ────────────────────────────────────────────────────────────
-
-_SYSTEM_PROMPT = """You are an expert electronics engineer and the AI assistant embedded inside eSim, an open-source EDA tool developed by FOSSEE at IIT Bombay.
-
-Your expertise includes:
-- KiCad schematic capture, symbols, labels, ERC issues, footprints
-- NgSpice simulations and SPICE netlists
-- Circuit debugging and simulation troubleshooting
-- FPGA, MCU, power, reset, SPI, pull-up/pull-down, decoupling and connector review
-- Reading partial schematic screenshots from EDA tools
-- eSim workflow: KiCad → netlist → NgSpice → analysis
-
-Rules:
-- Be practical, direct, and technically useful.
-- Match response length to question complexity.
-- For debugging, explain both WHY and HOW to fix the issue.
-- When code or SPICE is needed, use a fenced code block.
-- If the user provides a schematic image, analyze the visible block instead of giving a generic refusal.
-- If uncertain, say likely / appears to be, but still provide analysis.
-"""
-
-
-_VISION_SYSTEM_PROMPT = """You are an expert electronics engineer and the AI assistant inside eSim, an open-source EDA tool by FOSSEE at IIT Bombay.
-
-You are given one or more schematic images from eSim or KiCad. Your job is to answer the user's question about those images as accurately and helpfully as possible.
-
-Rules:
-- Read every visible label, net name, component reference, value, and pin number from the image.
-- If the user asks a specific question (e.g. "how to build this in eSim", "what does this component do", "why is this connection wrong"), answer THAT question directly and completely.
-- If no specific question is given, describe the circuit: identify its function, list components, and flag any design issues.
-- Never refuse to analyse. If parts of the image are unclear, do your best and note any uncertainty.
-- Be concise and practical. Match the length of your answer to the complexity of the question.
-- When referring to components, use their visible reference designators (R1, C3, U2, etc.).
-"""
-
-
 # ── Text chat worker ──────────────────────────────────────────────────────────
 
 class OllamaWorker(QThread):
     response_signal = pyqtSignal(str)
     status_signal = pyqtSignal(str)
+    chunk_signal = pyqtSignal(str)
 
     def __init__(self, chat_history, model="qwen2.5-coder:3b",
                  temperature=0.25, num_predict=1024):
@@ -294,7 +306,7 @@ def run(self):
                 started = start_ollama(stop_flag=lambda: self._stop_requested)
                 if not started:
                     if self._stop_requested:
-                        return  # user cancelled cleanly
+                        return
                     self.response_signal.emit(
                         "❌ Could not start Ollama automatically.\n"
                         "Please open a terminal and run: ollama serve"
@@ -303,20 +315,22 @@ def run(self):
                 self.status_signal.emit("Ollama started! Getting response…")
                 time.sleep(1)
 
-            # Keep last 10 history lines (5 turns).
-            # Sending 20 lines fills most of the context window before the
-            # question is even added, forcing the model to load more tokens.
+            # config-driven history window + system prompt
+            max_lines = int(CONFIG.get("history", {}).get("max_lines", 6))
             messages = [{"role": "system", "content": _SYSTEM_PROMPT}]
-            for line in self.chat_history[-10:]:
+            for line in self.chat_history[-max_lines:]:
                 if line.startswith("User:"):
                     messages.append({"role": "user", "content": line[5:].strip()})
                 elif line.startswith("Bot:"):
                     messages.append({"role": "assistant", "content": line[4:].strip()})
 
-            # Smart token budget: short questions get fewer tokens so they
-            # finish faster; complex questions still get enough for a full answer.
             budget = _smart_num_predict(self.chat_history, self.num_predict)
 
+            # config-driven model options
+            num_ctx       = int(CONFIG.get("context_window", {}).get("text_num_ctx", 1024))
+            repeat_pen    = float(CONFIG.get("sampling", {}).get("repeat_penalty", 1.08))
+            keep_alive    = CONFIG.get("runtime", {}).get("keep_alive", "-1m")
+
             stream = ollama.chat(
                 model=self.model,
                 messages=messages,
@@ -324,15 +338,9 @@ def run(self):
                 options={
                     "temperature": self.temperature,
                     "num_predict": budget,
-                    # 2048 ctx handles 5 turns of history + question comfortably.
-                    # Allocating 4096 forces Ollama to malloc a larger KV-cache,
-                    # adding 2-4s overhead before token 1 is generated.
-                    "num_ctx": 2048,
-                    "repeat_penalty": 1.08,
-                    # Keep model loaded in RAM between requests.
-                    # Without this, Ollama unloads after 5 min and the next
-                    # question pays a 30-60s reload cost.
-                    "keep_alive": "10m",
+                    "num_ctx": num_ctx,
+                    "repeat_penalty": repeat_pen,
+                    "keep_alive": keep_alive,
                 }
             )
 
@@ -341,7 +349,9 @@ def run(self):
                 if self._stop_requested:
                     bot_response += "\n\n⏹ Generation stopped."
                     break
-                bot_response += chunk["message"]["content"]
+                piece = chunk["message"]["content"]
+                bot_response += piece
+                self.chunk_signal.emit(piece)
 
             bot_response = bot_response.strip()
             if not bot_response:
@@ -368,7 +378,8 @@ def _is_vision_model(model_name: str) -> bool:
     return any(k in m for k in [
         "llava", "bakllava", "vision", "moondream", "minicpm-v", "qwen2-vl"
     ])
-# QThread reads/writes don't produce a data race.
+
+
 _cache_lock = threading.Lock()
 _installed_models_cache: list = []
 _installed_models_cache_valid: bool = False
@@ -397,12 +408,6 @@ def _refresh_model_cache():
 
 
 def _pick_best_vision_model(preferred: str = "") -> str:
-    """
-    Pick the fastest available vision model.
-    Priority: user-selected (if vision-capable) → moondream → llava:7b → llava → llava:13b
-    Smaller/faster models come FIRST so CPU inference is quick.
-    Uses a cached model list — no blocking ollama.list() call at send time.
-    """
     with _cache_lock:
         cache_valid = _installed_models_cache_valid
         cache_copy  = list(_installed_models_cache)
@@ -414,48 +419,35 @@ def _pick_best_vision_model(preferred: str = "") -> str:
 
     installed_map = {name.lower(): name for name in cache_copy}
 
-    # If the user explicitly selected a vision model, respect that choice first
     if preferred and _is_vision_model(preferred):
         return preferred
 
-    # Prefer smaller/faster models for speed on CPU
     speed_order = [
-        "moondream",       # ~1.6 GB — fastest
-        "llava:7b",        # ~4 GB  — good balance
-        "llava",           # ~4 GB  — default tag (usually 7b)
-        "bakllava",        # ~4 GB
-        "llava:13b",       # ~8 GB  — slowest, last resort
+        "moondream",
+        "llava:7b",
+        "llava",
+        "bakllava",
+        "llava:13b",
     ]
     for cand in speed_order:
         if cand.lower() in installed_map:
             return installed_map[cand.lower()]
 
-    # Fallback: any installed vision model
     for name in cache_copy:
         if _is_vision_model(name):
             return name
 
-    # No vision-capable model is installed.
-    # Return None so the caller can show a clear error instead of
-    # sending the images to a text-only model that will hallucinate.
     return None
 
 
 def _build_schematic_vision_prompt(extra_prompt: str, image_count: int) -> str:
-    """
-    Build the prompt sent to the vision model alongside the image(s).
-    If the user typed a question, that question drives the response.
-    If no question was given, request a general circuit analysis.
-    """
     n = "this schematic" if image_count == 1 else f"these {image_count} schematics"
     if extra_prompt and extra_prompt.strip():
-        # User asked something specific - make that the primary request.
         return (
             f"Looking at {n}: {extra_prompt.strip()}\n\n"
             "Base your answer on what is actually visible in the image."
         )
     else:
-        # No question given - do a general analysis.
         return (
             f"Please analyse {n}. "
             "Identify the circuit's function, list all visible components with their "
@@ -469,6 +461,7 @@ def _build_schematic_vision_prompt(extra_prompt: str, image_count: int) -> str:
 class OllamaVisionWorker(QThread):
     response_signal = pyqtSignal(str)
     status_signal = pyqtSignal(str)
+    chunk_signal = pyqtSignal(str)
 
     def __init__(self, image_paths=None, extra_prompt: str = "",
                  model: str = "llava", image_path: str = ""):
@@ -489,6 +482,11 @@ def stop(self):
         self._stop_requested = True
 
     def _chat_once(self, model_name: str, prompt: str, image_bytes_list):
+        # config-driven vision options
+        vc = CONFIG.get("context_window", {})
+        vs = CONFIG.get("sampling", {})
+        keep_alive = CONFIG.get("runtime", {}).get("keep_alive", "10m")
+
         stream = ollama.chat(
             model=model_name,
             messages=[
@@ -501,13 +499,11 @@ def _chat_once(self, model_name: str, prompt: str, image_bytes_list):
             ],
             stream=True,
             options={
-                "temperature": 0.15,
-                # llava: ~576 tokens/image patch + ~200 prompt + 512 predict.
-                # 3072 gives comfortable headroom without the overhead of 4096.
-                "num_ctx": 3072,
-                "num_predict": 512,
-                "repeat_penalty": 1.05,
-                "keep_alive": "10m",
+                "temperature": float(vs.get("vision_temperature", 0.15)),
+                "num_ctx": int(vc.get("vision_num_ctx", 3072)),  # image + prompt + 512 predict > 1024
+                "num_predict": int(vc.get("vision_num_predict", 512)),
+                "repeat_penalty": float(vs.get("vision_repeat_penalty", 1.05)),
+                "keep_alive": keep_alive,
             }
         )
 
@@ -515,14 +511,13 @@ def _chat_once(self, model_name: str, prompt: str, image_bytes_list):
         token_count   = 0
         for chunk in stream:
             if self._stop_requested:
-                response += "\n\n\u23f9 Generation stopped."
+                response += "\n\n⏹ Generation stopped."
                 break
             piece       = chunk["message"]["content"]
             response   += piece
             token_count += 1
+            self.chunk_signal.emit(piece)
 
-            # Emit progress every 20 tokens so the status label
-            # shows the model is actively working
             if token_count % 20 == 0:
                 self.status_signal.emit(
                     f"Generating… ({token_count} tokens so far)"
@@ -549,10 +544,6 @@ def run(self):
                 self.response_signal.emit("❌ No image paths provided.")
                 return
 
-            # Load and downscale images before sending.
-            # llava resizes internally to 336px anyway; sending 4K images
-            # just wastes encoding time. Downscaling to 512px is invisible
-            # to the model but saves significant transfer overhead.
             image_bytes_list = []
             for path in self.image_paths:
                 if not os.path.exists(path):
@@ -569,9 +560,6 @@ def run(self):
             vision_model = _pick_best_vision_model(self.model)
 
             if vision_model is None:
-                # No vision-capable model is installed. A text model cannot
-                # see images and will hallucinate plausible-sounding but
-                # completely fabricated answers.
                 self.response_signal.emit(
                     "❌ No vision model is installed.\n\n"
                     "Image analysis requires a vision-capable model. "
@@ -620,7 +608,6 @@ class MicWorker(QThread):
     status_signal = pyqtSignal(str)
 
     def run(self):
-        """Record from microphone and transcribe using Google Speech Recognition."""
         if not _SR_AVAILABLE:
             self.error_signal.emit(
                 "SpeechRecognition not installed.\nRun:  pip install SpeechRecognition pyaudio"
diff --git a/src/chatbot/image_handler.py b/src/chatbot/image_handler.py
index cd8744791..9d1ff3e89 100644
--- a/src/chatbot/image_handler.py
+++ b/src/chatbot/image_handler.py
@@ -5,15 +5,24 @@
 import time
 from typing import Dict, Any
 from PIL import Image
-MAX_IMAGE_BYTES = int(0.5*1024 * 1024)  
-from .ollama_runner import run_ollama_vision
+
+from .ollama_runner import run_ollama_vision, CONFIG
+
+# ==================== CONFIG-DRIVEN LIMITS ====================
+
+_IMG_CFG = CONFIG.get("image", {})
+MAX_IMAGE_MB = float(_IMG_CFG.get("max_size_mb", 0.5))
+MAX_IMAGE_BYTES = int(MAX_IMAGE_MB * 1024 * 1024)
+MAX_WIDTH = int(_IMG_CFG.get("max_width", 1920))
+MAX_HEIGHT = int(_IMG_CFG.get("max_height", 1080))
+VISION_MAX_RETRIES = max(1, int(_IMG_CFG.get("vision_max_retries", 2)))  # never 0 attempts
 
 # === IMPORT PADDLE OCR ===
 try:
     from paddleocr import PaddleOCR
     import logging
     logging.getLogger("ppocr").setLevel(logging.ERROR)
-    
+
     # CRITICAL FIX: Disabled MKLDNN and Angle Classification to prevent VM Crashes
     ocr_engine = PaddleOCR(
         use_angle_cls=False,    # <--- MUST BE FALSE TO STOP SIGABRT
@@ -21,7 +30,7 @@
         use_gpu=False,          # Force CPU
         enable_mkldnn=False,    # <--- MUST BE FALSE FOR PADDLE v3 COMPATIBILITY
         use_mp=False,           # Disable multiprocessing
-        show_log=False 
+        show_log=False
     )
     HAS_PADDLE = True
     print("[INIT] PaddleOCR initialized (Safe Mode).")
@@ -40,7 +49,8 @@ def encode_image(image_path: str) -> str:
 def optimize_image_for_vision(image_path: str) -> bytes:
     """
     Resize large images to reduce vision model processing time.
-    Target: Max 1920x1080 while maintaining aspect ratio.
+    Target: Max MAX_WIDTH x MAX_HEIGHT (from config.json) while maintaining
+    aspect ratio.
     """
     try:
         img = Image.open(image_path)
@@ -48,12 +58,8 @@ def optimize_image_for_vision(image_path: str) -> bytes:
         if img.mode not in ('RGB', 'L'):
             img = img.convert('RGB')
 
-        max_width = 1920
-        max_height = 1080
-
-        if img.width > max_width or img.height > max_height:
-            # Calculate scaling factor
-            scale = min(max_width / img.width, max_height / img.height)
+        if img.width > MAX_WIDTH or img.height > MAX_HEIGHT:
+            scale = min(MAX_WIDTH / img.width, MAX_HEIGHT / img.height)
             new_size = (int(img.width * scale), int(img.height * scale))
             img = img.resize(new_size, Image.Resampling.LANCZOS)
             print(f"[IMAGE] Resized from {img.width}x{img.height} to {new_size[0]}x{new_size[1]}")
@@ -91,55 +97,41 @@ def extract_text_with_paddle(image_path: str) -> str:
         print(f"[OCR] PaddleOCR Failed: {e}")
         return ""
 
+
+def _empty_result(error: str = "", design_errors=None) -> Dict[str, Any]:
+    """Build the blank analysis payload returned when a schematic can't be analysed."""
+    analysis = {
+        "circuit_type": "Unknown",
+        "design_errors": design_errors or [],
+        "design_warnings": [],
+    }
+    result = {"error": error, "vision_summary": "", "component_counts": {}}
+    result["circuit_analysis"] = analysis
+    result["components"] = []
+    result["values"] = {}
+    # Key order mirrors the dict literals this helper replaced.
+    return result
+
+
 def analyze_and_extract(image_path: str) -> Dict[str, Any]:
     """
-    Analyze schematic with image optimization, PaddleOCR text injection, and timeout handling.
-    Rejects images larger than 0.5 MB.
+    Analyze schematic with image optimization, PaddleOCR text injection, and
+    timeout handling. Rejects images larger than CONFIG.image.max_size_mb.
     """
     if not os.path.exists(image_path):
-        return {
-            "error": "Image file not found",
-            "vision_summary": "",
-            "component_counts": {},
-            "circuit_analysis": {
-                "circuit_type": "Unknown",
-                "design_errors": [],
-                "design_warnings": []
-            },
-            "components": [],
-            "values": {}
-        }
+        return _empty_result("Image file not found")
 
     try:
         file_size = os.path.getsize(image_path)
     except OSError as e:
-        return {
-            "error": f"Could not read image size: {e}",
-            "vision_summary": "",
-            "component_counts": {},
-            "circuit_analysis": {
-                "circuit_type": "Unknown",
-                "design_errors": [],
-                "design_warnings": []
-            },
-            "components": [],
-            "values": {}
-        }
+        return _empty_result(f"Could not read image size: {e}")
 
     if file_size > MAX_IMAGE_BYTES:
         size_mb = round(file_size / (1024 * 1024), 2)
-        return {
-            "error": f"Image too large ({size_mb} MB). Max allowed size is 0.5 MB.",
-            "vision_summary": "",
-            "component_counts": {},
-            "circuit_analysis": {
-                "circuit_type": "Unknown",
-                "design_errors": ["Image file size exceeded 0.5 MB limit"],
-                "design_warnings": []
-            },
-            "components": [],
-            "values": {}
-        }
+        return _empty_result(
+            f"Image too large ({size_mb} MB). Max allowed size is {MAX_IMAGE_MB} MB.",
+            design_errors=[f"Image file size exceeded {MAX_IMAGE_MB} MB limit"],
+        )
 
     # === OPTIMIZE IMAGE BEFORE SENDING ===
     print(f"[VISION] Processing image: {os.path.basename(image_path)}")
@@ -178,10 +170,9 @@ def analyze_and_extract(image_path: str) -> Dict[str, Any]:
 RESPOND WITH JSON ONLY.
 """
 
-    max_retries = 2
-    for attempt in range(max_retries):
+    for attempt in range(VISION_MAX_RETRIES):
         try:
-            print(f"[VISION] Attempt {attempt + 1}/{max_retries}...")
+            print(f"[VISION] Attempt {attempt + 1}/{VISION_MAX_RETRIES}...")
 
             response_text = run_ollama_vision(prompt, image_bytes)
 
@@ -224,21 +215,12 @@ def analyze_and_extract(image_path: str) -> Dict[str, Any]:
 
         except Exception as e:
             print(f"[VISION] Attempt {attempt + 1} failed: {str(e)}")
-            if attempt == max_retries - 1:
-                return {
-                    "error": f"Vision analysis failed: {str(e)}",
-                    "vision_summary": "Unable to analyze circuit image",
-                    "component_counts": {},
-                    "circuit_analysis": {
-                        "circuit_type": "Unknown",
-                        "design_errors": ["Analysis timed out or failed"],
-                        "design_warnings": []
-                    },
-                    "components": [],
-                    "values": {}
-                }
+            if attempt == VISION_MAX_RETRIES - 1:
+                return _empty_result(
+                    f"Vision analysis failed: {str(e)}",
+                    design_errors=["Analysis timed out or failed"],
+                )
             else:
-                import time
                 time.sleep(2)
 
 
diff --git a/src/chatbot/ollama_runner.py b/src/chatbot/ollama_runner.py
index ae754bd0b..49defddb6 100644
--- a/src/chatbot/ollama_runner.py
+++ b/src/chatbot/ollama_runner.py
@@ -2,6 +2,7 @@
 import ollama
 import json
 import time
+from typing import Generator, Iterable, Optional
 
 # ==================== CLIENT ====================
 
@@ -10,20 +11,114 @@
     timeout=300.0,
 )
 
-# ==================== SETTINGS ====================
+# ==================== CONFIG (config.json) ====================
+
+# Path resolution: prefer config.json sitting next to this file (inside the
+# chatbot package). Fall back to the per-user settings directory.
+_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+_PACKAGE_CONFIG_PATH = os.path.join(_THIS_DIR, "config.json")
 
 _SETTINGS_DIR = os.path.join(
     os.path.expanduser("~"), ".local", "share", "esim-copilot"
 )
 _SETTINGS_PATH = os.path.join(_SETTINGS_DIR, "settings.json")
+_USER_CONFIG_PATH = os.path.join(_SETTINGS_DIR, "config.json")
 
 _DEFAULT_TEXT_MODEL = "qwen2.5:3b"
 _DEFAULT_VISION_MODEL = "minicpm-v:latest"
 EMBED_MODEL = "nomic-embed-text"
 
+# Built-in defaults (used if config.json is missing). These mirror config.json
+# so the runner still works in a degraded state.
+_DEFAULT_CONFIG = {
+    "models": {
+        "text_model": _DEFAULT_TEXT_MODEL,
+        "vision_model": _DEFAULT_VISION_MODEL,
+        "embed_model": EMBED_MODEL,
+    },
+    "system_rules": {
+        "text_system_prompt": (
+            "You are eSim Copilot, an expert assistant for the eSim EDA tool. "
+            "Be concise, accurate, and practical."
+        ),
+        "vision_system_prompt": (
+            "You are an expert Electronics Engineer using eSim. "
+            "Analyze the schematic image carefully and output JSON only."
+        ),
+    },
+    "context_window": {
+        "text_num_ctx": 4096,
+        "text_num_predict": 512,
+        "vision_num_ctx": 4096,
+        "vision_num_predict": 1024,
+        "follow_up_num_predict": 350,
+    },
+    "sampling": {
+        "temperature": 0.05,
+        "top_p": 0.9,
+        "repeat_penalty": 1.1,
+        "vision_temperature": 0.0,
+    },
+    "streaming": {"enabled": True},
+    "history": {"max_turns": 12, "context_turns": 6},
+    "image": {
+        "max_size_mb": 0.5,
+        "max_width": 1920,
+        "max_height": 1080,
+        "vision_max_retries": 2,
+    },
+    "rag": {"default_n_results": 5, "follow_up_n_results": 2},
+    "stt": {"samplerate": 16000, "max_silence_sec": 3, "phrase_limit_sec": 8},
+}
+
+
+def _deep_merge(base: dict, override: dict) -> dict:
+    """Return a copy of base with override layered on top, section by section."""
+    merged = dict(base)
+    for key, value in (override or {}).items():
+        current = merged.get(key)
+        both_dicts = isinstance(current, dict) and isinstance(value, dict)
+        # Only dict-on-dict recurses; any other pairing is a plain overwrite.
+        merged[key] = _deep_merge(current, value) if both_dicts else value
+    return merged
+
+
+def load_config() -> dict:
+    """
+    Load the customizable configuration layer.
+
+    Priority: user config.json > shipped <package>/config.json > built-ins.
+    The result never aliases _DEFAULT_CONFIG, so callers may mutate it.
+    """
+    import copy  # local import: only this function needs it
+    cfg = copy.deepcopy(_DEFAULT_CONFIG)
+    for path in (_PACKAGE_CONFIG_PATH, _USER_CONFIG_PATH):  # later path wins
+        if os.path.isfile(path):
+            try:
+                with open(path, "r", encoding="utf-8") as f:
+                    cfg = _deep_merge(cfg, json.load(f))
+            except Exception as e:
+                print(f"[CONFIG] Failed to load {path}: {e}")
+    return cfg
+
+
+CONFIG = load_config()
+
+
+def reload_config() -> dict:
+    """Re-read config.json and refresh CONFIG in place (importers share the dict)."""
+    fresh = load_config()
+    CONFIG.update(fresh)  # mutate, never rebind: other modules hold this object
+    for stale in set(CONFIG) - set(fresh):
+        del CONFIG[stale]
+    reload_model_settings()  # legacy settings.json keeps winning for model picks
+    return CONFIG
+
+
+# ==================== LEGACY SETTINGS (kept for backward compat) ====================
 
 def load_model_settings() -> dict:
-    """Load persisted model preferences from disk."""
+    """Load persisted model preferences from disk (legacy settings.json)."""
     try:
         with open(_SETTINGS_PATH, "r", encoding="utf-8") as f:
             return json.load(f)
@@ -32,7 +127,7 @@ def load_model_settings() -> dict:
 
 
 def save_model_settings(text_model: str, vision_model: str) -> None:
-    """Persist model preferences to disk."""
+    """Persist model preferences to disk (legacy settings.json)."""
     os.makedirs(_SETTINGS_DIR, exist_ok=True)
     try:
         with open(_SETTINGS_PATH, "w", encoding="utf-8") as f:
@@ -51,18 +146,20 @@ def list_available_models() -> list:
         return [_DEFAULT_TEXT_MODEL, _DEFAULT_VISION_MODEL, EMBED_MODEL]
 
 
-# Load settings and initialise model dicts
-_settings = load_model_settings()
+# Merge legacy settings.json on top of config.json (legacy wins for model picks
+# so old users keep their previous choices).
+_legacy = load_model_settings()
+_cfg_models = CONFIG.get("models", {})
 
-VISION_MODELS = {"primary": _settings.get("vision_model", _DEFAULT_VISION_MODEL)}
-TEXT_MODELS   = {"default": _settings.get("text_model",   _DEFAULT_TEXT_MODEL)}
+VISION_MODELS = {"primary": _legacy.get("vision_model", _cfg_models.get("vision_model", _DEFAULT_VISION_MODEL))}
+TEXT_MODELS   = {"default": _legacy.get("text_model",   _cfg_models.get("text_model",   _DEFAULT_TEXT_MODEL))}
 
 
 def reload_model_settings() -> None:
     """Re-read settings from disk and update running dicts (called after save)."""
     s = load_model_settings()
-    VISION_MODELS["primary"] = s.get("vision_model", _DEFAULT_VISION_MODEL)
-    TEXT_MODELS["default"]   = s.get("text_model",   _DEFAULT_TEXT_MODEL)
+    VISION_MODELS["primary"] = s.get("vision_model", CONFIG["models"].get("vision_model", _DEFAULT_VISION_MODEL))
+    TEXT_MODELS["default"]   = s.get("text_model",   CONFIG["models"].get("text_model",   _DEFAULT_TEXT_MODEL))
 
 
 # ==================== VISION ====================
@@ -70,6 +167,12 @@ def reload_model_settings() -> None:
 def run_ollama_vision(prompt: str, image_input) -> str:
     """Call vision model with Chain-of-Thought for better accuracy."""
     model = VISION_MODELS["primary"]
+    ctx = CONFIG["context_window"]
+    samp = CONFIG["sampling"]
+    sys_prompt = CONFIG["system_rules"].get(
+        "vision_system_prompt",
+        _DEFAULT_CONFIG["system_rules"]["vision_system_prompt"],
+    )
 
     try:
         import base64
@@ -85,9 +188,9 @@ def run_ollama_vision(prompt: str, image_input) -> str:
         else:
             raise ValueError("Invalid image input format")
 
+        # Compose system prompt with the analysis schema requirements.
         system_prompt = (
-            "You are an expert Electronics Engineer using eSim.\n"
-            "Analyze the schematic image carefully.\n\n"
+            f"{sys_prompt}\n\n"
             "STEP 1: THINKING PROCESS\n"
             "- List visible components (e.g., 'I see 4 diodes in a bridge...').\n"
             "- Trace connections (e.g., 'Resistor R1 is in series...').\n"
@@ -119,9 +222,9 @@ def run_ollama_vision(prompt: str, image_input) -> str:
                 },
             ],
             options={
-                "temperature": 0.0,
-                "num_ctx": 8192,
-                "num_predict": 1024,
+                "temperature": samp.get("vision_temperature", 0.0),
+                "num_ctx": ctx.get("vision_num_ctx", 4096),
+                "num_predict": ctx.get("vision_num_predict", 1024),
             },
         )
 
@@ -150,29 +253,70 @@ def run_ollama_vision(prompt: str, image_input) -> str:
         })
 
 
-# ==================== TEXT ====================
-
-def run_ollama(prompt: str, mode: str = "default") -> str:
-    """Run text model with focused parameters."""
+# ==================== TEXT (with streaming) ====================
+
+def _build_text_options(mode: str = "default") -> dict:
+    """Translate the context-window and sampling config into Ollama `options`."""
+    window = CONFIG["context_window"]
+    sampling = CONFIG["sampling"]
+    budget = window.get("text_num_predict", 512)
+    if mode == "follow_up":
+        # Follow-ups get their own budget, defaulting to the normal one.
+        budget = window.get("follow_up_num_predict", budget)
+    options = {"temperature": sampling.get("temperature", 0.05)}
+    options["num_ctx"] = window.get("text_num_ctx", 4096)
+    options["num_predict"] = budget
+    options["top_p"] = sampling.get("top_p", 0.9)
+    options["repeat_penalty"] = sampling.get("repeat_penalty", 1.1)
+    return options
+
+
+def _text_messages(prompt: str) -> list:
+    """Pair the configurable system rule with the user's prompt for Ollama chat."""
+    rules = CONFIG["system_rules"]
+    # Fall back to the built-in wording when the key is absent.
+    system_rule = rules.get(
+        "text_system_prompt", _DEFAULT_CONFIG["system_rules"]["text_system_prompt"]
+    )
+    system_msg = {"role": "system", "content": system_rule}
+    user_msg = {"role": "user", "content": prompt}
+    return [system_msg, user_msg]
+
+
+def run_ollama(
+    prompt: str,
+    mode: str = "default",
+    stream: Optional[bool] = None,
+    on_chunk=None,
+) -> str:
+    """
+    Run text model.
+
+    - If `stream` is True (or None and CONFIG["streaming"]["enabled"] is True),
+      tokens are streamed from Ollama and accumulated into the final string.
+    - `on_chunk(text)` is called for each streamed chunk (useful for live UI).
+    - Returns the full assembled response.
+    """
     model = TEXT_MODELS.get(mode, TEXT_MODELS["default"])
+    if stream is None:
+        stream = bool(CONFIG.get("streaming", {}).get("enabled", True))
 
     try:
+        if stream:
+            collected = []
+            for chunk in run_ollama_stream(prompt, mode=mode):
+                collected.append(chunk)
+                if on_chunk is not None:
+                    try:
+                        on_chunk(chunk)
+                    except Exception as cb_e:
+                        print(f"[STREAM CALLBACK] {cb_e}")
+            return "".join(collected).strip()
+
         resp = ollama_client.chat(
             model=model,
-            messages=[
-                {
-                    "role": "system",
-                    "content": "You are an eSim and electronics expert. Be concise, accurate, and practical.",
-                },
-                {"role": "user", "content": prompt},
-            ],
-            options={
-                "temperature": 0.05,
-                "num_ctx": 2048,
-                "num_predict": 400,
-                "top_p": 0.9,
-                "repeat_penalty": 1.1,
-            },
+            messages=_text_messages(prompt),
+            options=_build_text_options(mode),
         )
         return resp["message"]["content"].strip()
 
@@ -180,13 +324,42 @@ def run_ollama(prompt: str, mode: str = "default") -> str:
         return f"[Error] {str(e)}"
 
 
+def run_ollama_stream(prompt: str, mode: str = "default") -> Generator[str, None, None]:
+    """
+    Yield the reply to `prompt` piece by piece as Ollama produces it.
+
+    Empty pieces are skipped. A failure is reported in-band as a final
+    "[Error] ..." chunk rather than raised.
+    """
+    model = TEXT_MODELS.get(mode, TEXT_MODELS["default"])
+    try:
+        response = ollama_client.chat(
+            model=model,
+            messages=_text_messages(prompt),
+            options=_build_text_options(mode),
+            stream=True,
+        )
+        for part in response:
+            if isinstance(part, dict):
+                text = part.get("message", {}).get("content", "")
+            else:
+                # ollama-python may also yield objects exposing .message.content
+                message = getattr(part, "message", None)
+                text = "" if message is None else getattr(message, "content", "")
+            if text:
+                yield text
+    except Exception as e:
+        yield f"[Error] {str(e)}"
+
+
 # ==================== EMBEDDINGS ====================
 
 def get_embedding(text: str):
     """Get text embeddings for RAG."""
     try:
-        r = ollama_client.embeddings(model=EMBED_MODEL, prompt=text)
+        embed_model = CONFIG["models"].get("embed_model", EMBED_MODEL)
+        r = ollama_client.embeddings(model=embed_model, prompt=text)
         return r["embedding"]
     except Exception as e:
         print(f"[EMBED ERROR] {e}")
-        return None
+        return None
\ No newline at end of file
diff --git a/src/chatbot/stt_handler.py b/src/chatbot/stt_handler.py
index f2d536066..f7fc8fa4a 100644
--- a/src/chatbot/stt_handler.py
+++ b/src/chatbot/stt_handler.py
@@ -3,6 +3,8 @@
 import queue
 import time
 
+from .ollama_runner import CONFIG
+
 try:
     import sounddevice as sd
     from vosk import Model, KaldiRecognizer
@@ -20,6 +22,14 @@
     "esim-copilot", "vosk-model-small-en-us-0.15",
 )
 
+# ==================== CONFIG-DRIVEN DEFAULTS ====================
+
+_STT_CFG = CONFIG.get("stt", {})
+DEFAULT_SAMPLERATE = int(_STT_CFG.get("samplerate", 16000))
+DEFAULT_MAX_SILENCE_SEC = int(_STT_CFG.get("max_silence_sec", 3))
+DEFAULT_PHRASE_LIMIT_SEC = int(_STT_CFG.get("phrase_limit_sec", 8))
+
+
 def _get_model():
     global _MODEL
     if not _HAS_STT:
@@ -37,13 +47,29 @@ def _get_model():
         _MODEL = Model(model_path)
     return _MODEL
 
-def listen_to_mic(should_stop=lambda: False, max_silence_sec=3, samplerate=16000, phrase_limit_sec=8) -> str:
+
+def listen_to_mic(
+    should_stop=lambda: False,
+    max_silence_sec=None,
+    samplerate=None,
+    phrase_limit_sec=None,
+) -> str:
     """
-    Offline STT using Vosk.
+    Offline STT using Vosk. All timing knobs default to values in config.json
+    (CONFIG.stt) so users can tune without touching the source.
+
     Returns recognized text, or "" if cancelled / timed out.
     """
     if not _HAS_STT:
         raise RuntimeError("Speech-to-text is not installed or failed to load.")
+
+    if max_silence_sec is None:
+        max_silence_sec = DEFAULT_MAX_SILENCE_SEC
+    if samplerate is None:
+        samplerate = DEFAULT_SAMPLERATE
+    if phrase_limit_sec is None:
+        phrase_limit_sec = DEFAULT_PHRASE_LIMIT_SEC
+
     q = queue.Queue()
     rec = KaldiRecognizer(_get_model(), samplerate)
 
@@ -89,4 +115,4 @@ def callback(indata, frames, time_info, status):
                     started = True
                     t_speech = now
 
-        return json.loads(rec.FinalResult()).get("text", "").strip()
+        return json.loads(rec.FinalResult()).get("text", "").strip()
\ No newline at end of file
diff --git a/src/chatbot/test_copilot_config.py b/src/chatbot/test_copilot_config.py
new file mode 100644
index 000000000..4cb614264
--- /dev/null
+++ b/src/chatbot/test_copilot_config.py
@@ -0,0 +1,32 @@
+# test_copilot_config.py
+"""Manual smoke check: does the chatbot worker pick up config.json?"""
+import os
+import sys
+
+# Put <repo>/src on the path relative to this file, not the current directory.
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+def main() -> int:
+    """Print the loaded settings; return 0 on success and 1 on any failure."""
+    try:
+        from chatbot.chatbot_thread import OllamaWorker
+        print("[SUCCESS] Module imports matched cleanly.")
+        # Instantiate the worker to trigger config loading
+        cfg = OllamaWorker([]).config_data
+        print("\n--- Parsing Verification ---")
+        print(f"Target Text Model: {cfg.get('models', {}).get('text_model')}")
+        print(f"Sampling Temp:     {cfg.get('sampling', {}).get('temperature')}")
+        print(f"Context Window:    {cfg.get('context_window', {}).get('text_num_ctx')} tokens")
+    except Exception as e:
+        print(f"\n[CRITICAL FAULT] Test environment broke down: {e}")
+        return 1
+    if not cfg:
+        print("\n[FAILED] config.json data dictionary is empty. Check your directory paths.")
+        return 1
+    print("\n[PASSED] config.json successfully loaded and mapped to the backend thread!")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/frontEnd/Application.py b/src/frontEnd/Application.py
index 20a529731..709977da0 100644
--- a/src/frontEnd/Application.py
+++ b/src/frontEnd/Application.py
@@ -27,7 +27,8 @@
     init_path = ''
 else:
     import pathmagic    # noqa:F401
-    init_path = '../../'
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    init_path = os.path.abspath(os.path.join(current_dir, "..", "..")) + os.sep
 
 from PyQt5 import QtGui, QtCore, QtWidgets
 from PyQt5.Qt import QSize
@@ -125,7 +126,7 @@ def initchatbot(self):
             }
         """)
         self.addDockWidget(QtCore.Qt.RightDockWidgetArea, self.chatbot_dock)
-        self.chatbot_dock.hide()  # Hidden by default; toggled by the icon button
+        self.chatbot_dock.show()  # <--- Force it to open inside the layout on startup
         # When user closes dock via the X button, reposition the floating icon
         self.chatbot_dock.visibilityChanged.connect(
             lambda _: self._reposition_chatbot_icon()
@@ -837,4 +838,4 @@ def main(args):
     try:
         main(sys.argv)
     except Exception as err:
-        print("Error: ", err)
\ No newline at end of file
+        print("Error: ", err)
diff --git a/src/frontEnd/Chatbot.py b/src/frontEnd/Chatbot.py
index 1a3e75701..88db50067 100644
--- a/src/frontEnd/Chatbot.py
+++ b/src/frontEnd/Chatbot.py
@@ -33,12 +33,8 @@
 
 _IMG_FILTER = "Images (*.png *.jpg *.jpeg *.bmp *.gif *.tiff)"
 _IMAGE_EXTS = {'.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff', '.tif', '.webp'}
-# NgSpice logs can be 10-50 KB; sending all of it blows past num_ctx: 2048.
-# 60 lines is enough for any meaningful error message while staying well inside
-# the context window even with history prepended.
+
 _MAX_ERROR_LOG_LINES = 60
-# _save_history() is called after every bot response; without debouncing this
-# causes synchronous I/O on the main thread on every message.
 _SAVE_DEBOUNCE_MS = 5000
 
 WELCOME_MESSAGE = """
@@ -92,13 +88,8 @@ def _typing_bubble(frame=0):
 # ── Markdown renderer ─────────────────────────────────────────────────────────
 
 def _render_inline(text):
-    """
-    Renders inline markdown: **bold**, *italic*, `code`, # headings, and [links](url).
-    """
-    # Escape HTML special chars first so subsequent substitutions are safe
     text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
 
-    # Headings (must be processed line-by-line because they are block-level)
     def _render_headings(t):
         lines = t.split('\n')
         out = []
@@ -116,32 +107,23 @@ def _render_headings(t):
             else:
                 out.append(line)
         return '\n'.join(out)
-
     text = _render_headings(text)
 
-    # Bold (**text** or __text__)
     text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
     text = re.sub(r'__(.*?)__',     r'<b>\1</b>', text)
-
-    # Italic (*text* or _text_) — processed after bold so ** is already gone
     text = re.sub(r'\*(.*?)\*', r'<i>\1</i>', text)
     text = re.sub(r'_(.*?)_',   r'<i>\1</i>', text)
-
-    # Inline code (`code`)
     text = re.sub(
         r'`([^`]+)`',
         r'<span style="font-family:Consolas,monospace;background-color:#e8ecf0;'
         r'padding:1px 4px;border-radius:3px;">\1</span>',
         text
     )
-
-    # Markdown links [text](url)
     text = re.sub(
         r'\[([^\]]+)\]\((https?://[^\)]+)\)',
         r'<a href="\2" style="color:#0078d4;">\1</a>',
         text
     )
-
     text = text.replace('\n', '<br>')
     return text
 
@@ -150,12 +132,10 @@ def _render_markdown(text):
     result = []
     pattern = re.compile(r'```(\w*)\n?(.*?)```', re.DOTALL)
     last_end = 0
-
     for match in pattern.finditer(text):
         before = text[last_end:match.start()]
         if before:
             result.append(_render_inline(before))
-
         lang = match.group(1) or 'code'
         code = (
             match.group(2)
@@ -175,7 +155,6 @@ def _render_markdown(text):
             f'{label}{code}</div></td></tr></table>'
         )
         last_end = match.end()
-
     tail = text[last_end:]
     if tail:
         result.append(_render_inline(tail))
@@ -198,7 +177,6 @@ def _escape_text_preserve_breaks(text: str) -> str:
 
 
 def _image_thumbnail_html(b64_str: str, filename: str) -> str:
-    """Render a saved image as an inline base64 thumbnail in the chat."""
     safe_name = filename.replace('&', '&amp;').replace('<', '&lt;')
     return (
         '<table width="100%" cellpadding="0" cellspacing="0"><tr>'
@@ -248,7 +226,6 @@ def _bot_bubble(text, timestamp, response_idx):
     copy_href  = f'copy:///{response_idx}'
     retry_href = f'retry:///{response_idx}'
     token_est = _approx_token_count(text)
-
     return (
         '<table width="100%" cellpadding="0" cellspacing="0"><tr>'
         '<td align="left" style="padding:4px 0 0 10px;">'
@@ -371,13 +348,11 @@ def _parse_custom_url(url):
     scheme = url.scheme()
     host = url.host()
     path = url.path().strip('/')
-
     parts = []
     if host:
         parts.append(host)
     if path:
         parts.extend([p for p in path.split('/') if p])
-
     return scheme, parts
 
 
@@ -479,7 +454,7 @@ def __init__(self, session: dict, parent=None):
         )
         top_layout.addWidget(title_lbl, 1)
 
-        meta_lbl = QLabel(f"{n_usr} msg{'s' if n_usr!=1 else ''}  ·  {updated[:10]}  ·  {kind}")
+        meta_lbl = QLabel(f"{n_usr} msg{'s' if n_usr != 1 else ''}  ·  {updated[:10]}  ·  {kind}")
         meta_lbl.setStyleSheet("font-size:10px; color:#aaa; background:transparent;")
         top_layout.addWidget(meta_lbl)
 
@@ -508,7 +483,6 @@ def __init__(self, session: dict, parent=None):
                 font-size:13px;
             }
         """)
-
         html = ""
         for line in msgs:
             if line.startswith("User:"):
@@ -559,7 +533,6 @@ def __init__(self, title: str, parent=None):
         super().__init__(parent, Qt.FramelessWindowHint | Qt.Dialog)
         self.setAttribute(Qt.WA_TranslucentBackground)
         self.setMinimumWidth(320)
-
         outer = QWidget(self)
         outer.setObjectName("card")
         outer.setStyleSheet("""
@@ -569,7 +542,6 @@ def __init__(self, title: str, parent=None):
                 border: 1px solid #e0e0e0;
             }
         """)
-
         card_layout = QVBoxLayout(outer)
         card_layout.setContentsMargins(28, 24, 28, 20)
         card_layout.setSpacing(14)
@@ -641,7 +613,6 @@ def __init__(self, session_id: str, title: str, date: str,
         self.session_id = session_id
         self.title = title
         self.kind = kind
-
         self.setMinimumHeight(78)
         self.setStyleSheet("QWidget { background: transparent; }")
 
@@ -673,13 +644,11 @@ def __init__(self, session_id: str, title: str, date: str,
         title_row = QHBoxLayout()
         title_row.setSpacing(4)
         title_row.setContentsMargins(0, 0, 0, 0)
-
         title_lbl = QLabel(title[:22] + ("…" if len(title) > 22 else ""))
         title_lbl.setStyleSheet(
             "font-size:12px; font-weight:700; color:#1a1a2e; background:transparent;"
         )
         title_row.addWidget(title_lbl, 1)
-
         date_lbl = QLabel(date)
         date_lbl.setStyleSheet("font-size:10px; color:#bbb; background:transparent;")
         title_row.addWidget(date_lbl)
@@ -688,13 +657,11 @@ def __init__(self, session_id: str, title: str, date: str,
         meta_row = QHBoxLayout()
         meta_row.setSpacing(4)
         meta_row.setContentsMargins(0, 0, 0, 0)
-
         kind_lbl = QLabel()
         kind_lbl.setText(_session_kind_badge(kind))
         kind_lbl.setTextFormat(Qt.RichText)
         kind_lbl.setStyleSheet("background:transparent;")
         meta_row.addWidget(kind_lbl)
-
         if msg_count > 0:
             count_lbl = QLabel(str(msg_count))
             count_lbl.setFixedSize(20, 16)
@@ -753,7 +720,6 @@ def __init__(self, session_id: str, title: str, date: str,
         """)
         self._del_btn.clicked.connect(self._on_delete_clicked)
         btn_col.addWidget(self._del_btn)
-
         btn_col.addStretch()
         outer.addLayout(btn_col)
 
@@ -776,7 +742,6 @@ def __init__(self, parent=None):
         super().__init__(parent)
         self.setFixedWidth(290)
         self._all_sessions_cache = []
-
         self.setStyleSheet("""
             QWidget {
                 background:#ffffff;
@@ -875,7 +840,6 @@ def __init__(self, parent=None):
         """)
         delete_all_btn.clicked.connect(self.delete_all_requested)
         controls_layout.addWidget(delete_all_btn)
-
         root.addWidget(controls)
 
         sep = QFrame()
@@ -919,11 +883,9 @@ def __init__(self, parent=None):
     def populate(self):
         self._all_sessions_cache = []
         self.session_list.clear()
-
         if not os.path.exists(_SESSIONS_DIR):
             self._empty_lbl.show()
             return
-
         for fname in os.listdir(_SESSIONS_DIR):
             if not fname.endswith('.json'):
                 continue
@@ -933,14 +895,12 @@ def populate(self):
                 self._all_sessions_cache.append(s)
             except Exception:
                 pass
-
         self._all_sessions_cache.sort(key=lambda s: s.get('updated_at', ''), reverse=True)
         self._apply_filter()
 
     def _apply_filter(self):
         self.session_list.clear()
         query = self.search_input.text().strip().lower()
-
         filtered = []
         for s in self._all_sessions_cache:
             title = s.get('title', 'Chat')
@@ -950,13 +910,10 @@ def _apply_filter(self):
             haystack = f"{title} {preview} {kind}".lower()
             if not query or query in haystack:
                 filtered.append(s)
-
         if not filtered:
             self._empty_lbl.show()
             return
-
         self._empty_lbl.hide()
-
         for s in filtered:
             sid = s['id']
             title = s.get('title', 'Chat')
@@ -965,37 +922,25 @@ def _apply_filter(self):
             msg_count = sum(1 for m in msgs if m.startswith("User:"))
             preview = next((m[5:].strip() for m in msgs if m.startswith("User:")), "")
             kind = s.get('kind', 'text')
-
             item = QListWidgetItem()
             item.setData(Qt.UserRole, sid)
             widget = _SessionItemWidget(sid, title, date, msg_count, preview, kind, self.session_list)
             widget.delete_requested.connect(self._delete_session)
             widget.rename_requested.connect(self.rename_requested)
-
             item.setSizeHint(widget.sizeHint())
             self.session_list.addItem(item)
             self.session_list.setItemWidget(item, widget)
 
     def upsert_session(self, session: dict):
-        """
-        Insert or update a session entry in the sidebar immediately,
-        without reading from disk. Called as soon as the first bot reply
-        arrives so the chat appears in the sidebar right away instead of
-        waiting for the debounced disk save to complete.
-        """
         sid = session.get('id')
         if not sid:
             return
-
-        # Update existing entry in the cache if present, otherwise prepend it.
         for i, s in enumerate(self._all_sessions_cache):
             if s.get('id') == sid:
                 self._all_sessions_cache[i] = session
                 break
         else:
             self._all_sessions_cache.insert(0, session)
-
-        # Re-sort so the newest session stays at the top.
         self._all_sessions_cache.sort(
             key=lambda s: s.get('updated_at', ''), reverse=True
         )
@@ -1015,10 +960,13 @@ def _delete_session(self, session_id: str):
 # ── Main Chatbot GUI ──────────────────────────────────────────────────────────
 
 class ChatbotGUI(QWidget):
-    # Emitted from _suspend_worker's background callback to safely update
-    # the sidebar from the main thread after a background save completes.
     _background_session_saved = pyqtSignal(dict)
 
+    # Sentinel anchor markup. _find_anchor_cursor() searches for these anchor
+    # names to locate the typing/streaming bubble regardless of reflow position.
+    _TYPING_ANCHOR = '<a name="_typing_anchor_"></a>'
+    _STREAM_ANCHOR = '<a name="_stream_anchor_"></a>'
+
     def __init__(self):
         super().__init__()
         self.setWindowTitle("eSim AI Assistant")
@@ -1044,9 +992,14 @@ def __init__(self):
         self._current_session_kind = "text"
         self._session_title_override = None
         self._is_generating = False
-        self._images_store = {}   # key -> [base64_str, ...] for image replay
-        self._last_image_paths = []  # image paths from last vision send (for follow-ups)
-        # batched rather than firing synchronously after every bot response.
+        self._images_store = {}
+        self._last_image_paths = []
+
+        # Streaming state (per-message; cleared in display_response)
+        self._stream_buf = None
+        self._stream_ts = None
+        self._stream_idx = None
+
         self._save_pending = False
         self._save_debounce_timer = QTimer(self)
         self._save_debounce_timer.setSingleShot(True)
@@ -1086,8 +1039,7 @@ def __init__(self):
         self._sidebar.session_list.itemDoubleClicked.connect(self._open_session_viewer)
         self._sidebar.hide()
         root.addWidget(self._sidebar)
-        # Route background-thread session saves through a signal so the
-        # sidebar upsert always runs on the main thread (Qt requirement).
+
         self._background_session_saved.connect(self._sidebar_upsert_from_signal)
 
         chat_container = QWidget()
@@ -1240,7 +1192,6 @@ def __init__(self):
         )
         status_layout.addWidget(self.status_label)
         status_layout.addStretch()
-
         chat_layout.addLayout(status_layout)
 
         self._settings_panel = QWidget()
@@ -1253,7 +1204,6 @@ def __init__(self):
             }
         """)
         self._settings_btn.toggled.connect(lambda on: self._settings_panel.setVisible(on))
-
         sp_layout = QHBoxLayout(self._settings_panel)
         sp_layout.setContentsMargins(12, 8, 12, 8)
         sp_layout.setSpacing(16)
@@ -1262,7 +1212,6 @@ def __init__(self):
         self._temp_label = QLabel(f"Precision  {self._temperature:.2f}")
         self._temp_label.setStyleSheet("font-size:10px; color:#555;")
         temp_col.addWidget(self._temp_label)
-
         self._temp_slider = QSlider(Qt.Horizontal)
         self._temp_slider.setRange(1, 100)
         self._temp_slider.setValue(int(self._temperature * 100))
@@ -1275,7 +1224,6 @@ def __init__(self):
         self._tok_label = QLabel(f"Max tokens  {self._num_predict}")
         self._tok_label.setStyleSheet("font-size:10px; color:#555;")
         tok_col.addWidget(self._tok_label)
-
         self._tok_slider = QSlider(Qt.Horizontal)
         self._tok_slider.setRange(1, 40)
         self._tok_slider.setValue(self._num_predict // 128)
@@ -1285,7 +1233,6 @@ def __init__(self):
         sp_layout.addLayout(tok_col)
 
         sp_layout.addStretch()
-
         reset_btn = QPushButton("Reset")
         reset_btn.setFixedHeight(26)
         reset_btn.setStyleSheet("""
@@ -1392,7 +1339,6 @@ def __init__(self):
         self._staging_area = QWidget()
         self._staging_area.setStyleSheet("QWidget { background:#f5f8ff; border-radius:10px; }")
         self._staging_area.setVisible(False)
-
         staging_outer = QVBoxLayout(self._staging_area)
         staging_outer.setContentsMargins(6, 6, 6, 4)
         staging_outer.setSpacing(4)
@@ -1402,7 +1348,6 @@ def __init__(self):
         staged_lbl.setStyleSheet("font-size:11px;color:#555;")
         staging_header.addWidget(staged_lbl)
         staging_header.addStretch()
-
         clear_all_btn = QPushButton("Remove all")
         clear_all_btn.setFixedHeight(20)
         clear_all_btn.setStyleSheet("""
@@ -1422,13 +1367,11 @@ def __init__(self):
         scroll.setVerticalScrollBarPolicy(Qt.ScrollBarAlwaysOff)
         scroll.setWidgetResizable(True)
         scroll.setStyleSheet("QScrollArea { border:none; background:transparent; }")
-
         self._thumb_container = QWidget()
         self._thumb_row = QHBoxLayout(self._thumb_container)
         self._thumb_row.setContentsMargins(0, 0, 0, 0)
         self._thumb_row.setSpacing(6)
         self._thumb_row.addStretch()
-
         scroll.setWidget(self._thumb_container)
         staging_outer.addWidget(scroll)
         chat_layout.addWidget(self._staging_area)
@@ -1436,6 +1379,98 @@ def __init__(self):
         self.move_to_bottom_right()
         self._load_history()
 
+    # ── Streaming helpers ─────────────────────────────────────────────
+
+    def _start_worker(self, worker):
+        """Adopt `worker`, wire its signals to the UI slots, then run it."""
+        self.worker = worker
+        worker.response_signal.connect(self.display_response)
+        worker.status_signal.connect(self._on_status_update)
+        if hasattr(worker, "chunk_signal"):  # chunk_signal is treated as optional
+            worker.chunk_signal.connect(self._on_stream_chunk)
+        worker.start()
+
+    def _find_anchor_cursor(self, anchor_name: str):
+        """Return a cursor at the first fragment named `anchor_name`, or None."""
+        document = self.chat_display.document()
+        blk = document.begin()
+        while blk.isValid():
+            frag_iter = blk.begin()
+            while not frag_iter.atEnd():
+                fragment = frag_iter.fragment()
+                found = False
+                if fragment.isValid():
+                    char_fmt = fragment.charFormat()
+                    try:
+                        found = anchor_name in (char_fmt.anchorNames() or [])
+                    except AttributeError:
+                        # No anchorNames(): try the singular anchorName().
+                        try:
+                            found = char_fmt.anchorName() == anchor_name
+                        except AttributeError:
+                            found = False
+                if found:
+                    hit = QTextCursor(document)
+                    hit.setPosition(fragment.position())
+                    return hit
+                frag_iter += 1
+            blk = blk.next()
+        return None
+
+    def _find_typing_anchor_cursor(self):  # cursor at the "_typing_anchor_" fragment, or None
+        return self._find_anchor_cursor("_typing_anchor_")
+
+    def _find_stream_anchor_cursor(self):  # cursor at the "_stream_anchor_" fragment, or None
+        return self._find_anchor_cursor("_stream_anchor_")
+
+    def _begin_streaming_bubble(self):
+        """Swap the typing indicator for an anchored placeholder bot bubble."""
+        self._remove_typing_bubble()
+        self._stream_buf = ""
+        self._stream_ts = _get_time()
+        self._stream_idx = self._response_counter
+        # The anchor precedes the bubble so later chunks can locate it and
+        # rewrite everything after it in place.
+        placeholder = _bot_bubble("…", self._stream_ts, self._stream_idx)
+        tail = QTextCursor(self.chat_display.document())
+        tail.movePosition(QTextCursor.End)
+        tail.insertHtml(self._STREAM_ANCHOR + placeholder)
+        self._scroll_to_bottom()
+
+    def _on_stream_chunk(self, piece: str):
+        """Append a streamed chunk and re-render the in-progress bot bubble."""
+        if not piece:
+            return  # empty chunk: nothing new to render
+        if self._stream_buf is None:
+            self._begin_streaming_bubble()
+        self._stream_buf += piece
+        anchor_cursor = self._find_stream_anchor_cursor()
+        if anchor_cursor is None:
+            # Sentinel went missing — re-open the bubble with the buffer so far.
+            buf = self._stream_buf
+            self._reset_stream_state()
+            self._begin_streaming_bubble()
+            self._stream_buf = buf
+            anchor_cursor = self._find_stream_anchor_cursor()
+            if anchor_cursor is None:
+                return
+        # Decide whether to keep following the reply BEFORE rewriting: once the
+        # bubble has grown, maximum() may already be more than 60px away.
+        sb = self.chat_display.verticalScrollBar()
+        follow = sb.maximum() - sb.value() < 60
+        # Select from anchor to end of document and rewrite the bubble in place.
+        anchor_cursor.movePosition(QTextCursor.End, QTextCursor.KeepAnchor)
+        anchor_cursor.removeSelectedText()
+        bubble = _bot_bubble(self._stream_buf, self._stream_ts, self._stream_idx)
+        anchor_cursor.insertHtml(self._STREAM_ANCHOR + bubble)
+        if follow:
+            sb.setValue(sb.maximum())
+
+    def _reset_stream_state(self):
+        """Forget the in-progress streamed reply (buffer, timestamp, index)."""
+        # A None buffer is what makes the next chunk open a new bubble.
+        self._stream_buf = self._stream_ts = self._stream_idx = None
+
     # ── Drag & drop ───────────────────────────────────────────────────
 
     def dragEnterEvent(self, event: QDragEnterEvent):
@@ -1452,7 +1487,6 @@ def dropEvent(self, event: QDropEvent):
         if not mime.hasUrls():
             event.ignore()
             return
-
         added = 0
         for url in mime.urls():
             if not url.isLocalFile():
@@ -1461,12 +1495,10 @@ def dropEvent(self, event: QDropEvent):
             if _is_image_file(path) and path not in self._staged_images:
                 self._staged_images.append(path)
                 added += 1
-
         if added:
             self._refresh_staging_strip()
             self.status_label.setText(f"📎 Added {added} image{'s' if added != 1 else ''} by drag-and-drop.")
             QTimer.singleShot(2500, lambda: self.status_label.setText(""))
-
         event.acceptProposedAction()
 
     # ── Sidebar / sessions ────────────────────────────────────────────
@@ -1486,7 +1518,6 @@ def _delete_all_chats(self):
         dlg = _DeleteConfirmDialog("all chats", self)
         if dlg.exec_() != QDialog.Accepted:
             return
-
         try:
             if os.path.exists(_SESSIONS_DIR):
                 for fname in os.listdir(_SESSIONS_DIR):
@@ -1494,7 +1525,6 @@ def _delete_all_chats(self):
                         os.remove(os.path.join(_SESSIONS_DIR, fname))
         except Exception:
             pass
-
         self._sidebar.populate()
 
     def _open_session_viewer(self, item):
@@ -1528,7 +1558,6 @@ def _rename_session_by_id(self, session_id: str):
                 session = json.load(f)
         except Exception:
             return
-
         current_title = session.get("title", "Chat")
         title, ok = QInputDialog.getText(
             self, "Rename Chat", "New chat title:", text=current_title
@@ -1538,17 +1567,14 @@ def _rename_session_by_id(self, session_id: str):
         title = title.strip()
         if not title:
             return
-
         try:
             session["title"] = title
             with open(path, "w", encoding="utf-8") as f:
                 json.dump(session, f, ensure_ascii=False, indent=2)
         except Exception:
             return
-
         if session_id == self._current_session_id:
             self._session_title_override = title
-
         self._sidebar.populate()
 
     def _derive_session_title(self):
@@ -1563,7 +1589,6 @@ def _rebuild_chat_html_from_history(self):
         self.chat_display.setHtml(WELCOME_MESSAGE)
         self._bot_responses = {}
         self._response_counter = 0
-
         for line in self.chat_history:
             if line.startswith("User:"):
                 self.chat_display.append(_user_bubble(line[5:].strip(), ""))
@@ -1577,16 +1602,9 @@ def _rebuild_chat_html_from_history(self):
 
     def _on_session_clicked(self, item):
         session_id = item.data(Qt.UserRole)
-
-        # If this is the session already showing, do nothing.
         if (session_id == self._current_session_id
                 and not self._viewing_past_session):
             return
-
-        # Suspend BEFORE changing self._current_session_id so the worker
-        # snapshot captures the correct (old) session ID and history.
-        # Then flush the current session to disk so the file exists for
-        # _on_background_response to update when the worker finishes.
         if self._is_generating:
             self._suspend_worker(
                 session_id=self._current_session_id,
@@ -1594,13 +1612,10 @@ def _on_session_clicked(self, item):
                 session_kind=self._current_session_kind,
                 images_store=self._images_store,
             )
-
         self._save_debounce_timer.stop()
         self._save_pending = False
         self._save_current_session()
 
-        # Load the target session — try disk first, fall back to the
-        # in-memory sidebar cache (handles sessions not yet written to disk).
         path = os.path.join(_SESSIONS_DIR, f"{session_id}.json")
         session = None
         try:
@@ -1608,13 +1623,11 @@ def _on_session_clicked(self, item):
                 session = json.load(f)
         except Exception:
             pass
-
         if session is None:
             for s in self._sidebar._all_sessions_cache:
                 if s.get('id') == session_id:
                     session = s
                     break
-
         if session is None:
             return
 
@@ -1623,8 +1636,6 @@ def _on_session_clicked(self, item):
         created = session.get('created_at', '')
         kind    = session.get('kind', 'text')
 
-        # Switch the active session context to the one being viewed so that
-        # if the user types a follow-up, it goes to the right session.
         self._current_session_id      = session_id
         self._session_created_at      = created
         self._current_session_kind    = kind
@@ -1635,10 +1646,9 @@ def _on_session_clicked(self, item):
             (m[5:].strip() for m in reversed(msgs) if m.startswith("User:")), ""
         )
 
-        # Restore image store from session so follow-ups can re-send images
         saved_images = session.get("images", {})
         self._images_store = saved_images
-        self._last_image_paths = []  # original paths are gone; base64 stored instead
+        self._last_image_paths = []
 
         html = WELCOME_MESSAGE
         html += (
@@ -1658,7 +1668,6 @@ def _on_session_clicked(self, item):
         self._bot_responses = {}
         local_counter = 0
 
-        # Build a flat list of saved image thumbnails in order for replay
         all_saved_imgs = []
         for key in sorted(saved_images.keys()):
             all_saved_imgs.extend(saved_images[key])
@@ -1667,17 +1676,13 @@ def _on_session_clicked(self, item):
         for line in msgs:
             if line.startswith("User:"):
                 text = line[5:].strip()
-                # If this line is an image-analysis request, show the thumbnail
                 if text.startswith("[Image analysis request:"):
-                    # Show saved thumbnails for this entry
                     while img_replay_idx < len(all_saved_imgs):
                         fname, b64 = all_saved_imgs[img_replay_idx]
                         html += _image_thumbnail_html(b64, fname)
                         img_replay_idx += 1
-                        # Only consume images for this request
                         if img_replay_idx >= len(all_saved_imgs):
                             break
-                    # Also show any user text after the image tag
                     user_text_part = text.split("\n", 1)[-1].strip()
                     if user_text_part and not user_text_part.startswith("[Image"):
                         html += _user_bubble(user_text_part, "")
@@ -1688,16 +1693,13 @@ def _on_session_clicked(self, item):
                 self._bot_responses[local_counter] = text
                 html += _bot_bubble(text, "", local_counter)
                 local_counter += 1
-        self._response_counter = local_counter
 
+        self._response_counter = local_counter
         self.chat_display.setHtml(html)
         QTimer.singleShot(120, lambda: self.chat_display.verticalScrollBar().setValue(
             self.chat_display.verticalScrollBar().maximum()
         ))
 
-        # Load the session's messages into chat_history so follow-up questions
-        # have full context, and update the session ID so any new messages save
-        # to the correct file rather than the previous live session.
         self.chat_history = list(msgs)
         self._retry_history = list(msgs)
         self._current_session_id = session_id
@@ -1710,61 +1712,49 @@ def _on_session_clicked(self, item):
         self._viewing_past_session = True
 
     def _abort_worker(self):
         """
         Stop the active worker immediately and discard its response.
         Use _suspend_worker() instead when switching sessions so the
         generation can finish silently in the background.
         """
         if hasattr(self, 'worker') and self.worker.isRunning():
             self.worker.stop()
             try:
                 self.worker.response_signal.disconnect()
                 self.worker.status_signal.disconnect()
+                if hasattr(self.worker, "chunk_signal"):
+                    self.worker.chunk_signal.disconnect()
             except Exception:
                 pass
             self.worker.wait(300)
         self._stop_thinking()
+        # Forget any half-streamed reply so the next chunk opens a fresh bubble.
+        self._reset_stream_state()
 
     def _sidebar_upsert_from_signal(self, session: dict):
         """Slot — always called on the main thread via _background_session_saved."""
         self._sidebar.upsert_session(session)
 
     def _suspend_worker(self, session_id: str, history: list,
                         session_kind: str, images_store: dict):
-        """
-        Detach the running worker from the UI and let it finish in the
-        background. When it completes, the bot reply is appended to the
-        session file on disk so the user sees the full conversation the
-        next time they open that chat from the sidebar.
-        """
         if not (hasattr(self, 'worker') and self.worker.isRunning()):
             self._stop_thinking()
             return
-
-        # Snapshot everything the callback needs before self.* moves on.
         _sid     = session_id
         _history = list(history)
         _kind    = session_kind
         _images  = dict(images_store)
         _worker  = self.worker
-        _signal  = self._background_session_saved   # Qt signal, safe to emit from thread
+        _signal  = self._background_session_saved
 
         def _on_background_response(bot_response: str):
-            """
-            Called from the worker thread when generation finishes.
-            Saves the response to disk, then emits a signal so the sidebar
-            update happens on the main thread (direct QWidget calls from
-            worker threads cause crashes on some platforms).
-            """
             try:
                 _history.append(f"Bot: {bot_response}")
                 path = os.path.join(_SESSIONS_DIR, f"{_sid}.json")
-
                 if os.path.exists(path):
                     with open(path, encoding="utf-8") as fp:
                         session = json.load(fp)
                 else:
-                    # Session file doesn't exist yet — build it from the snapshot.
                     session = {
                         "id":         _sid,
                         "title":      next(
@@ -1775,16 +1758,12 @@ def _on_background_response(bot_response: str):
                         "kind":       _kind,
                         "images":     _images,
                     }
-
                 session["messages"]   = _history[-40:]
                 session["updated_at"] = datetime.now().strftime("%Y-%m-%d %H:%M")
                 session["kind"]       = _kind
-
                 os.makedirs(_SESSIONS_DIR, exist_ok=True)
                 with open(path, "w", encoding="utf-8") as fp:
                     json.dump(session, fp, ensure_ascii=False, indent=2)
-
-                # Emit signal — the connected slot runs on the main thread.
                 _signal.emit(session)
             except Exception:
                 pass
@@ -1792,20 +1771,18 @@ def _on_background_response(bot_response: str):
         try:
             _worker.response_signal.disconnect()
             _worker.status_signal.disconnect()
+            if hasattr(_worker, "chunk_signal"):
+                _worker.chunk_signal.disconnect()
         except Exception:
             pass
-
         _worker.response_signal.connect(_on_background_response)
         self._stop_thinking()
+        self._reset_stream_state()
 
     def _new_chat(self):
-        # Stop the debounce timer and flush the current session to disk NOW,
-        # before anything is reset, so the file is written under the correct ID.
         self._save_debounce_timer.stop()
         self._save_pending = False
         if self._is_generating:
-            # Generation is running — detach it so it finishes silently and
-            # saves its reply into the current session file when it completes.
             self._suspend_worker(
                 session_id=self._current_session_id,
                 history=self.chat_history,
@@ -1813,14 +1790,8 @@ def _new_chat(self):
                 images_store=self._images_store,
             )
         else:
-            # Save current session synchronously so it lands on disk before
-            # we move on.  _save_current_session() is a no-op if chat_history
-            # is empty, so clicking New Chat on a blank window is safe.
             self._save_current_session()
 
-        # Reset UI and state for the new blank session WITHOUT calling
-        # clear_session() — that method deletes the session file, which
-        # would erase the chat we just saved above.
         self.chat_display.setHtml(WELCOME_MESSAGE)
         self.chat_history = []
         self._retry_history = []
@@ -1835,27 +1806,19 @@ def _new_chat(self):
         self._session_title_override = None
         self._current_session_id = str(uuid.uuid4())
         self._session_created_at = datetime.now().strftime("%Y-%m-%d %H:%M")
-
+        self._reset_stream_state()
         try:
             if os.path.exists(_HISTORY_FILE):
                 os.remove(_HISTORY_FILE)
         except Exception:
             pass
-
-        self._sidebar.populate()
-        self._current_session_kind = "text"
-        self._session_title_override = None
         self._sidebar.populate()
 
     def _on_session_deleted(self, deleted_id: str):
         if deleted_id == self._current_session_id or self._viewing_past_session:
             self._abort_worker()
-
-            # Cancel any pending debounced save so the deleted session
-            # file cannot be re-created by a timer that was already running.
             self._save_debounce_timer.stop()
             self._save_pending = False
-
             self._current_session_id = str(uuid.uuid4())
             self._session_created_at = datetime.now().strftime("%Y-%m-%d %H:%M")
             self._current_session_kind = "text"
@@ -1882,7 +1845,6 @@ def _export_current_chat(self):
             self.status_label.setText("Nothing to export.")
             QTimer.singleShot(2500, lambda: self.status_label.setText(""))
             return
-
         path, _ = QFileDialog.getSaveFileName(
             self,
             "Export Chat",
@@ -1891,7 +1853,6 @@ def _export_current_chat(self):
         )
         if not path:
             return
-
         try:
             with open(path, "w", encoding="utf-8") as f:
                 for line in self.chat_history:
@@ -1935,65 +1896,10 @@ def _on_status_result(self, running: bool):
 
     # ── Typing bubble ─────────────────────────────────────────────────
 
-    # ── Typing bubble (window-switch safe) ──────────────────────────
-    #
-    # (_typing_start_pos) and used it to select-and-replace the animated
-    # dots on every timer tick.  When the user switches away from the
-    # chatbot window Qt reflows the QTextBrowser's HTML document, which
-    # shifts character positions.  On the next timer tick the cursor
-    # landed in the wrong place and deleted real chat content.
-    #
-    # New approach: insert a sentinel <a> anchor tag with a unique id
-    # ("_typing_anchor_") right before the bubble HTML.  To update or
-    # remove the bubble we search the document for that anchor using
-    # QTextDocument.find() — which is position-independent and survives
-    # any reflow — then select from the match to the end of the document.
-    # The sentinel itself is a zero-width invisible link so it never
-    # appears in the rendered output.
-
-    _TYPING_ANCHOR = '<a name="_typing_anchor_"></a>'
-
-    def _find_typing_anchor_cursor(self):
-        """Return a cursor positioned at the typing-bubble sentinel,
-        or None if the sentinel is not in the document.
-
-        PyQt5 exposes anchor names via QTextCharFormat.anchorNames()
-        (returns a list) not .anchorName() -- we handle both spellings
-        defensively so the code works across PyQt5 versions.
-        """
-        doc = self.chat_display.document()
-        block = doc.begin()
-        while block.isValid():
-            it = block.begin()
-            while not it.atEnd():
-                frag = it.fragment()
-                if frag.isValid():
-                    fmt = frag.charFormat()
-                    # PyQt5 uses anchorNames() -> list[str]
-                    # Some builds also have anchorName() -> str
-                    # We try both so it works regardless of version.
-                    try:
-                        names = fmt.anchorNames()  # PyQt5 standard
-                        matched = "_typing_anchor_" in (names or [])
-                    except AttributeError:
-                        try:
-                            matched = fmt.anchorName() == "_typing_anchor_"
-                        except AttributeError:
-                            matched = False
-                    if matched:
-                        cursor = QTextCursor(doc)
-                        cursor.setPosition(frag.position())
-                        return cursor
-                it += 1
-            block = block.next()
-        return None
-
     def _show_typing_bubble(self):
         self._typing_frame = 0
         cursor = QTextCursor(self.chat_display.document())
         cursor.movePosition(QTextCursor.End)
-        # Insert sentinel anchor + bubble in one operation so they form
-        # a contiguous block that can be fully removed later.
         cursor.insertHtml(self._TYPING_ANCHOR + _typing_bubble(0))
         self._scroll_to_bottom()
         self._typing_anim_timer.start(400)
@@ -2002,16 +1908,10 @@ def _animate_typing_bubble(self):
         self._typing_frame = (self._typing_frame + 1) % 3
         anchor_cursor = self._find_typing_anchor_cursor()
         if anchor_cursor is None:
-            # Sentinel gone — stop the timer defensively
             self._typing_anim_timer.stop()
             return
-        # Select from the sentinel to the end of the document and replace.
-        # This is immune to any reflow that happened while the window was
-        # in the background because we locate by anchor name, not position.
         anchor_cursor.movePosition(QTextCursor.End, QTextCursor.KeepAnchor)
         anchor_cursor.insertHtml(self._TYPING_ANCHOR + _typing_bubble(self._typing_frame))
-        # Only auto-scroll if the user is already near the bottom so we
-        # don't hijack their scroll position while they read earlier msgs.
         sb = self.chat_display.verticalScrollBar()
         if sb.maximum() - sb.value() < 60:
             self._scroll_to_bottom()
@@ -2022,14 +1922,12 @@ def _remove_typing_bubble(self):
         if anchor_cursor is not None:
             anchor_cursor.movePosition(QTextCursor.End, QTextCursor.KeepAnchor)
             anchor_cursor.removeSelectedText()
-        # Legacy guard: if somehow _typing_start_pos path left stale state
         self._typing_start_pos = -1
 
     # ── Links ────────────────────────────────────────────────────────
 
     def _handle_link_click(self, url):
         scheme, parts = _parse_custom_url(url)
-
         if scheme == 'copy':
             if not parts:
                 return
@@ -2041,7 +1939,6 @@ def _handle_link_click(self, url):
             if text:
                 QApplication.clipboard().setText(text)
                 self._show_copy_toast()
-
         elif scheme == 'retry':
             if not parts:
                 return
@@ -2050,7 +1947,6 @@ def _handle_link_click(self, url):
             except ValueError:
                 return
             self._retry_response(idx)
-
         elif scheme == 'clear':
             self.clear_session()
 
@@ -2085,15 +1981,12 @@ def _refresh_staging_strip(self):
             item = self._thumb_row.takeAt(0)
             if item.widget():
                 item.widget().deleteLater()
-
         for path in self._staged_images:
             self._thumb_row.insertWidget(self._thumb_row.count() - 1, self._make_thumbnail(path))
-
         self._staging_area.setVisible(bool(self._staged_images))
 
     def _make_thumbnail(self, image_path: str) -> QWidget:
         from PyQt5.QtGui import QPixmap
-
         card = QWidget()
         card.setFixedSize(80, 64)
         card.setStyleSheet("""
@@ -2154,23 +2047,10 @@ def _clear_staged_images(self):
         self._refresh_staging_strip()
 
     def _warn_or_switch_to_vision_model(self) -> bool:
-        """
-        Ensure a vision-capable model is selected before sending images.
-
-        Returns True if it is safe to proceed (a vision model is active),
-        or False if no vision model is installed and the request should be
-        blocked. Sending images to a text-only model causes it to fabricate
-        completely wrong answers because it cannot actually see the image.
-        """
         current = self.model_combo.currentText()
         vision_keywords = ["llava", "bakllava", "vision", "moondream", "qwen2-vl", "minicpm-v"]
-
-        # Already on a vision model — good to go.
         if any(k in current.lower() for k in vision_keywords):
             return True
-
-        # Try to auto-switch to any vision model the user has installed.
-        preferred_order = ["moondream", "llava:7b", "llava", "bakllava", "llava:13b"]
         for i in range(self.model_combo.count()):
             name = self.model_combo.itemText(i)
             if any(k in name.lower() for k in vision_keywords):
@@ -2180,8 +2060,6 @@ def _warn_or_switch_to_vision_model(self) -> bool:
                 ))
                 self._scroll_to_bottom()
                 return True
-
-        # No vision model found — block the request and explain clearly.
         self.chat_display.append(_system_bubble(
             "⚠️ No vision model installed. Image analysis is not possible with the "
             "current model — a text-only model cannot see images and will give "
@@ -2193,7 +2071,7 @@ def _warn_or_switch_to_vision_model(self) -> bool:
         self._scroll_to_bottom()
         return False
 
-    # ── Mic ──────────────────────────────────────────────────────────
+    # ── Settings ─────────────────────────────────────────────────────
 
     def _on_temp_changed(self, value: int):
         self._temperature = round(value / 100, 2)
@@ -2209,6 +2087,8 @@ def _reset_settings(self):
         self._temp_slider.setValue(35)
         self._tok_slider.setValue(8)
 
+    # ── Mic ──────────────────────────────────────────────────────────
+
     def _update_mic_tooltip(self):
         backend = get_stt_backend()
         tips = {
@@ -2258,14 +2138,11 @@ def analyse_netlist(self, netlist_path: str):
                 f'❌ Netlist file not found: {_escape_text_preserve_breaks(netlist_path)}</td></tr></table>'
             )
             return
-
         self._current_session_kind = "netlist"
-
         ts = _get_time()
         filename = os.path.basename(netlist_path)
         self.chat_display.append(_netlist_header_bubble(filename, ts))
         self._scroll_to_bottom()
-
         try:
             with open(netlist_path, 'r', errors='replace') as f:
                 raw_lines = f.readlines()
@@ -2275,7 +2152,6 @@ def analyse_netlist(self, netlist_path: str):
                 f'❌ Could not read file: {_escape_text_preserve_breaks(str(e))}</td></tr></table>'
             )
             return
-
         components, nodes, directives = [], set(), []
         for line in raw_lines:
             s = line.strip()
@@ -2289,7 +2165,6 @@ def analyse_netlist(self, netlist_path: str):
                     nodes.update([parts[1], parts[2]])
             elif first == '.':
                 directives.append(s)
-
         summary = (
             f"Netlist file: {filename}\n"
             f"Total lines: {len(raw_lines)}\n"
@@ -2300,7 +2175,6 @@ def analyse_netlist(self, netlist_path: str):
             f"Full netlist:\n{''.join(raw_lines[:80])}"
             f"{'[truncated]' if len(raw_lines) > 80 else ''}"
         )
-
         prompt = (
             f"Analyse this NgSpice netlist for me.\n\n{summary}\n\n"
             "Please: (1) identify all components and their roles, "
@@ -2308,21 +2182,16 @@ def analyse_netlist(self, netlist_path: str):
             "(3) highlight any potential simulation issues, "
             "(4) suggest any improvements."
         )
-
         self.chat_history = (self.chat_history + [f"User: {prompt}"])[-20:]
         self._retry_history = list(self.chat_history)
         self._last_user_text = prompt
         self._start_thinking()
-
-        self.worker = OllamaWorker(
+        self._start_worker(OllamaWorker(
             self.chat_history,
             model=self.model_combo.currentText(),
             temperature=self._temperature,
             num_predict=self._num_predict,
-        )
-        self.worker.response_signal.connect(self.display_response)
-        self.worker.status_signal.connect(self._on_status_update)
-        self.worker.start()
+        ))
 
     # ── Topic switch ─────────────────────────────────────────────────
 
@@ -2332,26 +2201,17 @@ def _check_topic_switch(self, new_text: str) -> bool:
             self.chat_history = self.chat_history[-2:]
             self.chat_display.append(_topic_reset_banner())
             self._scroll_to_bottom()
-            # Clear image follow-up context when topic changes
             self._last_image_paths = []
         return switched
 
     # ── Persistence ──────────────────────────────────────────────────
 
     def _save_history(self):
-        """
-        Schedules a debounced disk write so saves are batched rather than
-        firing synchronously after every message, preventing UI freezes.
-        """
         self._save_pending = True
-        # Restart the timer so the window slides forward from the last change.
-        # If the user sends multiple messages quickly, only the final state is
-        # written, avoiding redundant I/O.
         if not self._save_debounce_timer.isActive():
             self._save_debounce_timer.start(_SAVE_DEBOUNCE_MS)
 
     def _flush_save(self):
-        """Perform the actual disk write when the debounce timer fires."""
         if not self._save_pending:
             return
         self._save_pending = False
@@ -2385,18 +2245,11 @@ def _save_current_session(self):
             path = os.path.join(_SESSIONS_DIR, f"{self._current_session_id}.json")
             with open(path, 'w', encoding='utf-8') as f:
                 json.dump(session, f, ensure_ascii=False, indent=2)
-            # Keep the sidebar in-memory cache in sync so the chat appears
-            # immediately without requiring a full populate() from disk.
             self._sidebar.upsert_session(session)
         except Exception:
             pass
 
     def _load_history(self):
-        """
-        On startup: if a leftover history file exists, archive it into the
-        sidebar sessions directory so the user can access it from the sidebar,
-        then delete the file.  The chat window always opens fresh.
-        """
         if not os.path.exists(_HISTORY_FILE):
             return
         try:
@@ -2432,7 +2285,6 @@ def _load_history(self):
                 os.remove(_HISTORY_FILE)
             except Exception:
                 pass
-        # New session ID so nothing from the old chat bleeds into the new one
         self._current_session_id  = str(uuid.uuid4())
         self._session_created_at  = datetime.now().strftime("%Y-%m-%d %H:%M")
 
@@ -2450,7 +2302,6 @@ def _on_models_fetched(self, model_names: list):
         self.model_combo.clear()
         for name in model_names:
             self.model_combo.addItem(name)
-
         preferred_order = [
             'qwen2.5-coder:3b',
             'llava:13b',
@@ -2466,7 +2317,6 @@ def _on_models_fetched(self, model_names: list):
                 break
         if chosen_idx >= 0:
             self.model_combo.setCurrentIndex(chosen_idx)
-
         self.model_combo.setEnabled(True)
 
     # ── Thinking / retry / regenerate ────────────────────────────────
@@ -2483,6 +2333,7 @@ def _start_thinking(self):
         self.send_button.hide()
         self.stop_button.show()
         self.clear_button.setEnabled(False)
+        self._reset_stream_state()
         self._show_typing_bubble()
 
     def _stop_thinking(self):
@@ -2507,17 +2358,8 @@ def _stop_generating(self):
             self.worker.stop()
 
     def _retry_response(self, response_idx: int):
-        """
-        Retry the bot response at response_idx.
-        Trims chat_history back to just before that response,
-        rebuilds the UI cleanly, then re-fires the worker so the
-        new answer replaces the old one with no duplicate bubbles.
-        """
         if self._is_generating:
             return
-
-        # Walk chat_history counting Bot: entries to find the target,
-        # then slice everything from that point forward off.
         bot_count = 0
         trim_to = None
         for i, line in enumerate(self.chat_history):
@@ -2526,88 +2368,65 @@ def _retry_response(self, response_idx: int):
                     trim_to = i
                     break
                 bot_count += 1
-
         if trim_to is None:
-            # Fallback: trim the last bot entry
             for i in range(len(self.chat_history) - 1, -1, -1):
                 if self.chat_history[i].startswith("Bot:"):
                     trim_to = i
                     break
-
         if trim_to is None or not any(
             l.startswith("User:") for l in self.chat_history[:trim_to]
         ):
             self.status_label.setText("Nothing to retry.")
             QTimer.singleShot(2000, lambda: self.status_label.setText(""))
             return
-
-        # Trim history then rebuild UI so the stale bubble is gone
-        # before the new response is appended.
         self.chat_history = self.chat_history[:trim_to]
         self._retry_history = list(self.chat_history)
         self._rebuild_chat_html_from_history()
         self._start_thinking()
-
-        # Re-use vision worker if the last user turn included images.
         last_user = next(
             (l for l in reversed(self.chat_history) if l.startswith("User:")), ""
         )
         followup_paths = [p for p in self._last_image_paths if os.path.exists(p)]
         if followup_paths and "[Image analysis request:" in last_user:
             prompt = last_user.split("\n", 1)[-1].strip() if "\n" in last_user else ""
-            self.worker = OllamaVisionWorker(
+            self._start_worker(OllamaVisionWorker(
                 image_paths=followup_paths,
                 extra_prompt=prompt,
                 model=self.model_combo.currentText(),
-            )
+            ))
         else:
-            self.worker = OllamaWorker(
+            self._start_worker(OllamaWorker(
                 self._retry_history,
                 model=self.model_combo.currentText(),
                 temperature=self._temperature,
                 num_predict=self._num_predict,
-            )
-        self.worker.response_signal.connect(self.display_response)
-        self.worker.status_signal.connect(self._on_status_update)
-        self.worker.start()
+            ))
 
     def _retry_last(self):
-        """Legacy shim kept so any external callers don't break."""
         if self.chat_history:
             self._retry_response(self._response_counter - 1)
 
     def _regenerate_last_response(self):
         if not self.chat_history:
             return
-
-        # Remove trailing bot response if present
         if self.chat_history and self.chat_history[-1].startswith("Bot:"):
             self.chat_history.pop()
-
-        # Find last user prompt
         if not self.chat_history or not self.chat_history[-1].startswith("User:"):
             self.status_label.setText("No previous user prompt to regenerate.")
             QTimer.singleShot(2500, lambda: self.status_label.setText(""))
             return
-
-        # Rebuild UI from trimmed history and retry from same state
         self._retry_history = list(self.chat_history)
         self._rebuild_chat_html_from_history()
         self._start_thinking()
-
-        self.worker = OllamaWorker(
+        self._start_worker(OllamaWorker(
             self._retry_history,
             model=self.model_combo.currentText(),
             temperature=self._temperature,
             num_predict=self._num_predict,
-        )
-        self.worker.response_signal.connect(self.display_response)
-        self.worker.status_signal.connect(self._on_status_update)
-        self.worker.start()
+        ))
 
     def _on_status_update(self, msg: str):
         self.status_label.setText(msg)
-        # Only show as chat bubble for major state changes, not every progress tick
         if "Starting Ollama" in msg or "Ollama started" in msg:
             self.chat_display.append(_system_bubble(msg))
             self._scroll_to_bottom()
@@ -2617,16 +2436,11 @@ def _on_status_update(self, msg: str):
     def ask_ollama(self):
         user_text = self.user_input.text().strip()
         staged_paths = list(self._staged_images)
-
         if not user_text and not staged_paths:
             return
-
         if self._is_generating:
             return
-
         if self._viewing_past_session:
-            # chat_history was already synced when the session was loaded,
-            # so no rebuild is needed — just clear the read-only flag.
             self._viewing_past_session = False
 
         ts = _get_time()
@@ -2634,22 +2448,13 @@ def ask_ollama(self):
         if staged_paths:
             self._current_session_kind = "image"
             if not self._warn_or_switch_to_vision_model():
-                # No vision model available — clear staged images and abort.
                 self._clear_staged_images()
                 return
-
             fnames = [os.path.basename(p) for p in staged_paths]
-
             if user_text:
                 self.user_input.add_to_history(user_text)
             self.user_input.clear()
-
-            # Pass the user's text directly to the vision worker.
-            # chatbot_thread._build_schematic_vision_prompt() handles both
-            # cases: if user_text is empty it requests a general analysis;
-            # if it contains a question that question drives the response.
             vision_extra_prompt = user_text
-
             if user_text:
                 user_history_text = (
                     f"[Image analysis request: {', '.join(fnames)}]\n{user_text}"
@@ -2658,20 +2463,16 @@ def ask_ollama(self):
                 user_history_text = (
                     f"[Image analysis request: {', '.join(fnames)}]"
                 )
-
             self.chat_history = (self.chat_history + [f"User: {user_history_text}"])[-20:]
             self._retry_history = list(self.chat_history)
             self._last_user_text = user_text if user_text else "image analysis"
 
-            # Read and encode images before displaying so thumbnails appear
-            # in the chat bubble immediately when the user sends.
             img_key = ts + "_" + self._current_session_id
             b64_list = []
             for p in staged_paths:
                 try:
                     with open(p, "rb") as f_img:
                         raw = f_img.read()
-                    # Downscale for storage (reuse PIL if available)
                     try:
                         from PIL import Image as _PI
                         import io as _io2
@@ -2689,33 +2490,22 @@ def ask_ollama(self):
                     pass
             if b64_list:
                 self._images_store[img_key] = b64_list
-
-            # Show image thumbnails inline so the user can see what was sent.
             if b64_list:
                 for fname, b64 in b64_list:
                     self.chat_display.append(_image_thumbnail_html(b64, fname))
             else:
-                # Fallback to filename badges if encoding failed for all images
                 self.chat_display.append(_staged_images_bubble(fnames, ts))
-
             if user_text:
                 self.chat_display.append(_user_bubble(user_text, ts))
             self._scroll_to_bottom()
-
-            # Keep paths for follow-up context
             self._last_image_paths = list(staged_paths)
-
             self._clear_staged_images()
             self._start_thinking()
-
-            self.worker = OllamaVisionWorker(
+            self._start_worker(OllamaVisionWorker(
                 image_paths=staged_paths,
                 extra_prompt=vision_extra_prompt,
                 model=self.model_combo.currentText(),
-            )
-            self.worker.response_signal.connect(self.display_response)
-            self.worker.status_signal.connect(self._on_status_update)
-            self.worker.start()
+            ))
             return
 
         self._current_session_kind = "text"
@@ -2723,60 +2513,69 @@ def ask_ollama(self):
         self.chat_history = (self.chat_history + [f"User: {user_text}"])[-20:]
         self.chat_display.append(_user_bubble(user_text, ts))
         self._scroll_to_bottom()
-
         self.user_input.add_to_history(user_text)
         self.user_input.clear()
         self._last_user_text = user_text
         self._retry_history = list(self.chat_history)
         self._start_thinking()
 
-        # If the user is following up on an image session, re-send the last
-        # images so the model has visual context for its answer.
         followup_image_paths = [
             p for p in self._last_image_paths if os.path.exists(p)
         ]
         if followup_image_paths and self._current_session_kind in ("image", "text"):
-            self.worker = OllamaVisionWorker(
+            self._start_worker(OllamaVisionWorker(
                 image_paths=followup_image_paths,
                 extra_prompt=user_text,
                 model=self.model_combo.currentText(),
-            )
+            ))
         else:
-            self.worker = OllamaWorker(
+            self._start_worker(OllamaWorker(
                 self.chat_history,
                 model=self.model_combo.currentText(),
                 temperature=self._temperature,
                 num_predict=self._num_predict,
-            )
-        self.worker.response_signal.connect(self.display_response)
-        self.worker.status_signal.connect(self._on_status_update)
-        self.worker.start()
+            ))
 
     # ── Window / response / clear ────────────────────────────────────
 
     def move_to_bottom_right(self):
-        # in Qt 6.  Use QApplication.primaryScreen().availableGeometry() instead.
         screen = QApplication.primaryScreen().availableGeometry()
         widget = self.geometry()
         x = screen.width() - widget.width() - 10
         y = screen.height() - widget.height() - 50
         self.move(x, y)
 
-    def display_response(self, bot_response: str):
+    def display_response(self, bot_response):
+        """
+        Final-reply slot. If streaming was active, replace the in-progress
+        bubble (located by anchor) with the authoritative final text.
+        Otherwise just append a fresh bubble.
+        """
         self._stop_thinking()
-        ts = _get_time()
-        idx = self._response_counter
-        self._response_counter += 1
-        self._bot_responses[idx] = bot_response
+        ts = self._stream_ts or _get_time()
+
+        if self._stream_buf is not None:
+            idx = self._stream_idx
+            anchor_cursor = self._find_stream_anchor_cursor()
+            if anchor_cursor is not None:
+                anchor_cursor.movePosition(QTextCursor.End, QTextCursor.KeepAnchor)
+                anchor_cursor.removeSelectedText()
+                anchor_cursor.insertHtml(_bot_bubble(bot_response, ts, idx))
+            else:
+                # Anchor lost (shouldn't happen) — append as a fallback.
+                self.chat_display.append(_bot_bubble(bot_response, ts, idx))
+        else:
+            idx = self._response_counter
+            self.chat_display.append(_bot_bubble(bot_response, ts, idx))
 
-        self.chat_display.append(_bot_bubble(bot_response, ts, idx))
+        self._bot_responses[idx] = bot_response
+        self._response_counter = max(self._response_counter, idx + 1)
         self.chat_history.append(f"Bot: {bot_response}")
+        self._reset_stream_state()
+
         self._scroll_to_bottom()
         self._update_ollama_status()
 
-        # Push a lightweight session entry into the sidebar immediately so
-        # the new chat appears at the top as soon as the first reply lands,
-        # without waiting for the debounced disk save (up to 5 seconds).
         self._sidebar.upsert_session({
             "id":         self._current_session_id,
             "title":      self._derive_session_title(),
@@ -2785,26 +2584,17 @@ def display_response(self, bot_response: str):
             "messages":   self.chat_history[-40:],
             "kind":       self._current_session_kind,
         })
-
         self._save_history()
 
-        # (Retry is now an inline link in every bot bubble;
-        # the old navbar retry_button has been removed.)
-
     def clear_session(self):
-        # Cancel any pending debounced save so _flush_save() can't
-        # resurrect the session file after we delete it below.
         self._save_debounce_timer.stop()
         self._save_pending = False
-
-        # Remove session file so it never reappears in the sidebar.
         session_file = os.path.join(_SESSIONS_DIR, f"{self._current_session_id}.json")
         try:
             if os.path.exists(session_file):
                 os.remove(session_file)
         except Exception:
             pass
-
         self.chat_display.setHtml(WELCOME_MESSAGE)
         self.chat_history = []
         self._retry_history = []
@@ -2818,18 +2608,14 @@ def clear_session(self):
         self._viewing_past_session = False
         self._current_session_kind = "text"
         self._session_title_override = None
-
-        # Assign a fresh session ID so the next conversation starts clean
         self._current_session_id = str(uuid.uuid4())
         self._session_created_at = datetime.now().strftime("%Y-%m-%d %H:%M")
-
+        self._reset_stream_state()
         try:
             if os.path.exists(_HISTORY_FILE):
                 os.remove(_HISTORY_FILE)
         except Exception:
             pass
-
-        # Refresh sidebar so the cleared session disappears immediately
         self._refresh_sidebar_if_open()
 
     # ── Debug helpers ────────────────────────────────────────────────
@@ -2847,15 +2633,12 @@ def debug_ollama(self):
         self._scroll_to_bottom()
         self._retry_history = list(self.chat_history)
         self._start_thinking()
-        self.worker = OllamaWorker(
+        self._start_worker(OllamaWorker(
             self.chat_history,
             model=self.model_combo.currentText(),
             temperature=self._temperature,
             num_predict=self._num_predict,
-        )
-        self.worker.response_signal.connect(self.display_response)
-        self.worker.status_signal.connect(self._on_status_update)
-        self.worker.start()
+        ))
         self.user_input.clear()
 
     def debug_error(self, log):
@@ -2863,14 +2646,11 @@ def debug_error(self, log):
         self.show()
         self.raise_()
         self.activateWindow()
-
         self.chat_history = []
         self._current_session_kind = "simulation_error"
-
         if os.path.exists(log):
             with open(log, "r") as f:
                 lines = [ln for ln in f.readlines() if ln.strip()]
-
             no_compat_index = next(
                 (i for i, ln in enumerate(lines) if "No compatibility mode selected!" in ln), None
             )
@@ -2878,7 +2658,6 @@ def debug_error(self, log):
             total_cpu_index = next(
                 (i for i, ln in enumerate(lines) if "Total CPU time (seconds)" in ln), None
             )
-
             before_no_compat = lines[:no_compat_index] if no_compat_index else []
             between = (
                 lines[circuit_index + 1:total_cpu_index]
@@ -2886,28 +2665,21 @@ def debug_error(self, log):
                 else []
             )
             filtered_lines = before_no_compat + between
-            # before sending to the model.  NgSpice logs can be 10-50 KB; sending
-            # all of it blows past num_ctx: 2048 and makes the model ignore the
-            # actual error.  The most actionable errors always appear at the end.
             if len(filtered_lines) > _MAX_ERROR_LOG_LINES:
                 truncated_notice = [
                     f"[Log truncated: showing last {_MAX_ERROR_LOG_LINES} "
                     f"of {len(filtered_lines)} lines]\n"
                 ]
                 filtered_lines = truncated_notice + filtered_lines[-_MAX_ERROR_LOG_LINES:]
-
             combined_text = "".join(filtered_lines)
-            # QLineEdit); display a compact summary label in the status bar instead.
             self.status_label.setText(
                 f"🔍 Analysing error log ({len(filtered_lines)} lines)…"
             )
-
             self.obj_appconfig = Appconfig()
             self.projDir = self.obj_appconfig.current_project["ProjectName"]
             output_file = os.path.join(self.projDir, "erroroutput.txt")
             with open(output_file, "w") as f:
                 f.writelines(filtered_lines)
-
             self.chat_history.append(
                 f"User: I got a simulation error. Here is the log:\n{combined_text}"
             )
diff --git a/src/frontEnd/DockArea.py b/src/frontEnd/DockArea.py
index 32d0682fb..da0378d4a 100755
--- a/src/frontEnd/DockArea.py
+++ b/src/frontEnd/DockArea.py
@@ -12,7 +12,7 @@
 from PyQt5.QtWidgets import QLineEdit, QLabel, QPushButton, QVBoxLayout, QHBoxLayout
 from PyQt5.QtCore import Qt
 import os
-from frontEnd.Chatbot import create_chatbot_dock
+from frontEnd.Chatbot import ChatbotGUI
 from converter.pspiceToKicad import PspiceConverter
 from converter.ltspiceToKicad import LTspiceConverter
 from converter.LtspiceLibConverter import LTspiceLibConverter