From 4f651fac1274a2ab85ffa2adc926d69e5a8aa36f Mon Sep 17 00:00:00 2001
From: wb
Date: Wed, 6 May 2026 15:50:40 +0200
Subject: [PATCH] fix(diarization-ui): capture thinking tokens in debug stream
 (Qwen3)

Ollama streaming chunks for thinking models put the reasoning tokens in a
separate "thinking" field. Previously only "response" was captured, leaving
the debug window empty while the model reasoned. Now both fields are tracked
independently: thinking is shown in blue above the final answer, and both are
persisted (thinking to the new llm_thinking column, the answer to the
existing llm_response column).

Co-Authored-By: Claude Sonnet 4.6
---
 app.py | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/app.py b/app.py
index e4ac8fd..7b11f45 100644
--- a/app.py
+++ b/app.py
@@ -110,6 +110,10 @@ def init_db():
         c.execute("ALTER TABLE jobs ADD COLUMN llm_response TEXT")
     except Exception:
         pass
+    try:
+        c.execute("ALTER TABLE jobs ADD COLUMN llm_thinking TEXT")
+    except Exception:
+        pass
 
     # defaults
     c.execute("INSERT OR IGNORE INTO projects(name, created_at) VALUES (?,?)", ("Default", now_iso()))
@@ -369,7 +373,7 @@ def _process_analysis_job(job_id: int):
         _job_set(job_id, llm_prompt=llm_prompt)
 
         with _JOB_STREAM_LOCK:
-            _JOB_STREAMS[job_id] = ""
+            _JOB_STREAMS[job_id] = {"thinking": "", "response": ""}
 
         r = requests.post(
             f"{OLLAMA_BASE_URL}/api/generate",
@@ -379,16 +383,18 @@ def _process_analysis_job(job_id: int):
         )
         r.raise_for_status()
 
-        accumulated = ""
+        acc_thinking = ""
+        acc_response = ""
         ollama_final = {}
         chunk_count = 0
         for line in r.iter_lines():
             if not line:
                 continue
             chunk = json.loads(line)
-            accumulated += chunk.get("response", "")
+            acc_thinking += chunk.get("thinking") or ""
+            acc_response += chunk.get("response") or ""
             with _JOB_STREAM_LOCK:
-                _JOB_STREAMS[job_id] = accumulated
+                _JOB_STREAMS[job_id] = {"thinking": acc_thinking, "response": acc_response}
             chunk_count += 1
             if chunk_count % 100 == 0:
                 j_check = _job_get(job_id)
@@ -398,7 +404,7 @@
                 ollama_final = chunk
                 break
 
-        answer = accumulated
+        answer = acc_response
 
         j = _job_get(job_id)
         if not j or j["status"] == "cancelled":
@@ -423,7 +429,7 @@
         )
         new_doc_id = cur.lastrowid
 
-        _job_set(job_id, status="done", result_document_id=new_doc_id, finished_at=now_iso(), llm_response=answer)
+        _job_set(job_id, status="done", result_document_id=new_doc_id, finished_at=now_iso(), llm_response=answer, llm_thinking=acc_thinking or None)
     except Exception as e:
         _job_set(job_id, status="error", error=str(e), finished_at=now_iso())
     finally:
@@ -1302,12 +1308,13 @@ def job_debug_data(job_id: int):
     if not j:
         raise HTTPException(404, "job not found")
     with _JOB_STREAM_LOCK:
-        live = _JOB_STREAMS.get(job_id)
+        live = dict(_JOB_STREAMS[job_id]) if job_id in _JOB_STREAMS else None
     return {
         "status": j["status"],
         "kind": j["kind"],
         "llm_prompt": j.get("llm_prompt"),
-        "llm_response": live if live is not None else j.get("llm_response"),
+        "llm_thinking": live["thinking"] if live is not None else j.get("llm_thinking"),
+        "llm_response": live["response"] if live is not None else j.get("llm_response"),
         "streaming": live is not None,
     }
 
@@ -1530,7 +1537,16 @@ async function refreshDebug() {{
   document.getElementById('debug-prompt-content').textContent = d.llm_prompt || '(kein Prompt gespeichert)';
   const respEl = document.getElementById('debug-response-content');
   const atBottom = respEl.scrollHeight - respEl.scrollTop <= respEl.clientHeight + 20;
-  respEl.textContent = d.llm_response || '(noch keine Antwort)';
+  const thinking = d.llm_thinking || '';
+  const response = d.llm_response || '';
+  if (thinking) {{
+    respEl.innerHTML =
+      `<span class="text-primary">▶ THINKING\n` +
+      `${{thinking.replace(/&/g, '&amp;').replace(/</g, '&lt;')}}</span>` +
+      (response ? `\n\n▶ ANTWORT\n${{response.replace(/&/g, '&amp;').replace(/</g, '&lt;')}}` : '');
+  }} else {{
+    respEl.textContent = response || '(noch keine Antwort)';
+  }}
   if (atBottom) respEl.scrollTop = respEl.scrollHeight;
   const badge = document.getElementById('debug-streaming-badge');
   badge.classList.toggle('d-none', !d.streaming);
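
Note for reviewers (not part of the patch): below is a minimal, standalone sketch of the chunk handling that the -379 hunk relies on, handy for poking at the stream outside app.py. The base URL, model name ("qwen3"), timeout and test prompt are illustrative assumptions, not values taken from this repo; thinking-capable models interleave "thinking" and "response" fragments until a final chunk arrives with "done": true.

    # Standalone sketch -- not part of app.py. Assumes a local Ollama serving
    # a thinking-capable model; adjust URL, model and timeout to your setup.
    import json
    import requests

    OLLAMA_BASE_URL = "http://localhost:11434"  # assumption: default Ollama port
    OLLAMA_MODEL = "qwen3"                      # assumption: any thinking-capable model

    def stream_generate(prompt: str) -> tuple[str, str]:
        """Accumulate "thinking" and "response" separately, as the patch does."""
        r = requests.post(
            f"{OLLAMA_BASE_URL}/api/generate",
            json={"model": OLLAMA_MODEL, "prompt": prompt, "stream": True},
            stream=True,
            timeout=600,
        )
        r.raise_for_status()
        thinking, response = "", ""
        for line in r.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            # Thinking models emit reasoning fragments in "thinking" and the
            # final answer in "response"; plain models only ever fill "response".
            thinking += chunk.get("thinking") or ""
            response += chunk.get("response") or ""
            if chunk.get("done"):
                break
        return thinking, response

    if __name__ == "__main__":
        t, a = stream_generate("Warum ist der Himmel blau?")
        print("THINKING:\n" + t + "\n\nANTWORT:\n" + a)

Run against a non-thinking model the first element simply stays empty, which is why the patch stores llm_thinking as acc_thinking or None.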