From 39250e6582cdf8812c26dec2797359950992b042 Mon Sep 17 00:00:00 2001
From: wb
Date: Wed, 6 May 2026 16:26:55 +0200
Subject: [PATCH] fix(diarization-ui): raise default num_predict to 16384

Thinking tokens count against num_predict. At 4096 the model was running
out mid-response after spending ~3000 tokens on thinking. 16384 gives
enough headroom for thinking + full response.

Co-Authored-By: Claude Sonnet 4.6
---
 .env.example | 2 +-
 app.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.env.example b/.env.example
index 43ec7bb..df50125 100644
--- a/.env.example
+++ b/.env.example
@@ -1,5 +1,5 @@
 API_BASE=http://gx10.aquantico.lan:8093
 OLLAMA_BASE_URL=http://gx10.aquantico.lan:11434
 OLLAMA_MODEL=qwen3.5:9b
-OLLAMA_NUM_PREDICT=4096
+OLLAMA_NUM_PREDICT=16384
 OLLAMA_THINK=true
diff --git a/app.py b/app.py
index f21a94a..7e47378 100644
--- a/app.py
+++ b/app.py
@@ -15,7 +15,7 @@ from fastapi.responses import HTMLResponse, PlainTextResponse, Response, JSONRes
 API_BASE = os.getenv("API_BASE", "http://gx10.aquantico.lan:8093").rstrip("/")
 OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://gx10.aquantico.lan:11434").rstrip("/")
 OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:9b")
-OLLAMA_NUM_PREDICT = int(os.getenv("OLLAMA_NUM_PREDICT", "4096"))
+OLLAMA_NUM_PREDICT = int(os.getenv("OLLAMA_NUM_PREDICT", "16384"))
 OLLAMA_THINK = os.getenv("OLLAMA_THINK", "true").lower() in ("1", "true", "yes")
 DB_PATH = os.getenv("DB_PATH", "/data/ui.db")