From 91b8522916e1944ce3829b6a1a3a3cd9206b6502 Mon Sep 17 00:00:00 2001
From: wb
Date: Wed, 6 May 2026 15:56:59 +0200
Subject: [PATCH] feat(diarization-ui): dynamic num_ctx for Ollama based on
 prompt size

Estimates the required context window from the prompt length (chars/3
plus a 2048-token response buffer) and rounds up to the nearest fixed
tier (4096, 8192, 16384, 32768, 65536). Fixed tiers prevent Ollama from
reloading the model on every call.

The chosen num_ctx is prepended to the stored llm_prompt so it's
visible in the debug window.

Co-Authored-By: Claude Sonnet 4.6
---
 app.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/app.py b/app.py
index 7b11f45..be38f5e 100644
--- a/app.py
+++ b/app.py
@@ -36,6 +36,14 @@ def now_iso() -> str:
     return datetime.utcnow().isoformat()
 
 
+def _estimate_num_ctx(prompt: str) -> int:
+    needed = len(prompt) // 3 + 2048  # rough token estimate + response buffer
+    for ctx in (4096, 8192, 16384, 32768, 65536):
+        if needed <= ctx:
+            return ctx
+    return 65536
+
+
 def init_db():
     os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
     with db() as c:
@@ -371,13 +379,14 @@ def _process_analysis_job(job_id: int):
         + f"\nTEXT:\n{doc['content_md']}\n"
     )
 
-    _job_set(job_id, llm_prompt=llm_prompt)
+    num_ctx = _estimate_num_ctx(llm_prompt)
+    _job_set(job_id, llm_prompt=f"[num_ctx={num_ctx}]\n\n{llm_prompt}")
     with _JOB_STREAM_LOCK:
         _JOB_STREAMS[job_id] = {"thinking": "", "response": ""}
 
     r = requests.post(
         f"{OLLAMA_BASE_URL}/api/generate",
-        json={"model": OLLAMA_MODEL, "prompt": llm_prompt, "stream": True},
+        json={"model": OLLAMA_MODEL, "prompt": llm_prompt, "stream": True, "options": {"num_ctx": num_ctx}},
         stream=True,
         timeout=1200,
     )
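
A quick sanity check of the tier rounding, for illustration only; the
prompt lengths below are arbitrary, not taken from the app:

    # chars // 3 + 2048 response buffer, rounded up to a fixed tier
    assert _estimate_num_ctx("x" * 3000) == 4096     # 1000 + 2048 = 3048
    assert _estimate_num_ctx("x" * 9000) == 8192     # 3000 + 2048 = 5048
    assert _estimate_num_ctx("x" * 60000) == 32768   # 20000 + 2048 = 22048
    assert _estimate_num_ctx("x" * 600000) == 65536  # above top tier, clamped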