From aae53d91b112b5cb41537114341e8c240c963d64 Mon Sep 17 00:00:00 2001 From: wb Date: Wed, 6 May 2026 16:04:12 +0200 Subject: [PATCH] fix(diarization-ui): prevent repetition loops in Ollama generation Adds repeat_penalty=1.15 and repeat_last_n=128 to suppress token repetition loops (e.g. "tragen" -> "tragen" -> ...). Also caps output via num_predict (default 4096, configurable via OLLAMA_NUM_PREDICT env var) as a hard stop in case the model still gets stuck. Co-Authored-By: Claude Sonnet 4.6 --- .env.example | 1 + app.py | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.env.example b/.env.example index cf68448..147c8f1 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,4 @@ API_BASE=http://gx10.aquantico.lan:8093 OLLAMA_BASE_URL=http://gx10.aquantico.lan:11434 OLLAMA_MODEL=qwen3.5:9b +OLLAMA_NUM_PREDICT=4096 diff --git a/app.py b/app.py index be38f5e..35db195 100644 --- a/app.py +++ b/app.py @@ -15,6 +15,7 @@ from fastapi.responses import HTMLResponse, PlainTextResponse, Response, JSONRes API_BASE = os.getenv("API_BASE", "http://gx10.aquantico.lan:8093").rstrip("/") OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://gx10.aquantico.lan:11434").rstrip("/") OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:9b") +OLLAMA_NUM_PREDICT = int(os.getenv("OLLAMA_NUM_PREDICT", "4096")) DB_PATH = os.getenv("DB_PATH", "/data/ui.db") app = FastAPI(title="Diarization UI") @@ -386,7 +387,12 @@ def _process_analysis_job(job_id: int): r = requests.post( f"{OLLAMA_BASE_URL}/api/generate", - json={"model": OLLAMA_MODEL, "prompt": llm_prompt, "stream": True, "options": {"num_ctx": num_ctx}}, + json={"model": OLLAMA_MODEL, "prompt": llm_prompt, "stream": True, "options": { + "num_ctx": num_ctx, + "num_predict": OLLAMA_NUM_PREDICT, + "repeat_penalty": 1.15, + "repeat_last_n": 128, + }}, stream=True, timeout=1200, )