fix(diarization-ui): raise default num_predict to 16384
Thinking tokens count against num_predict. At 4096 the model was running
out mid-response after spending ~3000 tokens on thinking; 16384 gives
enough headroom for thinking plus the full response.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
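For context on the fix: Ollama's num_predict option caps the total number of generated tokens, and with think enabled the model's hidden reasoning tokens draw from that same budget, so a 4096 cap can be exhausted by thinking alone before the visible answer finishes. Below is a minimal sketch of the call shape, assuming the UI posts directly to Ollama's /api/chat endpoint via the requests library; the chat() helper and the raise-on-truncation policy are illustrative, not code from this repo:

    import os
    import requests

    OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://gx10.aquantico.lan:11434").rstrip("/")
    OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:9b")
    OLLAMA_NUM_PREDICT = int(os.getenv("OLLAMA_NUM_PREDICT", "16384"))
    OLLAMA_THINK = os.getenv("OLLAMA_THINK", "true").lower() in ("1", "true", "yes")

    def chat(prompt: str) -> dict:
        # num_predict caps thinking + visible output combined, so a low
        # cap can be spent on thinking before the answer completes.
        resp = requests.post(
            f"{OLLAMA_BASE_URL}/api/chat",
            json={
                "model": OLLAMA_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "think": OLLAMA_THINK,
                "stream": False,
                "options": {"num_predict": OLLAMA_NUM_PREDICT},
            },
            timeout=600,
        )
        resp.raise_for_status()
        data = resp.json()
        # Ollama reports done_reason == "length" when the num_predict
        # budget ran out -- the mid-response truncation described above.
        if data.get("done_reason") == "length":
            raise RuntimeError("response truncated: raise OLLAMA_NUM_PREDICT")
        return data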
@@ -1,5 +1,5 @@
 API_BASE=http://gx10.aquantico.lan:8093
 OLLAMA_BASE_URL=http://gx10.aquantico.lan:11434
 OLLAMA_MODEL=qwen3.5:9b
-OLLAMA_NUM_PREDICT=4096
+OLLAMA_NUM_PREDICT=16384
 OLLAMA_THINK=true
app.py (+1 −1)
@@ -15,7 +15,7 @@ from fastapi.responses import HTMLResponse, PlainTextResponse, Response, JSONRes
 API_BASE = os.getenv("API_BASE", "http://gx10.aquantico.lan:8093").rstrip("/")
 OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://gx10.aquantico.lan:11434").rstrip("/")
 OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:9b")
-OLLAMA_NUM_PREDICT = int(os.getenv("OLLAMA_NUM_PREDICT", "4096"))
+OLLAMA_NUM_PREDICT = int(os.getenv("OLLAMA_NUM_PREDICT", "16384"))
 OLLAMA_THINK = os.getenv("OLLAMA_THINK", "true").lower() in ("1", "true", "yes")
 DB_PATH = os.getenv("DB_PATH", "/data/ui.db")
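To confirm the new headroom is actually sufficient, Ollama's non-streaming response includes eval_count, the number of tokens it generated (thinking included for thinking models). A hypothetical check built on the chat() sketch above; the prompt text is only an example:

    data = chat("Label the speakers in this transcript ...")
    used = data.get("eval_count", 0)
    # Values close to the cap mean the budget is still tight and the
    # next longer transcript may truncate again.
    print(f"generated {used}/{OLLAMA_NUM_PREDICT} tokens "
          f"({100 * used / OLLAMA_NUM_PREDICT:.0f}% of budget)")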