# Model presets: [*] holds the defaults shared by every model; each later
# section names one model and overrides only what differs for it.
# NOTE(review): the key names look like llama-server (llama.cpp) options -
# confirm the consuming tool before relying on parser-specific behaviour.
# NOTE(review): bge-m3, coder and fast all set load-on-startup = true, while
# the bge-m3 notes mention --models-max 2 - confirm all three are meant to
# load at startup under that limit.
version = 1

[*]
# Hardware and runtime
n-gpu-layers = 99
threads = 8
mmap = false
fit = on
flash-attn = on

# Batching and context
parallel = 1
c = 131072
ubatch-size = 256
no-context-shift = true

# Cache types (k and v)
cache-type-k = bf16
cache-type-v = bf16

# Chat templating
jinja = true

# bge-m3: multilingual embedding model used for RAG. It is meant to stay
# loaded alongside `coder`, which requires running with --models-max 2.
# Two global defaults are overridden because they break embeddings:
#   ubatch-size - must cover the whole input; the global 256 would truncate.
#   c           - pinned to the model's 8192 maximum, not the global 131072.
# fit is also switched off for this model (the global value is on).
[bge-m3]
hf = ggml-org/bge-m3-Q8_0-GGUF
load-on-startup = true
embeddings = true
pooling = cls
c = 8192
ubatch-size = 8192
fit = off

# coder: overrides the global parallel (1 -> 2) and c (131072 -> 524288).
# NOTE(review): presumably c is shared across the 2 parallel slots, i.e.
# 262144 per slot - TODO confirm against the server's slot/context handling.
[coder]
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
load-on-startup = true
parallel = 2
c = 524288
threads-batch = 16

# Speculative decoding
spec-type = ngram-simple,draft-mtp
spec-draft-n-max = 5

# Sampling
temp = 0.6
top-k = 20
top-p = 0.95
min-p = 0.0
presence-penalty = 1.5

# Chat template arguments (JSON, passed through as written)
chat-template-kwargs = {"preserve_thinking": true}

# fast: parallel and c restate the [*] values explicitly; sampling values
# match [coder]; spec-draft-n-max is 3 here versus 5 in [coder].
[fast]
hf = byteshape/Qwen3.6-35B-A3B-MTP-GGUF:IQ4_XS
load-on-startup = true
parallel = 1
c = 131072
threads-batch = 16

# Speculative decoding
spec-type = ngram-simple,draft-mtp
spec-draft-n-max = 3

# Sampling
temp = 0.6
top-k = 20
top-p = 0.95
min-p = 0.0
presence-penalty = 1.5

# Chat template arguments (JSON, passed through as written)
chat-template-kwargs = {"preserve_thinking": true}