feat(sgx): add CLI RAG stack (Qdrant + embeddings gateway + rag tool)
Stand up document retrieval as shared, client-agnostic primitives rather than locking it inside Open WebUI: - Qdrant as the LAN-reachable vector store - LiteLLM gains a bge-m3 route so sgx:4000 also serves /v1/embeddings - a thin `rag` CLI (ingest/query, optional coder synthesis) usable from any machine and from scripts Embeddings and synthesis run on halo via the gateway; the CLI is configured entirely through RAG_* env vars.
This commit is contained in:
parent
ab729a0720
commit
95668b71a7
4 changed files with 180 additions and 0 deletions
159
packages/rag/default.nix
Normal file
159
packages/rag/default.nix
Normal file
|
|
@ -0,0 +1,159 @@
|
||||||
|
{ writers, python3Packages }:
|
||||||
|
# Thin CLI RAG over a shared Qdrant collection. Embeddings + synthesis go
|
||||||
|
# through the OpenAI-compatible gateway (LiteLLM on sgx -> halo's llama-server).
|
||||||
|
# Configure via env: RAG_API_BASE, RAG_API_KEY, RAG_QDRANT_URL, RAG_EMBED_MODEL,
|
||||||
|
# RAG_CHAT_MODEL, RAG_COLLECTION.
|
||||||
|
writers.writePython3Bin "rag"
|
||||||
|
{
|
||||||
|
libraries = [
|
||||||
|
python3Packages.openai
|
||||||
|
python3Packages.qdrant-client
|
||||||
|
];
|
||||||
|
flakeIgnore = [ "E501" ];
|
||||||
|
}
|
||||||
|
''
|
||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.models import Distance, PointStruct, VectorParams
|
||||||
|
|
||||||
|
API_BASE = os.environ.get("RAG_API_BASE", "http://localhost:4000/v1")
|
||||||
|
API_KEY = os.environ.get("RAG_API_KEY", "none")
|
||||||
|
QDRANT_URL = os.environ.get("RAG_QDRANT_URL", "http://localhost:6333")
|
||||||
|
EMBED_MODEL = os.environ.get("RAG_EMBED_MODEL", "bge-m3")
|
||||||
|
CHAT_MODEL = os.environ.get("RAG_CHAT_MODEL", "coder")
|
||||||
|
DEFAULT_COLLECTION = os.environ.get("RAG_COLLECTION", "docs")
|
||||||
|
|
||||||
|
client = OpenAI(base_url=API_BASE, api_key=API_KEY)
|
||||||
|
qdrant = QdrantClient(url=QDRANT_URL)
|
||||||
|
|
||||||
|
|
||||||
|
def embed(texts):
|
||||||
|
resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
|
||||||
|
return [d.embedding for d in resp.data]
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_text(text, size=1000, overlap=200):
|
||||||
|
chunks = []
|
||||||
|
start = 0
|
||||||
|
n = len(text)
|
||||||
|
while start < n:
|
||||||
|
end = min(start + size, n)
|
||||||
|
chunks.append(text[start:end])
|
||||||
|
if end == n:
|
||||||
|
break
|
||||||
|
start = end - overlap
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def iter_files(paths):
|
||||||
|
for p in paths:
|
||||||
|
path = pathlib.Path(p)
|
||||||
|
if path.is_dir():
|
||||||
|
for f in sorted(path.rglob("*")):
|
||||||
|
if f.is_file():
|
||||||
|
yield f
|
||||||
|
elif path.is_file():
|
||||||
|
yield path
|
||||||
|
|
||||||
|
|
||||||
|
def read_text(f):
|
||||||
|
try:
|
||||||
|
return f.read_text(encoding="utf-8")
|
||||||
|
except (UnicodeDecodeError, OSError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_collection(name, dim):
|
||||||
|
if not qdrant.collection_exists(name):
|
||||||
|
qdrant.create_collection(
|
||||||
|
name,
|
||||||
|
vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def point_id(source, idx):
|
||||||
|
h = hashlib.sha1(f"{source}:{idx}".encode()).hexdigest()
|
||||||
|
return int(h[:16], 16)
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_ingest(args):
|
||||||
|
collection = args.collection
|
||||||
|
total = 0
|
||||||
|
for f in iter_files(args.paths):
|
||||||
|
text = read_text(f)
|
||||||
|
if not text or not text.strip():
|
||||||
|
continue
|
||||||
|
chunks = chunk_text(text)
|
||||||
|
vectors = embed(chunks)
|
||||||
|
ensure_collection(collection, len(vectors[0]))
|
||||||
|
points = [
|
||||||
|
PointStruct(
|
||||||
|
id=point_id(str(f), i),
|
||||||
|
vector=v,
|
||||||
|
payload={"text": c, "source": str(f), "chunk": i},
|
||||||
|
)
|
||||||
|
for i, (c, v) in enumerate(zip(chunks, vectors))
|
||||||
|
]
|
||||||
|
qdrant.upsert(collection, points)
|
||||||
|
total += len(points)
|
||||||
|
print(f"ingested {len(points):4d} chunks {f}", file=sys.stderr)
|
||||||
|
print(f"done: {total} chunks into {collection!r}", file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_query(args):
|
||||||
|
collection = args.collection
|
||||||
|
qvec = embed([args.text])[0]
|
||||||
|
hits = qdrant.query_points(collection, query=qvec, limit=args.k).points
|
||||||
|
if not args.synthesize:
|
||||||
|
for h in hits:
|
||||||
|
src = h.payload.get("source", "?")
|
||||||
|
chunk = h.payload.get("chunk", "?")
|
||||||
|
txt = h.payload.get("text", "")
|
||||||
|
print(f"\n# {h.score:.3f} {src}#{chunk}\n{txt}")
|
||||||
|
return
|
||||||
|
context = "\n\n".join(
|
||||||
|
f"[{i + 1}] {h.payload.get('source')}\n{h.payload.get('text')}"
|
||||||
|
for i, h in enumerate(hits)
|
||||||
|
)
|
||||||
|
messages = [
|
||||||
|
{"role": "system", "content": "Answer the question using only the context below. Cite sources by their [n] tag. If the answer is not in the context, say so."},
|
||||||
|
{"role": "user", "content": f"Context:\n{context}\n\nQuestion: {args.text}"},
|
||||||
|
]
|
||||||
|
stream = client.chat.completions.create(model=CHAT_MODEL, messages=messages, stream=True)
|
||||||
|
for ev in stream:
|
||||||
|
delta = ev.choices[0].delta.content
|
||||||
|
if delta:
|
||||||
|
sys.stdout.write(delta)
|
||||||
|
sys.stdout.flush()
|
||||||
|
sys.stdout.write("\n")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(prog="rag", description="CLI RAG over a Qdrant collection using halo embeddings.")
|
||||||
|
sub = parser.add_subparsers(dest="cmd", required=True)
|
||||||
|
|
||||||
|
pi = sub.add_parser("ingest", help="embed files/dirs into a collection")
|
||||||
|
pi.add_argument("paths", nargs="+")
|
||||||
|
pi.add_argument("-c", "--collection", default=DEFAULT_COLLECTION)
|
||||||
|
pi.set_defaults(func=cmd_ingest)
|
||||||
|
|
||||||
|
pq = sub.add_parser("query", help="search a collection")
|
||||||
|
pq.add_argument("text")
|
||||||
|
pq.add_argument("-c", "--collection", default=DEFAULT_COLLECTION)
|
||||||
|
pq.add_argument("-k", type=int, default=5)
|
||||||
|
pq.add_argument("-s", "--synthesize", action="store_true", help="feed hits to the chat model for a written answer")
|
||||||
|
pq.set_defaults(func=cmd_query)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
args.func(args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
''
|
||||||
|
|
@ -12,6 +12,7 @@
|
||||||
./wyoming.nix
|
./wyoming.nix
|
||||||
./searx.nix
|
./searx.nix
|
||||||
./litellm.nix
|
./litellm.nix
|
||||||
|
./qdrant.nix
|
||||||
./uptime-kuma.nix
|
./uptime-kuma.nix
|
||||||
./firefly.nix
|
./firefly.nix
|
||||||
./opencode.nix
|
./opencode.nix
|
||||||
|
|
@ -25,6 +26,7 @@
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
claude-code
|
claude-code
|
||||||
opencode
|
opencode
|
||||||
|
metacfg.rag
|
||||||
];
|
];
|
||||||
|
|
||||||
services.tailscale.enable = true;
|
services.tailscale.enable = true;
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,16 @@
|
||||||
api_key = "none"; # llama-server requires no key; value is ignored
|
api_key = "none"; # llama-server requires no key; value is ignored
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
# Multilingual embeddings, also served by halo's router (the `[bge-m3]`
|
||||||
|
# preset). Exposes /v1/embeddings on this gateway for the rag CLI.
|
||||||
|
model_name = "bge-m3";
|
||||||
|
litellm_params = {
|
||||||
|
model = "openai/bge-m3";
|
||||||
|
api_base = "http://halo:8000/v1";
|
||||||
|
api_key = "none";
|
||||||
|
};
|
||||||
|
}
|
||||||
];
|
];
|
||||||
|
|
||||||
general_settings = {
|
general_settings = {
|
||||||
|
|
|
||||||
9
systems/x86_64-linux/sgx/qdrant.nix
Normal file
9
systems/x86_64-linux/sgx/qdrant.nix
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
_: {
|
||||||
|
# Shared vector store for RAG, queried from any LAN machine by the rag CLI.
|
||||||
|
services.qdrant = {
|
||||||
|
enable = true;
|
||||||
|
settings.service.host = "0.0.0.0"; # default 127.0.0.1; LAN-reachable
|
||||||
|
};
|
||||||
|
|
||||||
|
networking.firewall.allowedTCPPorts = [ 6333 ]; # HTTP/REST API
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue