feat(sgx): add CLI RAG stack (Qdrant + embeddings gateway + rag tool)

Stand up document retrieval as shared, client-agnostic primitives rather than locking it inside Open WebUI:

- Qdrant as the LAN-reachable vector store
- LiteLLM gains a bge-m3 route so sgx:4000 also serves /v1/embeddings
- a thin `rag` CLI (ingest/query, optional coder synthesis) usable from any machine and from scripts

Embeddings and synthesis run on halo via the gateway; the CLI is configured entirely through RAG_* env vars.
2026-05-22 00:35:35 +02:00 · 2026-05-22 00:35:35 +02:00 · 95668b71a7
commit 95668b71a7
parent ab729a0720
4 changed files with 180 additions and 0 deletions
--- a/systems/x86_64-linux/sgx/default.nix
+++ b/systems/x86_64-linux/sgx/default.nix
@ -12,6 +12,7 @@
    ./wyoming.nix
    ./searx.nix
    ./litellm.nix
+    ./qdrant.nix
    ./uptime-kuma.nix
    ./firefly.nix
    ./opencode.nix
@ -25,6 +26,7 @@
  environment.systemPackages = with pkgs; [
    claude-code
    opencode
+    metacfg.rag # `rag` CLI (ingest/query) for the shared RAG stack
  ];

  services.tailscale.enable = true;
--- a/systems/x86_64-linux/sgx/litellm.nix
+++ b/systems/x86_64-linux/sgx/litellm.nix
@ -22,6 +22,16 @@
            api_key = "none"; # llama-server requires no key; value is ignored
          };
        }
+        {
+          # Multilingual embeddings model, also served by halo's router (its `[bge-m3]`
+          # preset). Makes this gateway answer /v1/embeddings for the rag CLI.
+          model_name = "bge-m3";
+          litellm_params = {
+            model = "openai/bge-m3";
+            api_base = "http://halo:8000/v1";
+            api_key = "none"; # placeholder, as in the entry above (same halo backend)
+          };
+        }
      ];

      general_settings = {
--- a/systems/x86_64-linux/sgx/qdrant.nix
+++ b/systems/x86_64-linux/sgx/qdrant.nix
@ -0,0 +1,9 @@
+_: {
+  # Shared Qdrant vector store for RAG; the rag CLI queries it from any LAN machine.
+  services.qdrant = {
+    enable = true;
+    settings.service.host = "0.0.0.0"; # bind all interfaces (default 127.0.0.1) so LAN clients can connect
+  };
+
+  networking.firewall.allowedTCPPorts = [ 6333 ]; # HTTP/REST API; NOTE(review): no API key is set in this module — confirm exposure is intended
+}