From dd541bd7e4fff79663bd24a3fc2ce2440f3808fa Mon Sep 17 00:00:00 2001
From: Alex Gorevski <algore@microsoft.com>
Date: Thu, 19 Feb 2026 13:19:05 -0800
Subject: [PATCH] docs(code): add decision-point comments to agent loop,
 security policy, and reliable provider

Adds section markers and decision-point comments to the three most complex
control-flow modules. Comments explain loop invariants, retry/fallback
strategy, security policy precedence rules, and error handling rationale.

This improves maintainability by making the reasoning behind complex
branches explicit for reviewers and future contributors.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/agent/loop_.rs        | 33 +++++++++++++++++++++++++++++++++
 src/providers/reliable.rs | 29 ++++++++++++++++++++++++++++-
 src/security/policy.rs    | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 97 insertions(+), 1 deletion(-)
diff --git a/src/agent/loop_.rs b/src/agent/loop_.rs
index 79e52ba..a59b055 100644
--- a/src/agent/loop_.rs
+++ b/src/agent/loop_.rs
@@ -590,6 +590,17 @@ fn parse_glm_style_tool_calls(text: &str) -> Vec<(String, serde_json::Value, Opt
     calls
 }
 
+// ── Tool-Call Parsing ─────────────────────────────────────────────────────
+// LLM responses may contain tool calls in multiple formats depending on
+// the provider. Parsing follows a priority chain:
+//   1. OpenAI-style JSON with `tool_calls` array (native API)
+//   2. XML tags: <tool_call>, <toolcall>, <tool-call>, <invoke>
+//   3. Markdown code blocks with `tool_call` language
+//   4. GLM-style line-based format (e.g. `shell/command>ls`)
+// SECURITY: We never fall back to extracting arbitrary JSON from the
+// response body, because that would enable prompt-injection attacks where
+// malicious content in emails/files/web pages mimics a tool call.
+
 /// Parse tool calls from an LLM response that uses XML-style function calling.
 ///
 /// Expected format (common with system-prompt-guided tool use):
@@ -874,6 +885,18 @@ pub(crate) async fn agent_turn(
     .await
 }
 
+// ── Agent Tool-Call Loop ──────────────────────────────────────────────────
+// Core agentic iteration: send conversation to the LLM, parse any tool
+// calls from the response, execute them, append results to history, and
+// repeat until the LLM produces a final text-only answer.
+//
+// Loop invariant: at the start of each iteration, `history` contains the
+// full conversation so far (system prompt + user messages + prior tool
+// results). The loop exits when:
+//   • the LLM returns no tool calls (final answer), or
+//   • max_iterations is reached (runaway safety), or
+//   • the cancellation token fires (external abort).
+
 /// Execute a single turn of the agent loop: send messages, parse tool calls,
 /// execute tools, and loop until the LLM produces a final text response.
 #[allow(clippy::too_many_arguments)]
@@ -972,6 +995,10 @@ pub(crate) async fn run_tool_call_loop(
                     });
 
                     let response_text = resp.text_or_empty().to_string();
+                    // First try native structured tool calls (OpenAI-format).
+                    // Fall back to text-based parsing (XML tags, markdown blocks,
+                    // GLM format) only if the provider returned no native calls —
+                    // this ensures we support both native and prompt-guided models.
                     let mut calls = parse_structured_tool_calls(&resp.tool_calls);
                     let mut parsed_text = String::new();
 
@@ -1190,6 +1217,12 @@ pub(crate) fn build_tool_instructions(tools_registry: &[Box<dyn Tool>]) -> Strin
     instructions
 }
 
+// ── CLI Entrypoint ───────────────────────────────────────────────────────
+// Wires up all subsystems (observer, runtime, security, memory, tools,
+// provider, hardware RAG, peripherals) and enters either single-shot or
+// interactive REPL mode. The interactive loop manages history compaction
+// and hard trimming to keep the context window bounded.
+
 #[allow(clippy::too_many_lines)]
 pub async fn run(
     config: Config,
diff --git a/src/providers/reliable.rs b/src/providers/reliable.rs
index 61812e7..d3621d5 100644
--- a/src/providers/reliable.rs
+++ b/src/providers/reliable.rs
@@ -6,18 +6,28 @@ use std::collections::HashMap;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::time::Duration;
 
+// ── Error Classification ─────────────────────────────────────────────────
+// Errors are split into retryable (transient server/network failures) and
+// non-retryable (permanent client errors). This distinction drives whether
+// the retry loop continues, falls back to the next provider, or aborts
+// immediately — avoiding wasted latency on errors that cannot self-heal.
+
 /// Check if an error is non-retryable (client errors that won't resolve with retries).
 fn is_non_retryable(err: &anyhow::Error) -> bool {
     if is_context_window_exceeded(err) {
         return true;
     }
 
+    // 4xx errors are generally non-retryable (bad request, auth failure, etc.),
+    // except 429 (rate-limit — transient) and 408 (timeout — worth retrying).
     if let Some(reqwest_err) = err.downcast_ref::<reqwest::Error>() {
         if let Some(status) = reqwest_err.status() {
             let code = status.as_u16();
             return status.is_client_error() && code != 429 && code != 408;
         }
     }
+    // Fallback: parse status codes from stringified errors (some providers
+    // embed codes in error messages rather than returning typed HTTP errors).
     let msg = err.to_string();
     for word in msg.split(|c: char| !c.is_ascii_digit()) {
         if let Ok(code) = word.parse::<u16>() {
@@ -27,6 +37,8 @@ fn is_non_retryable(err: &anyhow::Error) -> bool {
         }
     }
 
+    // Heuristic: detect auth/model failures by keyword when no HTTP status
+    // is available (e.g. gRPC or custom transport errors).
     let msg_lower = msg.to_lowercase();
     let auth_failure_hints = [
         "invalid api key",
@@ -197,6 +209,16 @@ fn push_failure(
     ));
 }
 
+// ── Resilient Provider Wrapper ────────────────────────────────────────────
+// Three-level failover strategy: model chain → provider chain → retry loop.
+//   Outer loop:  iterate model fallback chain (original model first, then
+//                configured alternatives).
+//   Middle loop: iterate registered providers in priority order.
+//   Inner loop:  retry the same (provider, model) pair with exponential
+//                backoff, rotating API keys on rate-limit errors.
+// Loop invariant: `failures` accumulates every failed attempt so the final
+// error message gives operators a complete diagnostic trail.
+
 /// Provider wrapper with retry, fallback, auth rotation, and model failover.
 pub struct ReliableProvider {
     providers: Vec<(String, Box<dyn Provider>)>,
@@ -288,6 +310,10 @@ impl Provider for ReliableProvider {
         let models = self.model_chain(model);
         let mut failures = Vec::new();
 
+        // Outer: model fallback chain. Middle: provider priority. Inner: retries.
+        // Each iteration: attempt one (provider, model) call. On success, return
+        // immediately. On non-retryable error, break to next provider. On
+        // retryable error, sleep with exponential backoff and retry.
         for current_model in &models {
             for (provider_name, provider) in &self.providers {
                 let mut backoff_ms = self.base_backoff_ms;
@@ -326,7 +352,8 @@ impl Provider for ReliableProvider {
                                 &error_detail,
                             );
 
-                            // On rate-limit, try rotating API key
+                            // Rate-limit with rotatable keys: cycle to the next API key
+                            // so the retry hits a different quota bucket.
                             if rate_limited && !non_retryable_rate_limit {
                                 if let Some(new_key) = self.rotate_key() {
                                     tracing::info!(
diff --git a/src/security/policy.rs b/src/security/policy.rs
index debc7f6..c6fe6aa 100644
--- a/src/security/policy.rs
+++ b/src/security/policy.rs
@@ -144,6 +144,12 @@ impl Default for SecurityPolicy {
     }
 }
 
+// ── Shell Command Parsing Utilities ───────────────────────────────────────
+// These helpers implement a minimal quote-aware shell lexer. They exist
+// because security validation must reason about the *structure* of a
+// command (separators, operators, quoting) rather than treating it as a
+// flat string — otherwise an attacker could hide dangerous sub-commands
+// inside quoted arguments or chained operators.
 /// Skip leading environment variable assignments (e.g. `FOO=bar cmd args`).
 /// Returns the remainder starting at the first non-assignment word.
 fn skip_env_assignments(s: &str) -> &str {
@@ -376,6 +382,11 @@ fn contains_unquoted_char(command: &str, target: char) -> bool {
 }
 
 impl SecurityPolicy {
+    // ── Risk Classification ──────────────────────────────────────────────
+    // Risk is assessed per-segment (split on shell operators), and the
+    // highest risk across all segments wins. This prevents bypasses like
+    // `ls && rm -rf /` from being classified as Low just because `ls` is safe.
+
     /// Classify command risk. Any high-risk segment marks the whole command high.
     pub fn command_risk_level(&self, command: &str) -> CommandRiskLevel {
         let mut saw_medium = false;
@@ -483,6 +494,15 @@ impl SecurityPolicy {
         }
     }
 
+    // ── Command Execution Policy Gate ──────────────────────────────────────
+    // Validation follows a strict precedence order:
+    //   1. Allowlist check (is the base command permitted at all?)
+    //   2. Risk classification (high / medium / low)
+    //   3. Policy flags (block_high_risk_commands, require_approval_for_medium_risk)
+    //   4. Autonomy level × approval status (supervised requires explicit approval)
+    // This ordering ensures deny-by-default: unknown commands are rejected
+    // before any risk or autonomy logic runs.
+
     /// Validate full command execution policy (allowlist + risk gate).
     pub fn validate_command_execution(
         &self,
@@ -520,6 +540,11 @@ impl SecurityPolicy {
         Ok(risk)
     }
 
+    // ── Layered Command Allowlist ──────────────────────────────────────────
+    // Defence-in-depth: five independent gates run in order before the
+    // per-segment allowlist check. Each gate targets a specific bypass
+    // technique. If any gate rejects, the whole command is blocked.
+
     /// Check if a shell command is allowed.
     ///
     /// Validates the **entire** command string, not just the first word:
@@ -627,6 +652,12 @@ impl SecurityPolicy {
         }
     }
 
+    // ── Path Validation ────────────────────────────────────────────────
+    // Layered checks: null-byte injection → component-level traversal →
+    // URL-encoded traversal → tilde expansion → absolute-path block →
+    // forbidden-prefix match. Each layer addresses a distinct escape
+    // technique; together they enforce workspace confinement.
+
     /// Check if a file path is allowed (no path traversal, within workspace)
     pub fn is_path_allowed(&self, path: &str) -> bool {
         // Block null bytes (can truncate paths in C-backed syscalls)
@@ -703,6 +734,11 @@ impl SecurityPolicy {
         self.autonomy != AutonomyLevel::ReadOnly
     }
 
+    // ── Tool Operation Gating ──────────────────────────────────────────────
+    // Read operations bypass autonomy and rate checks because they have
+    // no side effects. Act operations must pass both the autonomy gate
+    // (not read-only) and the sliding-window rate limiter.
+
     /// Enforce policy for a tool operation.
     ///
     /// Read operations are always allowed by autonomy/rate gates.