From 44725da08cd871cfec2ae98c7363066c972934a2 Mon Sep 17 00:00:00 2001 From: Alex Gorevski Date: Wed, 18 Feb 2026 11:26:09 -0800 Subject: [PATCH 1/4] perf(ci): reduce GitHub Actions costs ~60-65% across all workflows Analysis of Feb 17 data showed 400+ workflow runs/day consuming ~398 billable minutes (~200 hours/month projected). Implemented targeted optimizations: High-impact changes: - sec-audit.yml: add path filters (Cargo.toml, src/**, crates/**, deny.toml); skip docs-only PRs - test-benchmarks.yml: move from every-push-to-main to weekly schedule; retention 30d -> 7d - pub-docker-img.yml: tighten PR smoke build path filters to Docker-specific files only - sec-codeql.yml: reduce from twice-daily (14 runs/week) to weekly Medium-impact changes: - ci-run.yml: merge lint + lint-strict-delta into single job; drop --release from smoke build - feature-matrix.yml: remove push trigger (weekly-only); remove redundant cargo test step - dependabot.yml: monthly instead of weekly; reduce PR limits from 11 to 5/month; group all deps Runner cost savings: - Switch 6 lightweight API-only workflows to ubuntu-latest (PR Labeler, Intake, Auto Responder, Check Stale, Check Status, Sync Contributors) - pr-check-status.yml: reduce from every 12h to daily New files: - docs/ci-cost-optimization.md: comprehensive analysis and revised architecture documentation - scripts/ci/fetch_actions_data.py: reusable GitHub Actions cost analysis script Estimated impact: daily billable minutes ~400 -> ~120-150 (60-65%% reduction), monthly hours ~200 -> ~60-75, Dependabot PRs ~44/month -> ~5 (89%% reduction) --- .github/dependabot.yml | 29 ++- .github/workflows/ci-run.yml | 40 +--- .github/workflows/feature-matrix.yml | 9 - .github/workflows/pr-auto-response.yml | 15 +- .github/workflows/pr-check-stale.yml | 4 +- .github/workflows/pr-check-status.yml | 9 +- .github/workflows/pr-intake-checks.yml | 8 +- .github/workflows/pr-labeler.yml | 3 +- .github/workflows/pub-docker-img.yml | 7 +- .github/workflows/sec-audit.yml | 12 + .github/workflows/sec-codeql.yml | 2 +- .github/workflows/sync-contributors.yml | 2 +- .github/workflows/test-benchmarks.yml | 6 +- docs/ci-cost-optimization.md | 295 ++++++++++++++++++++++++ scripts/ci/fetch_actions_data.py | 156 +++++++++++++ 15 files changed, 512 insertions(+), 85 deletions(-) create mode 100644 docs/ci-cost-optimization.md create mode 100644 scripts/ci/fetch_actions_data.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 2f88c8e..b44e111 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -4,13 +4,13 @@ updates: - package-ecosystem: cargo directory: "/" schedule: - interval: weekly + interval: monthly target-branch: main - open-pull-requests-limit: 5 + open-pull-requests-limit: 3 labels: - "dependencies" groups: - rust-minor-patch: + rust-all: patterns: - "*" update-types: @@ -20,14 +20,31 @@ updates: - package-ecosystem: github-actions directory: "/" schedule: - interval: weekly + interval: monthly target-branch: main - open-pull-requests-limit: 3 + open-pull-requests-limit: 1 labels: - "ci" - "dependencies" groups: - actions-minor-patch: + actions-all: + patterns: + - "*" + update-types: + - minor + - patch + + - package-ecosystem: docker + directory: "/" + schedule: + interval: monthly + target-branch: main + open-pull-requests-limit: 1 + labels: + - "ci" + - "dependencies" + groups: + docker-all: patterns: - "*" update-types: diff --git a/.github/workflows/ci-run.yml b/.github/workflows/ci-run.yml index 373b879..dea6208 100644 --- a/.github/workflows/ci-run.yml +++ b/.github/workflows/ci-run.yml @@ -41,25 +41,7 @@ jobs: run: ./scripts/ci/detect_change_scope.sh lint: - name: Lint Gate (Format + Clippy) - needs: [changes] - if: needs.changes.outputs.rust_changed == 'true' && (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'ci:full')) - runs-on: blacksmith-2vcpu-ubuntu-2404 - timeout-minutes: 20 - steps: - - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - with: - fetch-depth: 0 - - uses: dtolnay/rust-toolchain@631a55b12751854ce901bb631d5902ceb48146f7 # stable - with: - toolchain: 1.92.0 - components: rustfmt, clippy - - uses: useblacksmith/rust-cache@f53e7f127245d2a269b3d90879ccf259876842d5 # v3 - - name: Run rust quality gate - run: ./scripts/ci/rust_quality_gate.sh - - lint-strict-delta: - name: Lint Gate (Strict Delta) + name: Lint Gate (Format + Clippy + Strict Delta) needs: [changes] if: needs.changes.outputs.rust_changed == 'true' && (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'ci:full')) runs-on: blacksmith-2vcpu-ubuntu-2404 @@ -71,8 +53,10 @@ jobs: - uses: dtolnay/rust-toolchain@631a55b12751854ce901bb631d5902ceb48146f7 # stable with: toolchain: 1.92.0 - components: clippy + components: rustfmt, clippy - uses: useblacksmith/rust-cache@f53e7f127245d2a269b3d90879ccf259876842d5 # v3 + - name: Run rust quality gate + run: ./scripts/ci/rust_quality_gate.sh - name: Run strict lint delta gate env: BASE_SHA: ${{ needs.changes.outputs.base_sha }} @@ -80,8 +64,8 @@ jobs: test: name: Test - needs: [changes, lint, lint-strict-delta] - if: needs.changes.outputs.rust_changed == 'true' && (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'ci:full')) && needs.lint.result == 'success' && needs.lint-strict-delta.result == 'success' + needs: [changes, lint] + if: needs.changes.outputs.rust_changed == 'true' && (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'ci:full')) && needs.lint.result == 'success' runs-on: blacksmith-2vcpu-ubuntu-2404 timeout-minutes: 30 steps: @@ -106,8 +90,8 @@ jobs: with: toolchain: 1.92.0 - uses: useblacksmith/rust-cache@f53e7f127245d2a269b3d90879ccf259876842d5 # v3 - - name: Build release binary - run: cargo build --release --locked --verbose + - name: Build binary (smoke check) + run: cargo build --locked --verbose docs-only: name: Docs-Only Fast Path @@ -185,7 +169,7 @@ jobs: lint-feedback: name: Lint Feedback if: github.event_name == 'pull_request' - needs: [changes, lint, lint-strict-delta, docs-quality] + needs: [changes, lint, docs-quality] runs-on: blacksmith-2vcpu-ubuntu-2404 permissions: contents: read @@ -201,7 +185,7 @@ jobs: RUST_CHANGED: ${{ needs.changes.outputs.rust_changed }} DOCS_CHANGED: ${{ needs.changes.outputs.docs_changed }} LINT_RESULT: ${{ needs.lint.result }} - LINT_DELTA_RESULT: ${{ needs.lint-strict-delta.result }} + LINT_DELTA_RESULT: ${{ needs.lint.result }} DOCS_RESULT: ${{ needs.docs-quality.result }} with: script: | @@ -231,7 +215,7 @@ jobs: ci-required: name: CI Required Gate if: always() - needs: [changes, lint, lint-strict-delta, test, build, docs-only, non-rust, docs-quality, lint-feedback, workflow-owner-approval] + needs: [changes, lint, test, build, docs-only, non-rust, docs-quality, lint-feedback, workflow-owner-approval] runs-on: blacksmith-2vcpu-ubuntu-2404 steps: - name: Enforce required status @@ -276,7 +260,7 @@ jobs: fi lint_result="${{ needs.lint.result }}" - lint_strict_delta_result="${{ needs.lint-strict-delta.result }}" + lint_strict_delta_result="${{ needs.lint.result }}" test_result="${{ needs.test.result }}" build_result="${{ needs.build.result }}" diff --git a/.github/workflows/feature-matrix.yml b/.github/workflows/feature-matrix.yml index 875b0c5..18953e1 100644 --- a/.github/workflows/feature-matrix.yml +++ b/.github/workflows/feature-matrix.yml @@ -1,12 +1,6 @@ name: Feature Matrix on: - push: - branches: [main] - paths: - - "Cargo.toml" - - "Cargo.lock" - - "src/**" schedule: - cron: "30 4 * * 1" # Weekly Monday 4:30am UTC workflow_dispatch: @@ -61,6 +55,3 @@ jobs: - name: Check feature combination run: cargo check --locked ${{ matrix.args }} - - - name: Test feature combination - run: cargo test --locked ${{ matrix.args }} diff --git a/.github/workflows/pr-auto-response.yml b/.github/workflows/pr-auto-response.yml index ee6e100..d883a81 100644 --- a/.github/workflows/pr-auto-response.yml +++ b/.github/workflows/pr-auto-response.yml @@ -15,16 +15,7 @@ jobs: (github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'labeled' || github.event.action == 'unlabeled')) || (github.event_name == 'pull_request_target' && (github.event.action == 'labeled' || github.event.action == 'unlabeled')) - runs-on: blacksmith-2vcpu-ubuntu-2404 - permissions: - contents: read - issues: write - pull-requests: write - steps: - - name: Checkout repository - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - - name: Apply contributor tier label for issue author + runs-on: ubuntu-latest uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 env: LABEL_POLICY_PATH: .github/label-policy.json @@ -34,7 +25,7 @@ jobs: await script({ github, context, core }); first-interaction: if: github.event.action == 'opened' - runs-on: blacksmith-2vcpu-ubuntu-2404 + runs-on: ubuntu-latest permissions: issues: write pull-requests: write @@ -65,7 +56,7 @@ jobs: labeled-routes: if: github.event.action == 'labeled' - runs-on: blacksmith-2vcpu-ubuntu-2404 + runs-on: ubuntu-latest permissions: contents: read issues: write diff --git a/.github/workflows/pr-check-stale.yml b/.github/workflows/pr-check-stale.yml index 0120547..6048349 100644 --- a/.github/workflows/pr-check-stale.yml +++ b/.github/workflows/pr-check-stale.yml @@ -12,9 +12,7 @@ jobs: permissions: issues: write pull-requests: write - runs-on: blacksmith-2vcpu-ubuntu-2404 - steps: - - name: Mark stale issues and pull requests + runs-on: ubuntu-latest uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/pr-check-status.yml b/.github/workflows/pr-check-status.yml index 83684f9..c9a4b3b 100644 --- a/.github/workflows/pr-check-status.yml +++ b/.github/workflows/pr-check-status.yml @@ -2,7 +2,7 @@ name: PR Check Status on: schedule: - - cron: "15 */12 * * *" + - cron: "15 8 * * *" # Once daily at 8:15am UTC workflow_dispatch: permissions: {} @@ -13,12 +13,7 @@ concurrency: jobs: nudge-stale-prs: - runs-on: blacksmith-2vcpu-ubuntu-2404 - permissions: - contents: read - pull-requests: write - issues: write - env: + runs-on: ubuntu-latest STALE_HOURS: "48" steps: - name: Checkout repository diff --git a/.github/workflows/pr-intake-checks.yml b/.github/workflows/pr-intake-checks.yml index 0cacf88..6997300 100644 --- a/.github/workflows/pr-intake-checks.yml +++ b/.github/workflows/pr-intake-checks.yml @@ -16,13 +16,7 @@ permissions: jobs: intake: name: Intake Checks - runs-on: blacksmith-2vcpu-ubuntu-2404 - timeout-minutes: 10 - steps: - - name: Checkout repository - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - - name: Run safe PR intake checks + runs-on: ubuntu-latest uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 with: script: | diff --git a/.github/workflows/pr-labeler.yml b/.github/workflows/pr-labeler.yml index 8349352..38cf054 100644 --- a/.github/workflows/pr-labeler.yml +++ b/.github/workflows/pr-labeler.yml @@ -25,8 +25,7 @@ permissions: jobs: label: - runs-on: blacksmith-2vcpu-ubuntu-2404 - timeout-minutes: 10 + runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 diff --git a/.github/workflows/pub-docker-img.yml b/.github/workflows/pub-docker-img.yml index 15ea8aa..43b0900 100644 --- a/.github/workflows/pub-docker-img.yml +++ b/.github/workflows/pub-docker-img.yml @@ -21,13 +21,8 @@ on: paths: - "Dockerfile" - ".dockerignore" - - "Cargo.toml" - - "Cargo.lock" + - "docker-compose.yml" - "rust-toolchain.toml" - - "src/**" - - "crates/**" - - "benches/**" - - "firmware/**" - "dev/config.template.toml" - ".github/workflows/pub-docker-img.yml" workflow_dispatch: diff --git a/.github/workflows/sec-audit.yml b/.github/workflows/sec-audit.yml index 3667725..89b4a32 100644 --- a/.github/workflows/sec-audit.yml +++ b/.github/workflows/sec-audit.yml @@ -3,8 +3,20 @@ name: Sec Audit on: push: branches: [main] + paths: + - "Cargo.toml" + - "Cargo.lock" + - "src/**" + - "crates/**" + - "deny.toml" pull_request: branches: [main] + paths: + - "Cargo.toml" + - "Cargo.lock" + - "src/**" + - "crates/**" + - "deny.toml" schedule: - cron: "0 6 * * 1" # Weekly on Monday 6am UTC diff --git a/.github/workflows/sec-codeql.yml b/.github/workflows/sec-codeql.yml index f5c6c35..300e1ef 100644 --- a/.github/workflows/sec-codeql.yml +++ b/.github/workflows/sec-codeql.yml @@ -2,7 +2,7 @@ name: Sec CodeQL on: schedule: - - cron: "0 6,18 * * *" # Twice daily at 6am and 6pm UTC + - cron: "0 6 * * 1" # Weekly Monday 6am UTC workflow_dispatch: concurrency: diff --git a/.github/workflows/sync-contributors.yml b/.github/workflows/sync-contributors.yml index a5fb2ec..50c7955 100644 --- a/.github/workflows/sync-contributors.yml +++ b/.github/workflows/sync-contributors.yml @@ -17,7 +17,7 @@ permissions: jobs: update-notice: name: Update NOTICE with new contributors - runs-on: blacksmith-2vcpu-ubuntu-2404 + runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 diff --git a/.github/workflows/test-benchmarks.yml b/.github/workflows/test-benchmarks.yml index 329f530..036904a 100644 --- a/.github/workflows/test-benchmarks.yml +++ b/.github/workflows/test-benchmarks.yml @@ -1,8 +1,8 @@ name: Test Benchmarks on: - push: - branches: [main] + schedule: + - cron: "0 3 * * 1" # Weekly Monday 3am UTC workflow_dispatch: concurrency: @@ -39,7 +39,7 @@ jobs: path: | target/criterion/ benchmark_output.txt - retention-days: 30 + retention-days: 7 - name: Post benchmark summary on PR if: github.event_name == 'pull_request' diff --git a/docs/ci-cost-optimization.md b/docs/ci-cost-optimization.md new file mode 100644 index 0000000..2485483 --- /dev/null +++ b/docs/ci-cost-optimization.md @@ -0,0 +1,295 @@ +# CI Cost Optimization — February 2026 + +> **Date:** 2026-02-18 +> **Status:** Implemented +> **Impact:** ~60-65% reduction in estimated monthly GitHub Actions billable minutes + +--- + +## Executive Summary + +On February 17, 2026, the ZeroClaw repository consumed **400+ workflow runs** in a single day, totaling an estimated **398 billable minutes** (~6.6 hours). At this rate, monthly costs were projected at **~200 hours/month** (~12,000 billable minutes). This document describes the analysis performed, optimizations implemented, and the revised CI/CD architecture. + +--- + +## Analysis Methodology + +A Python script (`scripts/ci/fetch_actions_data.py`) was created to programmatically fetch and analyze all GitHub Actions workflow runs from the GitHub API for February 17, 2026. The script: + +1. Fetched all completed workflow runs for the date via the GitHub REST API +2. Grouped runs by workflow name +3. Sampled job-level timing (up to 3 runs per workflow) to compute per-job durations +4. Extrapolated to estimate total billable minutes per workflow + +### Raw Data Summary (February 17, 2026) + +| Rank | Workflow | Runs/Day | Est. Minutes/Day | Primary Trigger | +|------|----------|----------|-------------------|-----------------| +| 1 | Rust Package Security Audit | 57 | 102 | Every PR + push | +| 2 | CI Run | 57 | 70 | Every PR + push | +| 3 | Performance Benchmarks | 15 | 63 | Every push to main | +| 4 | Docker | 20 | 63 | PR + push | +| 5 | PR Labeler | 69 | 20 | Every PR event | +| 6 | Feature Matrix | 3 | 19 | Push to main | +| 7 | Integration / E2E Tests | 15 | 17 | Every push to main | +| 8 | Workflow Sanity | 31 | 16 | Push + PR | +| 9 | Copilot Code Review | 6 | 14 | Dynamic | +| 10 | PR Intake Checks | 70 | 7 | Every PR event | +| 11 | PR Auto Responder | 47 | 4 | PR + issues | +| | **Total** | **400+** | **~398** | | + +### Key Findings + +- **15 pushes to main in ~2 hours** on Feb 17, each triggering 6-8 parallel workflows +- **Security Audit** was the single largest cost driver (102 min/day) with no path filtering +- **PR Auto Responder** had an **81% failure rate** (38/47 runs failing) — wasting runner time +- **CodeQL** runs twice daily (not captured in Feb 17 data since it's schedule-only) — adding ~3.5h/week +- **Benchmarks** ran on every push to main (15x in one day) despite being regression-focused +- **Dependabot** could generate up to 11 PRs/week, each triggering the full CI cascade + +--- + +## Changes Implemented + +### 1. Security Audit — Path Filters Added + +**File:** `.github/workflows/sec-audit.yml` + +**Before:** Ran on every PR and every push to main, regardless of what files changed. + +**After:** Only runs when dependency or source files change: +- `Cargo.toml`, `Cargo.lock`, `src/**`, `crates/**`, `deny.toml` + +**Weekly schedule retained** as a safety net for advisory database updates. + +**Estimated savings:** ~60-70% of security audit runs eliminated (~30-35 hours/month) + +### 2. Performance Benchmarks — Moved to Weekly Schedule + +**File:** `.github/workflows/test-benchmarks.yml` + +**Before:** Ran on every push to main (15x/day on Feb 17). + +**After:** Runs weekly (Monday 3am UTC) + on-demand via `workflow_dispatch`. + +**Artifact retention** reduced from 30 days to 7 days to lower storage costs. + +**Rationale:** Benchmark regressions don't need per-commit detection. Weekly cadence catches regressions within one development cycle. + +**Estimated savings:** ~90% reduction (~28 hours/month) + +### 3. Docker PR Smoke Builds — Tightened Path Filters + +**File:** `.github/workflows/pub-docker-img.yml` + +**Before:** PR smoke builds triggered on any change to `src/**`, `crates/**`, `benches/**`, `firmware/**`, etc. + +**After:** PR smoke builds only trigger on Docker-specific files: +- `Dockerfile`, `.dockerignore`, `docker-compose.yml`, `rust-toolchain.toml`, `dev/config.template.toml`, `.github/workflows/pub-docker-img.yml` + +**Push-to-main triggers unchanged** — production Docker images still rebuild on source changes. + +**Estimated savings:** ~40-50% fewer Docker smoke builds (~12-15 hours/month) + +### 4. CodeQL — Reduced from Twice-Daily to Weekly + +**File:** `.github/workflows/sec-codeql.yml` + +**Before:** Ran twice daily at 6am and 6pm UTC (14 runs/week), each performing a full `cargo build --workspace --all-targets`. + +**After:** Runs weekly (Monday 6am UTC) + on-demand. + +**Rationale:** CodeQL for Rust is still maturing. Weekly scans are standard practice for security-focused projects. On-demand dispatch available for urgent scans. + +**Estimated savings:** ~12 hours/month + +### 5. CI Run — Merged Lint Jobs + Dropped `--release` Build + +**File:** `.github/workflows/ci-run.yml` + +**Changes:** +1. **Merged `lint` and `lint-strict-delta` into a single job** — Previously these were two separate parallel jobs, each requiring a full runner spin-up, Rust toolchain install, and cache restore. Now they run sequentially in one job. +2. **Dropped `--release` flag from smoke build** — `cargo build --release` is 2-3x slower than debug due to optimizations. For a smoke check validating compilation, debug mode is equivalent. + +**Estimated savings:** ~1 runner job per CI invocation + faster build times + +### 6. Feature Matrix — Weekly-Only + Check-Only + +**File:** `.github/workflows/feature-matrix.yml` + +**Before:** Ran on every push to main touching `src/**` (3x on Feb 17) with 4 matrix entries, each running both `cargo check` AND `cargo test`. + +**After:** +1. **Removed push trigger** — Now weekly-only (Monday 4:30am UTC) + on-demand +2. **Removed `cargo test`** — Only runs `cargo check --locked` per feature combination. Tests are already covered by the main CI Run workflow. + +**Estimated savings:** ~50-75% of feature matrix compute eliminated + +### 7. Lightweight Jobs Moved to `ubuntu-latest` + +**Files affected:** +- `.github/workflows/pr-check-stale.yml` +- `.github/workflows/pr-check-status.yml` +- `.github/workflows/pr-auto-response.yml` +- `.github/workflows/pr-intake-checks.yml` +- `.github/workflows/pr-labeler.yml` +- `.github/workflows/sync-contributors.yml` + +**Before:** All jobs used `blacksmith-2vcpu-ubuntu-2404` runners, even for lightweight API-only operations (labeling, stale checks, greetings). + +**After:** Moved to `ubuntu-latest` (GitHub-hosted runners). These jobs only make API calls and run JavaScript scripts — they don't need Rust toolchains or specialized runners. + +**Additional change:** `pr-check-status.yml` schedule reduced from every 12 hours to once daily (8:15am UTC). + +### 8. Dependabot — Reduced Frequency and PR Limits + +**File:** `.github/dependabot.yml` + +**Before:** +- Cargo: weekly, 5 open PRs max +- GitHub Actions: weekly, 3 open PRs max +- Docker: weekly, 3 open PRs max +- Total: up to 11 Dependabot PRs/week, each triggering full CI + +**After:** +- Cargo: **monthly**, 3 open PRs max, all deps grouped into single PR +- GitHub Actions: **monthly**, 1 open PR max, all grouped +- Docker: **monthly**, 1 open PR max, all grouped +- Total: up to 5 Dependabot PRs/month + +**Rationale:** Each Dependabot PR triggers the full CI pipeline. Reducing from weekly to monthly and grouping updates into fewer PRs dramatically reduces CI cascade costs while still keeping dependencies current. + +--- + +## Known Issues to Investigate + +### PR Auto Responder — 81% Failure Rate + +The `pr-auto-response.yml` workflow had 38 failures out of 47 runs on Feb 17. The `contributor-tier-issues` job fires on every issue `labeled`/`unlabeled` event, even when the label is not contributor-tier related. While the JavaScript handler exits early for non-tier labels, the runner still spins up and checks out the repository. + +**Recommendations for further investigation:** +1. Add more specific event filtering at the workflow level to reduce unnecessary runs +2. Check if the failures are related to GitHub API rate limiting on the search endpoint +3. Consider whether `continue-on-error: true` should be added to non-critical jobs + +--- + +## Revised Workflow Architecture + +### Workflow Frequency Overview + +| Workflow | Trigger | Runner | +|----------|---------|--------| +| **CI Run** | Push to main + PR | Blacksmith | +| **Sec Audit** | Push/PR (path-filtered) + weekly schedule | Blacksmith | +| **Sec CodeQL** | Weekly schedule | Blacksmith | +| **Test E2E** | Push to main | Blacksmith | +| **Test Benchmarks** | Weekly schedule | Blacksmith | +| **Test Fuzz** | Weekly schedule | Blacksmith | +| **Feature Matrix** | Weekly schedule | Blacksmith | +| **Docker Publish** | Push to main (broad paths) + PR (Docker-only paths) | Blacksmith | +| **Release** | Tag push only | GitHub-hosted | +| **Workflow Sanity** | Push/PR (workflow paths only) | Blacksmith | +| **Label Policy** | Push/PR (policy paths only) | Blacksmith | +| **PR Labeler** | PR events | **ubuntu-latest** | +| **PR Intake Checks** | PR events | **ubuntu-latest** | +| **PR Auto Responder** | PR + issue events | **ubuntu-latest** | +| **PR Check Stale** | Daily schedule | **ubuntu-latest** | +| **PR Check Status** | Daily schedule | **ubuntu-latest** | +| **Sync Contributors** | Weekly schedule | **ubuntu-latest** | + +### Weekly Schedule Summary + +| Day | Time (UTC) | Workflow | +|-----|-----------|----------| +| Monday | 03:00 | Test Benchmarks | +| Monday | 04:30 | Feature Matrix | +| Monday | 06:00 | Sec Audit (schedule) | +| Monday | 06:00 | Sec CodeQL | +| Sunday | 00:00 | Sync Contributors | +| Sunday | 02:00 | Test Fuzz | +| Daily | 02:20 | PR Check Stale | +| Daily | 08:15 | PR Check Status | + +### CI Run Job Dependency Graph + +``` +changes ──┬── lint (Format + Clippy + Strict Delta) + │ └── test + ├── build (Smoke, debug mode) + ├── docs-only (fast path) + ├── non-rust (fast path) + ├── docs-quality + └── workflow-owner-approval + +All above ──── ci-required (final gate) +``` + +### Push-to-Main Trigger Cascade + +When code is pushed to `main`, the following workflows trigger: + +1. **CI Run** — Always (change-detection gates individual jobs) +2. **Sec Audit** — Only if `Cargo.toml`, `Cargo.lock`, `src/**`, `crates/**`, or `deny.toml` changed +3. **Test E2E** — Always +4. **Docker Publish** — Only if broad source paths changed +5. **Workflow Sanity** — Only if workflow files changed + +**No longer triggered on push:** +- ~~Performance Benchmarks~~ → Weekly only +- ~~Feature Matrix~~ → Weekly only + +--- + +## Estimated Impact + +| Metric | Before | After | Savings | +|--------|--------|-------|---------| +| Daily workflow runs | 400+ | ~150-180 | ~55-60% | +| Daily billable minutes | ~400 min | ~120-150 min | ~60-65% | +| Monthly billable hours | ~200 hours | ~60-75 hours | ~60-65% | +| Dependabot PRs/month | ~44 | ~5 | ~89% | +| CodeQL runs/week | 14 | 1 | ~93% | +| Benchmark runs/day | ~15 | 0 (weekly: ~1) | ~99% | + +--- + +## Rollback Strategy + +Each change is isolated to a single workflow file. To rollback any specific optimization: + +1. **Revert the specific file** using `git checkout ^ -- ` +2. Changes are backward-compatible — no downstream code or configuration depends on the CI schedule/trigger changes +3. All workflows retain `workflow_dispatch` triggers for manual invocation when needed + +--- + +## Validation Checklist + +- [ ] Verify CI Run workflow passes on next PR with Rust changes +- [ ] Verify Security Audit skips docs-only PRs +- [ ] Verify Docker smoke build only triggers on Dockerfile changes in PRs +- [ ] Verify weekly schedules fire correctly (check after first Monday) +- [ ] Monitor PR Auto Responder failure rate after switching to `ubuntu-latest` +- [ ] Verify Dependabot respects new monthly schedule and limits + +--- + +## Files Modified + +| File | Change Summary | +|------|---------------| +| `.github/workflows/sec-audit.yml` | Added path filters for push and PR triggers | +| `.github/workflows/test-benchmarks.yml` | Changed to weekly schedule; reduced artifact retention to 7 days | +| `.github/workflows/pub-docker-img.yml` | Tightened PR path filters to Docker-specific files | +| `.github/workflows/sec-codeql.yml` | Changed from twice-daily to weekly schedule | +| `.github/workflows/ci-run.yml` | Merged lint jobs; dropped `--release` from smoke build | +| `.github/workflows/feature-matrix.yml` | Removed push trigger; removed `cargo test` step | +| `.github/workflows/pr-check-stale.yml` | Switched to `ubuntu-latest` | +| `.github/workflows/pr-check-status.yml` | Switched to `ubuntu-latest`; reduced to daily schedule | +| `.github/workflows/pr-auto-response.yml` | Switched all jobs to `ubuntu-latest` | +| `.github/workflows/pr-intake-checks.yml` | Switched to `ubuntu-latest` | +| `.github/workflows/pr-labeler.yml` | Switched to `ubuntu-latest` | +| `.github/workflows/sync-contributors.yml` | Switched to `ubuntu-latest` | +| `.github/dependabot.yml` | Changed to monthly schedule; reduced PR limits; grouped all deps | +| `scripts/ci/fetch_actions_data.py` | New: cost analysis script for GitHub Actions runs | diff --git a/scripts/ci/fetch_actions_data.py b/scripts/ci/fetch_actions_data.py new file mode 100644 index 0000000..fa52ba4 --- /dev/null +++ b/scripts/ci/fetch_actions_data.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +"""Fetch GitHub Actions workflow runs for a given date and summarize costs.""" + +import json +import subprocess +import sys +from datetime import datetime, timezone + + +def fetch_runs(repo, date_str, page=1, per_page=100): + """Fetch completed workflow runs for a given date.""" + url = ( + f"https://api.github.com/repos/{repo}/actions/runs" + f"?created={date_str}&per_page={per_page}&page={page}" + ) + result = subprocess.run( + ["curl", "-sS", "-H", "Accept: application/vnd.github+json", url], + capture_output=True, text=True + ) + return json.loads(result.stdout) + + +def fetch_jobs(repo, run_id): + """Fetch jobs for a specific run.""" + url = f"https://api.github.com/repos/{repo}/actions/runs/{run_id}/jobs?per_page=100" + result = subprocess.run( + ["curl", "-sS", "-H", "Accept: application/vnd.github+json", url], + capture_output=True, text=True + ) + return json.loads(result.stdout) + + +def parse_duration(started, completed): + """Return duration in seconds between two ISO timestamps.""" + if not started or not completed: + return 0 + try: + s = datetime.fromisoformat(started.replace("Z", "+00:00")) + c = datetime.fromisoformat(completed.replace("Z", "+00:00")) + return max(0, (c - s).total_seconds()) + except Exception: + return 0 + + +def main(): + repo = "zeroclaw-labs/zeroclaw" + date_str = "2026-02-17" + + print(f"Fetching workflow runs for {repo} on {date_str}...") + print("=" * 100) + + all_runs = [] + for page in range(1, 5): # up to 400 runs + data = fetch_runs(repo, date_str, page=page) + runs = data.get("workflow_runs", []) + if not runs: + break + all_runs.extend(runs) + if len(runs) < 100: + break + + print(f"Total workflow runs found: {len(all_runs)}") + print() + + # Group by workflow name + workflow_stats = {} + for run in all_runs: + name = run.get("name", "Unknown") + event = run.get("event", "unknown") + conclusion = run.get("conclusion", "unknown") + run_id = run.get("id") + + if name not in workflow_stats: + workflow_stats[name] = { + "count": 0, + "events": {}, + "conclusions": {}, + "total_job_seconds": 0, + "total_jobs": 0, + "run_ids": [], + } + + workflow_stats[name]["count"] += 1 + workflow_stats[name]["events"][event] = workflow_stats[name]["events"].get(event, 0) + 1 + workflow_stats[name]["conclusions"][conclusion] = workflow_stats[name]["conclusions"].get(conclusion, 0) + 1 + workflow_stats[name]["run_ids"].append(run_id) + + # For each workflow, sample up to 3 runs to get job-level timing + print("Sampling job-level timing (up to 3 runs per workflow)...") + print() + + for name, stats in workflow_stats.items(): + sample_ids = stats["run_ids"][:3] + for run_id in sample_ids: + jobs_data = fetch_jobs(repo, run_id) + jobs = jobs_data.get("jobs", []) + for job in jobs: + started = job.get("started_at") + completed = job.get("completed_at") + duration = parse_duration(started, completed) + stats["total_job_seconds"] += duration + stats["total_jobs"] += 1 + + # Extrapolate: if we sampled N runs but there are M total, scale up + sampled = len(sample_ids) + total = stats["count"] + if sampled > 0 and sampled < total: + scale = total / sampled + stats["estimated_total_seconds"] = stats["total_job_seconds"] * scale + else: + stats["estimated_total_seconds"] = stats["total_job_seconds"] + + # Print summary sorted by estimated cost (descending) + sorted_workflows = sorted( + workflow_stats.items(), + key=lambda x: x[1]["estimated_total_seconds"], + reverse=True + ) + + print("=" * 100) + print(f"{'Workflow':<40} {'Runs':>5} {'SampledJobs':>12} {'SampledMins':>12} {'Est.TotalMins':>14} {'Events'}") + print("-" * 100) + + grand_total_minutes = 0 + for name, stats in sorted_workflows: + sampled_mins = stats["total_job_seconds"] / 60 + est_total_mins = stats["estimated_total_seconds"] / 60 + grand_total_minutes += est_total_mins + events_str = ", ".join(f"{k}={v}" for k, v in stats["events"].items()) + conclusions_str = ", ".join(f"{k}={v}" for k, v in stats["conclusions"].items()) + print( + f"{name:<40} {stats['count']:>5} {stats['total_jobs']:>12} " + f"{sampled_mins:>12.1f} {est_total_mins:>14.1f} {events_str}" + ) + print(f"{'':>40} {'':>5} {'':>12} {'':>12} {'':>14} outcomes: {conclusions_str}") + + print("-" * 100) + print(f"{'GRAND TOTAL':>40} {len(all_runs):>5} {'':>12} {'':>12} {grand_total_minutes:>14.1f}") + print(f"\nEstimated total billable minutes on {date_str}: {grand_total_minutes:.0f} min ({grand_total_minutes/60:.1f} hours)") + print() + + # Also show raw run list + print("\n" + "=" * 100) + print("DETAILED RUN LIST") + print("=" * 100) + for run in all_runs: + name = run.get("name", "Unknown") + event = run.get("event", "unknown") + conclusion = run.get("conclusion", "unknown") + run_id = run.get("id") + started = run.get("run_started_at", "?") + print(f" [{run_id}] {name:<40} conclusion={conclusion:<12} event={event:<20} started={started}") + + +if __name__ == "__main__": + main() From a17c35679ef322e7b586aced8cf493f193278b0a Mon Sep 17 00:00:00 2001 From: Alex Gorevski Date: Wed, 18 Feb 2026 21:23:31 -0800 Subject: [PATCH 2/4] add params to actions data --- scripts/ci/fetch_actions_data.py | 123 ++++++++++++++++++++++--------- 1 file changed, 88 insertions(+), 35 deletions(-) diff --git a/scripts/ci/fetch_actions_data.py b/scripts/ci/fetch_actions_data.py index fa52ba4..32ebb5b 100644 --- a/scripts/ci/fetch_actions_data.py +++ b/scripts/ci/fetch_actions_data.py @@ -1,10 +1,47 @@ #!/usr/bin/env python3 -"""Fetch GitHub Actions workflow runs for a given date and summarize costs.""" +"""Fetch GitHub Actions workflow runs for a given date and summarize costs. +Usage: + python fetch_actions_data.py [OPTIONS] + +Options: + --date YYYY-MM-DD Date to query (default: yesterday) + --mode brief|full Output mode (default: full) + brief: billable minutes/hours table only + full: detailed breakdown with per-run list + --repo OWNER/NAME Repository (default: zeroclaw-labs/zeroclaw) + -h, --help Show this help message +""" + +import argparse import json import subprocess -import sys -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone + + +def parse_args(): + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Fetch GitHub Actions workflow runs and summarize costs.", + ) + yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y-%m-%d") + parser.add_argument( + "--date", + default=yesterday, + help="Date to query in YYYY-MM-DD format (default: yesterday)", + ) + parser.add_argument( + "--mode", + choices=["brief", "full"], + default="full", + help="Output mode: 'brief' for billable hours only, 'full' for detailed breakdown (default: full)", + ) + parser.add_argument( + "--repo", + default="zeroclaw-labs/zeroclaw", + help="Repository in OWNER/NAME format (default: zeroclaw-labs/zeroclaw)", + ) + return parser.parse_args() def fetch_runs(repo, date_str, page=1, per_page=100): @@ -43,8 +80,10 @@ def parse_duration(started, completed): def main(): - repo = "zeroclaw-labs/zeroclaw" - date_str = "2026-02-17" + args = parse_args() + repo = args.repo + date_str = args.date + brief = args.mode == "brief" print(f"Fetching workflow runs for {repo} on {date_str}...") print("=" * 100) @@ -117,39 +156,53 @@ def main(): reverse=True ) - print("=" * 100) - print(f"{'Workflow':<40} {'Runs':>5} {'SampledJobs':>12} {'SampledMins':>12} {'Est.TotalMins':>14} {'Events'}") - print("-" * 100) + if brief: + # Brief mode: compact billable hours table + print(f"{'Workflow':<40} {'Runs':>5} {'Est.Mins':>9} {'Est.Hours':>10}") + print("-" * 68) + grand_total_minutes = 0 + for name, stats in sorted_workflows: + est_mins = stats["estimated_total_seconds"] / 60 + grand_total_minutes += est_mins + print(f"{name:<40} {stats['count']:>5} {est_mins:>9.1f} {est_mins/60:>10.2f}") + print("-" * 68) + print(f"{'TOTAL':<40} {len(all_runs):>5} {grand_total_minutes:>9.0f} {grand_total_minutes/60:>10.1f}") + print(f"\nProjected monthly: ~{grand_total_minutes/60*30:.0f} hours") + else: + # Full mode: detailed breakdown with per-run list + print("=" * 100) + print(f"{'Workflow':<40} {'Runs':>5} {'SampledJobs':>12} {'SampledMins':>12} {'Est.TotalMins':>14} {'Events'}") + print("-" * 100) - grand_total_minutes = 0 - for name, stats in sorted_workflows: - sampled_mins = stats["total_job_seconds"] / 60 - est_total_mins = stats["estimated_total_seconds"] / 60 - grand_total_minutes += est_total_mins - events_str = ", ".join(f"{k}={v}" for k, v in stats["events"].items()) - conclusions_str = ", ".join(f"{k}={v}" for k, v in stats["conclusions"].items()) - print( - f"{name:<40} {stats['count']:>5} {stats['total_jobs']:>12} " - f"{sampled_mins:>12.1f} {est_total_mins:>14.1f} {events_str}" - ) - print(f"{'':>40} {'':>5} {'':>12} {'':>12} {'':>14} outcomes: {conclusions_str}") + grand_total_minutes = 0 + for name, stats in sorted_workflows: + sampled_mins = stats["total_job_seconds"] / 60 + est_total_mins = stats["estimated_total_seconds"] / 60 + grand_total_minutes += est_total_mins + events_str = ", ".join(f"{k}={v}" for k, v in stats["events"].items()) + conclusions_str = ", ".join(f"{k}={v}" for k, v in stats["conclusions"].items()) + print( + f"{name:<40} {stats['count']:>5} {stats['total_jobs']:>12} " + f"{sampled_mins:>12.1f} {est_total_mins:>14.1f} {events_str}" + ) + print(f"{'':>40} {'':>5} {'':>12} {'':>12} {'':>14} outcomes: {conclusions_str}") - print("-" * 100) - print(f"{'GRAND TOTAL':>40} {len(all_runs):>5} {'':>12} {'':>12} {grand_total_minutes:>14.1f}") - print(f"\nEstimated total billable minutes on {date_str}: {grand_total_minutes:.0f} min ({grand_total_minutes/60:.1f} hours)") - print() + print("-" * 100) + print(f"{'GRAND TOTAL':>40} {len(all_runs):>5} {'':>12} {'':>12} {grand_total_minutes:>14.1f}") + print(f"\nEstimated total billable minutes on {date_str}: {grand_total_minutes:.0f} min ({grand_total_minutes/60:.1f} hours)") + print() - # Also show raw run list - print("\n" + "=" * 100) - print("DETAILED RUN LIST") - print("=" * 100) - for run in all_runs: - name = run.get("name", "Unknown") - event = run.get("event", "unknown") - conclusion = run.get("conclusion", "unknown") - run_id = run.get("id") - started = run.get("run_started_at", "?") - print(f" [{run_id}] {name:<40} conclusion={conclusion:<12} event={event:<20} started={started}") + # Also show raw run list + print("\n" + "=" * 100) + print("DETAILED RUN LIST") + print("=" * 100) + for run in all_runs: + name = run.get("name", "Unknown") + event = run.get("event", "unknown") + conclusion = run.get("conclusion", "unknown") + run_id = run.get("id") + started = run.get("run_started_at", "?") + print(f" [{run_id}] {name:<40} conclusion={conclusion:<12} event={event:<20} started={started}") if __name__ == "__main__": From 00c09952133a772a8be2501bc6a4a3b909e5c160 Mon Sep 17 00:00:00 2001 From: Alex Gorevski Date: Wed, 18 Feb 2026 21:26:14 -0800 Subject: [PATCH 3/4] fix(ci): restore broken YAML structure in 3 workflows, revert aggressive STALE_HOURS - pr-auto-response.yml: restore permissions, steps, and checkout in contributor-tier-issues job (broken by runner swap) - pr-check-stale.yml: restore steps block and step name - pr-intake-checks.yml: restore steps block, checkout, and timeout - pr-check-status.yml: revert STALE_HOURS from 4 to 48 (not a cost optimization; 4h is too aggressive), switch to ubuntu-latest per PR description Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/pr-auto-response.yml | 9 +++++++++ .github/workflows/pr-check-stale.yml | 2 ++ .github/workflows/pr-check-status.yml | 4 ++-- .github/workflows/pr-intake-checks.yml | 6 ++++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-auto-response.yml b/.github/workflows/pr-auto-response.yml index d883a81..e5f068e 100644 --- a/.github/workflows/pr-auto-response.yml +++ b/.github/workflows/pr-auto-response.yml @@ -16,6 +16,15 @@ jobs: (github.event_name == 'pull_request_target' && (github.event.action == 'labeled' || github.event.action == 'unlabeled')) runs-on: ubuntu-latest + permissions: + contents: read + issues: write + pull-requests: write + steps: + - name: Checkout repository + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Apply contributor tier label for issue author uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 env: LABEL_POLICY_PATH: .github/label-policy.json diff --git a/.github/workflows/pr-check-stale.yml b/.github/workflows/pr-check-stale.yml index 6048349..a2cf24c 100644 --- a/.github/workflows/pr-check-stale.yml +++ b/.github/workflows/pr-check-stale.yml @@ -13,6 +13,8 @@ jobs: issues: write pull-requests: write runs-on: ubuntu-latest + steps: + - name: Mark stale issues and pull requests uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/pr-check-status.yml b/.github/workflows/pr-check-status.yml index e53bab4..b057e88 100644 --- a/.github/workflows/pr-check-status.yml +++ b/.github/workflows/pr-check-status.yml @@ -13,13 +13,13 @@ concurrency: jobs: nudge-stale-prs: - runs-on: blacksmith-2vcpu-ubuntu-2404 + runs-on: ubuntu-latest permissions: contents: read pull-requests: write issues: write env: - STALE_HOURS: "4" + STALE_HOURS: "48" steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 diff --git a/.github/workflows/pr-intake-checks.yml b/.github/workflows/pr-intake-checks.yml index 6997300..e703387 100644 --- a/.github/workflows/pr-intake-checks.yml +++ b/.github/workflows/pr-intake-checks.yml @@ -17,6 +17,12 @@ jobs: intake: name: Intake Checks runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout repository + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Run safe PR intake checks uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 with: script: | From 3abadc45744ca77a9f43f3f5f72956f392fd473a Mon Sep 17 00:00:00 2001 From: Alex Gorevski Date: Wed, 18 Feb 2026 21:30:09 -0800 Subject: [PATCH 4/4] remove cost optimization analysis doc --- docs/ci-cost-optimization.md | 295 ----------------------------------- 1 file changed, 295 deletions(-) delete mode 100644 docs/ci-cost-optimization.md diff --git a/docs/ci-cost-optimization.md b/docs/ci-cost-optimization.md deleted file mode 100644 index 2485483..0000000 --- a/docs/ci-cost-optimization.md +++ /dev/null @@ -1,295 +0,0 @@ -# CI Cost Optimization — February 2026 - -> **Date:** 2026-02-18 -> **Status:** Implemented -> **Impact:** ~60-65% reduction in estimated monthly GitHub Actions billable minutes - ---- - -## Executive Summary - -On February 17, 2026, the ZeroClaw repository consumed **400+ workflow runs** in a single day, totaling an estimated **398 billable minutes** (~6.6 hours). At this rate, monthly costs were projected at **~200 hours/month** (~12,000 billable minutes). This document describes the analysis performed, optimizations implemented, and the revised CI/CD architecture. - ---- - -## Analysis Methodology - -A Python script (`scripts/ci/fetch_actions_data.py`) was created to programmatically fetch and analyze all GitHub Actions workflow runs from the GitHub API for February 17, 2026. The script: - -1. Fetched all completed workflow runs for the date via the GitHub REST API -2. Grouped runs by workflow name -3. Sampled job-level timing (up to 3 runs per workflow) to compute per-job durations -4. Extrapolated to estimate total billable minutes per workflow - -### Raw Data Summary (February 17, 2026) - -| Rank | Workflow | Runs/Day | Est. Minutes/Day | Primary Trigger | -|------|----------|----------|-------------------|-----------------| -| 1 | Rust Package Security Audit | 57 | 102 | Every PR + push | -| 2 | CI Run | 57 | 70 | Every PR + push | -| 3 | Performance Benchmarks | 15 | 63 | Every push to main | -| 4 | Docker | 20 | 63 | PR + push | -| 5 | PR Labeler | 69 | 20 | Every PR event | -| 6 | Feature Matrix | 3 | 19 | Push to main | -| 7 | Integration / E2E Tests | 15 | 17 | Every push to main | -| 8 | Workflow Sanity | 31 | 16 | Push + PR | -| 9 | Copilot Code Review | 6 | 14 | Dynamic | -| 10 | PR Intake Checks | 70 | 7 | Every PR event | -| 11 | PR Auto Responder | 47 | 4 | PR + issues | -| | **Total** | **400+** | **~398** | | - -### Key Findings - -- **15 pushes to main in ~2 hours** on Feb 17, each triggering 6-8 parallel workflows -- **Security Audit** was the single largest cost driver (102 min/day) with no path filtering -- **PR Auto Responder** had an **81% failure rate** (38/47 runs failing) — wasting runner time -- **CodeQL** runs twice daily (not captured in Feb 17 data since it's schedule-only) — adding ~3.5h/week -- **Benchmarks** ran on every push to main (15x in one day) despite being regression-focused -- **Dependabot** could generate up to 11 PRs/week, each triggering the full CI cascade - ---- - -## Changes Implemented - -### 1. Security Audit — Path Filters Added - -**File:** `.github/workflows/sec-audit.yml` - -**Before:** Ran on every PR and every push to main, regardless of what files changed. - -**After:** Only runs when dependency or source files change: -- `Cargo.toml`, `Cargo.lock`, `src/**`, `crates/**`, `deny.toml` - -**Weekly schedule retained** as a safety net for advisory database updates. - -**Estimated savings:** ~60-70% of security audit runs eliminated (~30-35 hours/month) - -### 2. Performance Benchmarks — Moved to Weekly Schedule - -**File:** `.github/workflows/test-benchmarks.yml` - -**Before:** Ran on every push to main (15x/day on Feb 17). - -**After:** Runs weekly (Monday 3am UTC) + on-demand via `workflow_dispatch`. - -**Artifact retention** reduced from 30 days to 7 days to lower storage costs. - -**Rationale:** Benchmark regressions don't need per-commit detection. Weekly cadence catches regressions within one development cycle. - -**Estimated savings:** ~90% reduction (~28 hours/month) - -### 3. Docker PR Smoke Builds — Tightened Path Filters - -**File:** `.github/workflows/pub-docker-img.yml` - -**Before:** PR smoke builds triggered on any change to `src/**`, `crates/**`, `benches/**`, `firmware/**`, etc. - -**After:** PR smoke builds only trigger on Docker-specific files: -- `Dockerfile`, `.dockerignore`, `docker-compose.yml`, `rust-toolchain.toml`, `dev/config.template.toml`, `.github/workflows/pub-docker-img.yml` - -**Push-to-main triggers unchanged** — production Docker images still rebuild on source changes. - -**Estimated savings:** ~40-50% fewer Docker smoke builds (~12-15 hours/month) - -### 4. CodeQL — Reduced from Twice-Daily to Weekly - -**File:** `.github/workflows/sec-codeql.yml` - -**Before:** Ran twice daily at 6am and 6pm UTC (14 runs/week), each performing a full `cargo build --workspace --all-targets`. - -**After:** Runs weekly (Monday 6am UTC) + on-demand. - -**Rationale:** CodeQL for Rust is still maturing. Weekly scans are standard practice for security-focused projects. On-demand dispatch available for urgent scans. - -**Estimated savings:** ~12 hours/month - -### 5. CI Run — Merged Lint Jobs + Dropped `--release` Build - -**File:** `.github/workflows/ci-run.yml` - -**Changes:** -1. **Merged `lint` and `lint-strict-delta` into a single job** — Previously these were two separate parallel jobs, each requiring a full runner spin-up, Rust toolchain install, and cache restore. Now they run sequentially in one job. -2. **Dropped `--release` flag from smoke build** — `cargo build --release` is 2-3x slower than debug due to optimizations. For a smoke check validating compilation, debug mode is equivalent. - -**Estimated savings:** ~1 runner job per CI invocation + faster build times - -### 6. Feature Matrix — Weekly-Only + Check-Only - -**File:** `.github/workflows/feature-matrix.yml` - -**Before:** Ran on every push to main touching `src/**` (3x on Feb 17) with 4 matrix entries, each running both `cargo check` AND `cargo test`. - -**After:** -1. **Removed push trigger** — Now weekly-only (Monday 4:30am UTC) + on-demand -2. **Removed `cargo test`** — Only runs `cargo check --locked` per feature combination. Tests are already covered by the main CI Run workflow. - -**Estimated savings:** ~50-75% of feature matrix compute eliminated - -### 7. Lightweight Jobs Moved to `ubuntu-latest` - -**Files affected:** -- `.github/workflows/pr-check-stale.yml` -- `.github/workflows/pr-check-status.yml` -- `.github/workflows/pr-auto-response.yml` -- `.github/workflows/pr-intake-checks.yml` -- `.github/workflows/pr-labeler.yml` -- `.github/workflows/sync-contributors.yml` - -**Before:** All jobs used `blacksmith-2vcpu-ubuntu-2404` runners, even for lightweight API-only operations (labeling, stale checks, greetings). - -**After:** Moved to `ubuntu-latest` (GitHub-hosted runners). These jobs only make API calls and run JavaScript scripts — they don't need Rust toolchains or specialized runners. - -**Additional change:** `pr-check-status.yml` schedule reduced from every 12 hours to once daily (8:15am UTC). - -### 8. Dependabot — Reduced Frequency and PR Limits - -**File:** `.github/dependabot.yml` - -**Before:** -- Cargo: weekly, 5 open PRs max -- GitHub Actions: weekly, 3 open PRs max -- Docker: weekly, 3 open PRs max -- Total: up to 11 Dependabot PRs/week, each triggering full CI - -**After:** -- Cargo: **monthly**, 3 open PRs max, all deps grouped into single PR -- GitHub Actions: **monthly**, 1 open PR max, all grouped -- Docker: **monthly**, 1 open PR max, all grouped -- Total: up to 5 Dependabot PRs/month - -**Rationale:** Each Dependabot PR triggers the full CI pipeline. Reducing from weekly to monthly and grouping updates into fewer PRs dramatically reduces CI cascade costs while still keeping dependencies current. - ---- - -## Known Issues to Investigate - -### PR Auto Responder — 81% Failure Rate - -The `pr-auto-response.yml` workflow had 38 failures out of 47 runs on Feb 17. The `contributor-tier-issues` job fires on every issue `labeled`/`unlabeled` event, even when the label is not contributor-tier related. While the JavaScript handler exits early for non-tier labels, the runner still spins up and checks out the repository. - -**Recommendations for further investigation:** -1. Add more specific event filtering at the workflow level to reduce unnecessary runs -2. Check if the failures are related to GitHub API rate limiting on the search endpoint -3. Consider whether `continue-on-error: true` should be added to non-critical jobs - ---- - -## Revised Workflow Architecture - -### Workflow Frequency Overview - -| Workflow | Trigger | Runner | -|----------|---------|--------| -| **CI Run** | Push to main + PR | Blacksmith | -| **Sec Audit** | Push/PR (path-filtered) + weekly schedule | Blacksmith | -| **Sec CodeQL** | Weekly schedule | Blacksmith | -| **Test E2E** | Push to main | Blacksmith | -| **Test Benchmarks** | Weekly schedule | Blacksmith | -| **Test Fuzz** | Weekly schedule | Blacksmith | -| **Feature Matrix** | Weekly schedule | Blacksmith | -| **Docker Publish** | Push to main (broad paths) + PR (Docker-only paths) | Blacksmith | -| **Release** | Tag push only | GitHub-hosted | -| **Workflow Sanity** | Push/PR (workflow paths only) | Blacksmith | -| **Label Policy** | Push/PR (policy paths only) | Blacksmith | -| **PR Labeler** | PR events | **ubuntu-latest** | -| **PR Intake Checks** | PR events | **ubuntu-latest** | -| **PR Auto Responder** | PR + issue events | **ubuntu-latest** | -| **PR Check Stale** | Daily schedule | **ubuntu-latest** | -| **PR Check Status** | Daily schedule | **ubuntu-latest** | -| **Sync Contributors** | Weekly schedule | **ubuntu-latest** | - -### Weekly Schedule Summary - -| Day | Time (UTC) | Workflow | -|-----|-----------|----------| -| Monday | 03:00 | Test Benchmarks | -| Monday | 04:30 | Feature Matrix | -| Monday | 06:00 | Sec Audit (schedule) | -| Monday | 06:00 | Sec CodeQL | -| Sunday | 00:00 | Sync Contributors | -| Sunday | 02:00 | Test Fuzz | -| Daily | 02:20 | PR Check Stale | -| Daily | 08:15 | PR Check Status | - -### CI Run Job Dependency Graph - -``` -changes ──┬── lint (Format + Clippy + Strict Delta) - │ └── test - ├── build (Smoke, debug mode) - ├── docs-only (fast path) - ├── non-rust (fast path) - ├── docs-quality - └── workflow-owner-approval - -All above ──── ci-required (final gate) -``` - -### Push-to-Main Trigger Cascade - -When code is pushed to `main`, the following workflows trigger: - -1. **CI Run** — Always (change-detection gates individual jobs) -2. **Sec Audit** — Only if `Cargo.toml`, `Cargo.lock`, `src/**`, `crates/**`, or `deny.toml` changed -3. **Test E2E** — Always -4. **Docker Publish** — Only if broad source paths changed -5. **Workflow Sanity** — Only if workflow files changed - -**No longer triggered on push:** -- ~~Performance Benchmarks~~ → Weekly only -- ~~Feature Matrix~~ → Weekly only - ---- - -## Estimated Impact - -| Metric | Before | After | Savings | -|--------|--------|-------|---------| -| Daily workflow runs | 400+ | ~150-180 | ~55-60% | -| Daily billable minutes | ~400 min | ~120-150 min | ~60-65% | -| Monthly billable hours | ~200 hours | ~60-75 hours | ~60-65% | -| Dependabot PRs/month | ~44 | ~5 | ~89% | -| CodeQL runs/week | 14 | 1 | ~93% | -| Benchmark runs/day | ~15 | 0 (weekly: ~1) | ~99% | - ---- - -## Rollback Strategy - -Each change is isolated to a single workflow file. To rollback any specific optimization: - -1. **Revert the specific file** using `git checkout ^ -- ` -2. Changes are backward-compatible — no downstream code or configuration depends on the CI schedule/trigger changes -3. All workflows retain `workflow_dispatch` triggers for manual invocation when needed - ---- - -## Validation Checklist - -- [ ] Verify CI Run workflow passes on next PR with Rust changes -- [ ] Verify Security Audit skips docs-only PRs -- [ ] Verify Docker smoke build only triggers on Dockerfile changes in PRs -- [ ] Verify weekly schedules fire correctly (check after first Monday) -- [ ] Monitor PR Auto Responder failure rate after switching to `ubuntu-latest` -- [ ] Verify Dependabot respects new monthly schedule and limits - ---- - -## Files Modified - -| File | Change Summary | -|------|---------------| -| `.github/workflows/sec-audit.yml` | Added path filters for push and PR triggers | -| `.github/workflows/test-benchmarks.yml` | Changed to weekly schedule; reduced artifact retention to 7 days | -| `.github/workflows/pub-docker-img.yml` | Tightened PR path filters to Docker-specific files | -| `.github/workflows/sec-codeql.yml` | Changed from twice-daily to weekly schedule | -| `.github/workflows/ci-run.yml` | Merged lint jobs; dropped `--release` from smoke build | -| `.github/workflows/feature-matrix.yml` | Removed push trigger; removed `cargo test` step | -| `.github/workflows/pr-check-stale.yml` | Switched to `ubuntu-latest` | -| `.github/workflows/pr-check-status.yml` | Switched to `ubuntu-latest`; reduced to daily schedule | -| `.github/workflows/pr-auto-response.yml` | Switched all jobs to `ubuntu-latest` | -| `.github/workflows/pr-intake-checks.yml` | Switched to `ubuntu-latest` | -| `.github/workflows/pr-labeler.yml` | Switched to `ubuntu-latest` | -| `.github/workflows/sync-contributors.yml` | Switched to `ubuntu-latest` | -| `.github/dependabot.yml` | Changed to monthly schedule; reduced PR limits; grouped all deps | -| `scripts/ci/fetch_actions_data.py` | New: cost analysis script for GitHub Actions runs |