ci: unify rust quality gate and add incremental docs/link checks

This commit is contained in:
Chummy 2026-02-17 14:37:17 +08:00
parent 8a6273b988
commit 6528613c8d
12 changed files with 514 additions and 47 deletions

View file

@@ -0,0 +1,178 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import os
import re
import subprocess
import sys
from pathlib import Path
DOC_PATH_RE = re.compile(r"\.mdx?$")
URL_RE = re.compile(r"https?://[^\s<>'\"]+")
INLINE_LINK_RE = re.compile(r"!?\[[^\]]*\]\(([^)]+)\)")
REF_LINK_RE = re.compile(r"^\s*\[[^\]]+\]:\s*(\S+)")
TRAILING_PUNCTUATION = ").,;:!?]}'\""
def run_git(args: list[str]) -> subprocess.CompletedProcess[str]:
return subprocess.run(["git", *args], check=False, capture_output=True, text=True)
def commit_exists(rev: str) -> bool:
if not rev:
return False
return run_git(["cat-file", "-e", f"{rev}^{{commit}}"]).returncode == 0
def normalize_docs_files(raw: str) -> list[str]:
if not raw:
return []
files: list[str] = []
for line in raw.splitlines():
path = line.strip()
if path:
files.append(path)
return files
def infer_base_sha(provided: str) -> str:
if commit_exists(provided):
return provided
if run_git(["rev-parse", "--verify", "origin/main"]).returncode != 0:
return ""
proc = run_git(["merge-base", "origin/main", "HEAD"])
candidate = proc.stdout.strip()
return candidate if commit_exists(candidate) else ""
def infer_docs_files(base_sha: str, provided: list[str]) -> list[str]:
if provided:
return provided
if not base_sha:
return []
diff = run_git(["diff", "--name-only", base_sha, "HEAD"])
files: list[str] = []
for line in diff.stdout.splitlines():
path = line.strip()
if not path:
continue
if DOC_PATH_RE.search(path) or path in {"LICENSE", ".github/pull_request_template.md"}:
files.append(path)
return files
def normalize_link_target(raw_target: str, source_path: str) -> str | None:
target = raw_target.strip()
if target.startswith("<") and target.endswith(">"):
target = target[1:-1].strip()
if not target:
return None
if " " in target:
target = target.split()[0].strip()
if not target or target.startswith("#"):
return None
lower = target.lower()
if lower.startswith(("mailto:", "tel:", "javascript:")):
return None
if target.startswith(("http://", "https://")):
return target.rstrip(TRAILING_PUNCTUATION)
path_without_fragment = target.split("#", 1)[0].split("?", 1)[0]
if not path_without_fragment:
return None
if path_without_fragment.startswith("/"):
resolved = path_without_fragment.lstrip("/")
else:
resolved = os.path.normpath(
os.path.join(os.path.dirname(source_path) or ".", path_without_fragment)
)
if not resolved or resolved == ".":
return None
return resolved
def extract_links(text: str, source_path: str) -> list[str]:
links: list[str] = []
for match in URL_RE.findall(text):
url = match.rstrip(TRAILING_PUNCTUATION)
if url:
links.append(url)
for match in INLINE_LINK_RE.findall(text):
normalized = normalize_link_target(match, source_path)
if normalized:
links.append(normalized)
ref_match = REF_LINK_RE.match(text)
if ref_match:
normalized = normalize_link_target(ref_match.group(1), source_path)
if normalized:
links.append(normalized)
return links
def added_lines_for_file(base_sha: str, path: str) -> list[str]:
if base_sha:
diff = run_git(["diff", "--unified=0", base_sha, "HEAD", "--", path])
lines: list[str] = []
for raw_line in diff.stdout.splitlines():
if raw_line.startswith("+++"):
continue
if raw_line.startswith("+"):
lines.append(raw_line[1:])
return lines
file_path = Path(path)
if not file_path.is_file():
return []
return file_path.read_text(encoding="utf-8", errors="ignore").splitlines()
def main() -> int:
parser = argparse.ArgumentParser(description="Collect HTTP(S) links added in changed docs lines")
parser.add_argument("--base", default="", help="Base commit SHA")
parser.add_argument(
"--docs-files",
default="",
help="Newline-separated docs files list",
)
parser.add_argument("--output", required=True, help="Output file for unique URLs")
args = parser.parse_args()
base_sha = infer_base_sha(args.base)
docs_files = infer_docs_files(base_sha, normalize_docs_files(args.docs_files))
existing_files = [path for path in docs_files if Path(path).is_file()]
if not existing_files:
Path(args.output).write_text("", encoding="utf-8")
print("No docs files available for link collection.")
return 0
unique_urls: list[str] = []
seen: set[str] = set()
for path in existing_files:
for line in added_lines_for_file(base_sha, path):
for link in extract_links(line, path):
if link not in seen:
seen.add(link)
unique_urls.append(link)
Path(args.output).write_text("\n".join(unique_urls) + ("\n" if unique_urls else ""), encoding="utf-8")
print(f"Collected {len(unique_urls)} added link(s) from {len(existing_files)} docs file(s).")
return 0
if __name__ == "__main__":
sys.exit(main())

28
scripts/ci/docs_links_gate.sh Executable file
View file

@@ -0,0 +1,28 @@
#!/usr/bin/env bash
set -euo pipefail
BASE_SHA="${BASE_SHA:-}"
DOCS_FILES_RAW="${DOCS_FILES:-}"
LINKS_FILE="$(mktemp)"
trap 'rm -f "$LINKS_FILE"' EXIT
python3 ./scripts/ci/collect_changed_links.py \
--base "$BASE_SHA" \
--docs-files "$DOCS_FILES_RAW" \
--output "$LINKS_FILE"
if [ ! -s "$LINKS_FILE" ]; then
echo "No added links detected in changed docs lines."
exit 0
fi
if ! command -v lychee >/dev/null 2>&1; then
echo "lychee is required to run docs link gate locally."
echo "Install via: cargo install lychee"
exit 1
fi
echo "Checking added links with lychee (offline mode)..."
lychee --offline --no-progress --format detailed "$LINKS_FILE"

181
scripts/ci/docs_quality_gate.sh Executable file
View file

@@ -0,0 +1,181 @@
#!/usr/bin/env bash
set -euo pipefail
BASE_SHA="${BASE_SHA:-}"
DOCS_FILES_RAW="${DOCS_FILES:-}"
if [ -z "$BASE_SHA" ] && git rev-parse --verify origin/main >/dev/null 2>&1; then
BASE_SHA="$(git merge-base origin/main HEAD)"
fi
if [ -z "$DOCS_FILES_RAW" ] && [ -n "$BASE_SHA" ] && git cat-file -e "$BASE_SHA^{commit}" 2>/dev/null; then
DOCS_FILES_RAW="$(git diff --name-only "$BASE_SHA" HEAD | awk '
/\.md$/ || /\.mdx$/ || $0 == "LICENSE" || $0 == ".github/pull_request_template.md" {
print
}
')"
fi
if [ -z "$DOCS_FILES_RAW" ]; then
echo "No docs files detected; skipping docs quality gate."
exit 0
fi
if [ -z "$BASE_SHA" ] || ! git cat-file -e "$BASE_SHA^{commit}" 2>/dev/null; then
echo "BASE_SHA is missing or invalid; falling back to full-file markdown lint."
BASE_SHA=""
fi
ALL_FILES=()
while IFS= read -r file; do
if [ -n "$file" ]; then
ALL_FILES+=("$file")
fi
done < <(printf '%s\n' "$DOCS_FILES_RAW")
if [ "${#ALL_FILES[@]}" -eq 0 ]; then
echo "No docs files detected after normalization; skipping docs quality gate."
exit 0
fi
EXISTING_FILES=()
for file in "${ALL_FILES[@]}"; do
if [ -f "$file" ]; then
EXISTING_FILES+=("$file")
fi
done
if [ "${#EXISTING_FILES[@]}" -eq 0 ]; then
echo "No existing docs files to lint; skipping docs quality gate."
exit 0
fi
if command -v npx >/dev/null 2>&1; then
MD_CMD=(npx --yes markdownlint-cli2@0.20.0)
elif command -v markdownlint-cli2 >/dev/null 2>&1; then
MD_CMD=(markdownlint-cli2)
else
echo "markdownlint-cli2 is required (via npx or local binary)."
exit 1
fi
echo "Linting docs files: ${EXISTING_FILES[*]}"
LINT_OUTPUT_FILE="$(mktemp)"
set +e
"${MD_CMD[@]}" "${EXISTING_FILES[@]}" >"$LINT_OUTPUT_FILE" 2>&1
LINT_EXIT=$?
set -e
if [ "$LINT_EXIT" -eq 0 ]; then
cat "$LINT_OUTPUT_FILE"
rm -f "$LINT_OUTPUT_FILE"
exit 0
fi
if [ -z "$BASE_SHA" ]; then
cat "$LINT_OUTPUT_FILE"
rm -f "$LINT_OUTPUT_FILE"
exit "$LINT_EXIT"
fi
CHANGED_LINES_JSON_FILE="$(mktemp)"
python3 - "$BASE_SHA" "${EXISTING_FILES[@]}" >"$CHANGED_LINES_JSON_FILE" <<'PY'
import json
import re
import subprocess
import sys
base = sys.argv[1]
files = sys.argv[2:]
changed = {}
hunk = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
for path in files:
proc = subprocess.run(
["git", "diff", "--unified=0", base, "HEAD", "--", path],
check=False,
capture_output=True,
text=True,
)
ranges = []
for line in proc.stdout.splitlines():
m = hunk.match(line)
if not m:
continue
start = int(m.group(1))
count = int(m.group(2) or "1")
if count > 0:
ranges.append([start, start + count - 1])
changed[path] = ranges
print(json.dumps(changed))
PY
FILTERED_OUTPUT_FILE="$(mktemp)"
set +e
python3 - "$LINT_OUTPUT_FILE" "$CHANGED_LINES_JSON_FILE" >"$FILTERED_OUTPUT_FILE" <<'PY'
import json
import re
import sys
lint_file = sys.argv[1]
changed_file = sys.argv[2]
with open(changed_file, "r", encoding="utf-8") as f:
changed = json.load(f)
line_re = re.compile(r"^(.+?):(\d+)\s+error\s+(MD\d+(?:/[^\s]+)?)\s+(.*)$")
blocking = []
baseline = []
other_lines = []
with open(lint_file, "r", encoding="utf-8") as f:
for raw_line in f:
line = raw_line.rstrip("\n")
m = line_re.match(line)
if not m:
other_lines.append(line)
continue
path, line_no_s, rule, msg = m.groups()
line_no = int(line_no_s)
ranges = changed.get(path, [])
is_changed_line = any(start <= line_no <= end for start, end in ranges)
entry = f"{path}:{line_no} {rule} {msg}"
if is_changed_line:
blocking.append(entry)
else:
baseline.append(entry)
if baseline:
print("Existing markdown issues outside changed lines (non-blocking):")
for entry in baseline:
print(f" - {entry}")
if blocking:
print("Markdown issues introduced on changed lines (blocking):")
for entry in blocking:
print(f" - {entry}")
print(f"Blocking markdown issues: {len(blocking)}")
sys.exit(1)
if baseline:
print("No blocking markdown issues on changed lines.")
sys.exit(0)
for line in other_lines:
print(line)
print("No blocking markdown issues on changed lines.")
PY
SCRIPT_EXIT=$?
set -e
cat "$FILTERED_OUTPUT_FILE"
rm -f "$LINT_OUTPUT_FILE" "$CHANGED_LINES_JSON_FILE" "$FILTERED_OUTPUT_FILE"
exit "$SCRIPT_EXIT"

19
scripts/ci/rust_quality_gate.sh Executable file
View file

@@ -0,0 +1,19 @@
#!/usr/bin/env bash
set -euo pipefail
MODE="correctness"
if [ "${1:-}" = "--strict" ]; then
MODE="strict"
fi
echo "==> rust quality: cargo fmt --all -- --check"
cargo fmt --all -- --check
if [ "$MODE" = "strict" ]; then
echo "==> rust quality: cargo clippy --locked --all-targets -- -D warnings"
cargo clippy --locked --all-targets -- -D warnings
else
echo "==> rust quality: cargo clippy --locked --all-targets -- -D clippy::correctness"
cargo clippy --locked --all-targets -- -D clippy::correctness
fi