feat: SkillForge — automated skill discovery, evaluation & integration engine (#144)

* feat: add SkillForge — automated skill discovery, evaluation, and integration engine

SkillForge adds a 3-stage pipeline for autonomous skill management:

- Scout: discovers candidate skills from GitHub (extensible to ClawHub, HuggingFace)
- Evaluate: scores candidates on compatibility, quality, and security (weighted 0.30/0.35/0.35)
- Integrate: generates standard SKILL.toml + SKILL.md manifests for approved candidates

Thresholds: >=0.7 auto-integrate, 0.4-0.7 manual review, <0.4 skip.
Uses only existing dependencies (reqwest, serde, tokio, tracing, chrono, anyhow).
Includes unit tests for all modules.

* fix: address code review feedback on SkillForge PR #115

- evaluate: whole-word matching for BAD_PATTERNS (fixes hackathon false positive)
- evaluate: guard against future timestamps in recency bonus
- integrate: escape URLs in TOML output via escape_toml()
- integrate: handle control chars (\n, \r, \t, \b, \f) in escape_toml()
- mod: redact github_token in Debug impl to prevent log leakage
- mod: fix auto_integrated count when auto_integrate=false
- mod: per-candidate error handling (single failure no longer aborts pipeline)
- scout: add 30s request timeout, remove unused token field
- deps: enable chrono serde feature for DateTime serialization
- tests: add hackathon/exact-hack tests, update escape_toml test coverage

* fix: address round-2 CodeRabbit review feedback

- integrate: add sanitize_path_component() to prevent directory traversal
- mod: GitHub scout failure now logs warning and continues (no pipeline abort)
- scout: network/parse errors per-query use warn+continue instead of ?
- scout: implement std::str::FromStr for ScoutSource (replaces custom from_str)
- tests: add path sanitization tests (traversal, separators, dot trimming)

---------

Co-authored-by: stawky <stakeswky@gmail.com>
This commit is contained in:
Argenis 2026-02-15 09:26:13 -05:00 committed by GitHub
parent 2ac571f406
commit 35b63d6b12
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 1098 additions and 1 deletions

261
src/skillforge/evaluate.rs Normal file
View file

@@ -0,0 +1,261 @@
//! Evaluator — scores discovered skill candidates across multiple dimensions.
use serde::{Deserialize, Serialize};
use super::scout::ScoutResult;
// ---------------------------------------------------------------------------
// Scoring dimensions
// ---------------------------------------------------------------------------
/// Per-dimension scores for a single skill candidate.
///
/// The three fields are combined into one weighted value by `Scores::total`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Scores {
/// OS / arch / runtime compatibility (0.0 to 1.0).
/// The evaluator in this file currently derives it from the repo's primary language.
pub compatibility: f64,
/// Code quality signals: stars, tests, docs (0.0 to 1.0).
/// The evaluator in this file currently derives it from the star count only.
pub quality: f64,
/// Security posture: license, known-bad patterns (0.0 to 1.0).
pub security: f64,
}
impl Scores {
    /// Collapses the three dimensions into a single weighted score.
    ///
    /// Weights are 0.30 for compatibility and 0.35 each for quality and
    /// security, so inputs in the 0.0 to 1.0 range yield a total in that range.
    pub fn total(&self) -> f64 {
        const COMPATIBILITY_WEIGHT: f64 = 0.30;
        const QUALITY_WEIGHT: f64 = 0.35;
        const SECURITY_WEIGHT: f64 = 0.35;

        let compatibility_part = self.compatibility * COMPATIBILITY_WEIGHT;
        let quality_part = self.quality * QUALITY_WEIGHT;
        let security_part = self.security * SECURITY_WEIGHT;

        // Summed left to right, compatibility first.
        compatibility_part + quality_part + security_part
    }
}
// ---------------------------------------------------------------------------
// Recommendation
// ---------------------------------------------------------------------------
/// What to do with an evaluated candidate, chosen from its weighted total score.
///
/// "threshold" below is the evaluator's configured minimum score for auto-integration.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Recommendation {
/// Score >= threshold → safe to auto-integrate.
Auto,
/// Score in [0.4, threshold) → needs human review.
Manual,
/// Score < 0.4 → skip entirely.
Skip,
}
// ---------------------------------------------------------------------------
// EvalResult
// ---------------------------------------------------------------------------
/// Outcome of evaluating one scouted candidate.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalResult {
/// The candidate that was evaluated (moved in by `Evaluator::evaluate`).
pub candidate: ScoutResult,
/// Individual dimension scores.
pub scores: Scores,
/// Weighted total of `scores`, stored at evaluation time.
pub total_score: f64,
/// Action derived from `total_score`.
pub recommendation: Recommendation,
}
// ---------------------------------------------------------------------------
// Evaluator
// ---------------------------------------------------------------------------
/// Scores scouted candidates and maps each total score to a `Recommendation`.
pub struct Evaluator {
/// Minimum total score for auto-integration.
min_score: f64,
}
/// Known-bad patterns in repo names / descriptions (matched as whole words).
///
/// Entries are lowercase: the security scorer lowercases the name and
/// description before matching, so any new entry must be lowercase as well.
const BAD_PATTERNS: &[&str] = &[
"malware",
"exploit",
"hack",
"crack",
"keygen",
"ransomware",
"trojan",
];
/// Reports whether `word` occurs in `haystack` as a standalone word.
///
/// An occurrence counts when the byte just before it and the byte just after
/// it are each either absent (start / end of the string) or not an ASCII
/// letter or digit. Matching is case-sensitive; callers lowercase beforehand.
fn contains_word(haystack: &str, word: &str) -> bool {
    let bytes = haystack.as_bytes();

    // A position is a boundary when it falls outside the string or holds a
    // byte that is not ASCII alphanumeric (non-ASCII bytes count as boundaries).
    let is_boundary = |pos: Option<usize>| {
        pos.and_then(|p| bytes.get(p))
            .map_or(true, |b| !b.is_ascii_alphanumeric())
    };

    haystack.match_indices(word).any(|(start, hit)| {
        is_boundary(start.checked_sub(1)) && is_boundary(Some(start + hit.len()))
    })
}
impl Evaluator {
    /// Creates an evaluator that recommends auto-integration for totals of at
    /// least `min_score`.
    pub fn new(min_score: f64) -> Self {
        Self { min_score }
    }

    /// Scores `candidate` on every dimension and derives a recommendation.
    ///
    /// The candidate is consumed and returned inside the [`EvalResult`] along
    /// with its per-dimension scores and weighted total.
    pub fn evaluate(&self, candidate: ScoutResult) -> EvalResult {
        let scores = Scores {
            compatibility: self.score_compatibility(&candidate),
            quality: self.score_quality(&candidate),
            security: self.score_security(&candidate),
        };
        let total_score = scores.total();

        // Checked top-down: the auto threshold wins over the fixed 0.4 review floor.
        let recommendation = match total_score {
            t if t >= self.min_score => Recommendation::Auto,
            t if t >= 0.4 => Recommendation::Manual,
            _ => Recommendation::Skip,
        };

        EvalResult {
            candidate,
            scores,
            total_score,
            recommendation,
        }
    }

    // -- Dimension scorers --------------------------------------------------

    /// Compatibility: Rust scores highest, a few common scripting languages
    /// sit in the middle, any other language is low, and a missing language
    /// is lowest.
    fn score_compatibility(&self, c: &ScoutResult) -> f64 {
        c.language.as_deref().map_or(0.2, |lang| match lang {
            "Rust" => 1.0,
            "Python" | "TypeScript" | "JavaScript" => 0.6,
            _ => 0.3,
        })
    }

    /// Quality: `log2(stars + 1) / 10`, capped at 1.0.
    fn score_quality(&self, c: &ScoutResult) -> f64 {
        let star_count = c.stars as f64;
        let scaled = (star_count + 1.0).log2() / 10.0;
        scaled.min(1.0)
    }

    /// Security: starts at 0.5, adds 0.3 for a license, subtracts 0.5 when a
    /// known-bad word appears in the name or description, adds 0.2 for a
    /// recent update, then clamps to 0.0..=1.0.
    fn score_security(&self, c: &ScoutResult) -> f64 {
        // Patterns are lowercase, so compare against lowercased text.
        let lower_name = c.name.to_lowercase();
        let lower_desc = c.description.to_lowercase();
        let flagged = BAD_PATTERNS
            .iter()
            .any(|pat| contains_word(&lower_name, pat) || contains_word(&lower_desc, pat));

        // Recent means an age of 0..180 whole days; a negative age (timestamp
        // in the future) falls outside the range and earns no bonus.
        let recently_updated = match c.updated_at {
            Some(updated) => {
                let age_days = (chrono::Utc::now() - updated).num_days();
                (0..180).contains(&age_days)
            }
            None => false,
        };

        // Adjustments are applied in a fixed order: license, penalty, recency.
        let mut score: f64 = 0.5;
        if c.has_license {
            score += 0.3;
        }
        if flagged {
            score -= 0.5;
        }
        if recently_updated {
            score += 0.2;
        }
        score.clamp(0.0, 1.0)
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    use crate::skillforge::scout::{ScoutResult, ScoutSource};

    /// Builds a GitHub candidate named "test-skill" that was updated just now,
    /// varying only the star count, language and license flag.
    fn make_candidate(stars: u64, lang: Option<&str>, has_license: bool) -> ScoutResult {
        ScoutResult {
            name: "test-skill".into(),
            url: "https://github.com/test/test-skill".into(),
            description: "A test skill".into(),
            stars,
            language: lang.map(String::from),
            updated_at: Some(chrono::Utc::now()),
            source: ScoutSource::GitHub,
            owner: "test".into(),
            has_license,
        }
    }

    #[test]
    fn high_quality_rust_repo_gets_auto() {
        let eval = Evaluator::new(0.7);
        let c = make_candidate(500, Some("Rust"), true);
        let res = eval.evaluate(c);
        assert!(res.total_score >= 0.7, "score: {}", res.total_score);
        assert_eq!(res.recommendation, Recommendation::Auto);
    }

    #[test]
    fn low_star_no_license_gets_manual_or_skip() {
        let eval = Evaluator::new(0.7);
        let c = make_candidate(1, None, false);
        let res = eval.evaluate(c);
        assert!(res.total_score < 0.7, "score: {}", res.total_score);
        assert_ne!(res.recommendation, Recommendation::Auto);
    }

    #[test]
    fn bad_pattern_tanks_security() {
        let eval = Evaluator::new(0.7);
        let mut c = make_candidate(1000, Some("Rust"), true);
        c.name = "malware-skill".into();
        let res = eval.evaluate(c);
        // 0.5 base + 0.3 license - 0.5 bad_pattern + 0.2 recency = 0.5.
        // The sum is built from inexact binary fractions, so allow a tiny
        // tolerance instead of relying on it rounding to exactly 0.5.
        assert!(
            res.scores.security <= 0.5 + 1e-9,
            "security: {}",
            res.scores.security
        );
    }

    #[test]
    fn scores_total_weighted() {
        let s = Scores {
            compatibility: 1.0,
            quality: 1.0,
            security: 1.0,
        };
        assert!((s.total() - 1.0).abs() < f64::EPSILON);
        let s2 = Scores {
            compatibility: 0.0,
            quality: 0.0,
            security: 0.0,
        };
        assert!((s2.total()).abs() < f64::EPSILON);
    }

    #[test]
    fn hackathon_not_flagged_as_bad() {
        let eval = Evaluator::new(0.7);
        let mut c = make_candidate(500, Some("Rust"), true);
        c.name = "hackathon-tools".into();
        c.description = "Tools for hackathons and lifehacks".into();
        let res = eval.evaluate(c);
        // "hack" should NOT match "hackathon" or "lifehacks".
        // Unflagged: 0.5 base + 0.3 license + 0.2 recency = 1.0.
        // A false positive would land at 0.5, so the bound must sit strictly
        // above 0.5 for this test to be able to fail.
        assert!(res.scores.security > 0.9, "security: {}", res.scores.security);
    }

    #[test]
    fn exact_hack_is_flagged() {
        let eval = Evaluator::new(0.7);
        let mut c = make_candidate(500, Some("Rust"), false);
        c.name = "hack-tool".into();
        c.updated_at = None;
        let res = eval.evaluate(c);
        // 0.5 base + 0.0 license - 0.5 bad_pattern + 0.0 recency = 0.0
        assert!(res.scores.security < 0.5, "security: {}", res.scores.security);
    }

    #[test]
    fn contains_word_respects_boundaries() {
        // Standalone occurrences, delimited by punctuation, spaces or string edges.
        assert!(contains_word("hack-tool", "hack"));
        assert!(contains_word("a hack", "hack"));
        assert!(contains_word("hack", "hack"));
        // Occurrences embedded in a longer alphanumeric run are ignored.
        assert!(!contains_word("hackathon", "hack"));
        assert!(!contains_word("lifehacks", "hack"));
        assert!(!contains_word("hack1", "hack"));
        assert!(!contains_word("", "hack"));
    }
}