fix(cron): add timeout and bounded execution for due jobs

This commit is contained in:
fettpl 2026-02-17 23:20:32 +01:00 committed by Chummy
parent 5f5cb27690
commit 7de052c7d2
2 changed files with 172 additions and 26 deletions

View file

@ -9,15 +9,21 @@ use crate::cron::{
use crate::security::SecurityPolicy;
use anyhow::Result;
use chrono::{DateTime, Utc};
use futures_util::{stream, StreamExt};
use std::sync::Arc;
use tokio::process::Command;
use tokio::time::{self, Duration};
const MIN_POLL_SECONDS: u64 = 5;
const SHELL_JOB_TIMEOUT_SECS: u64 = 120;
pub async fn run(config: Config) -> Result<()> {
let poll_secs = config.reliability.scheduler_poll_secs.max(MIN_POLL_SECONDS);
let mut interval = time::interval(Duration::from_secs(poll_secs));
let security = SecurityPolicy::from_config(&config.autonomy, &config.workspace_dir);
let security = Arc::new(SecurityPolicy::from_config(
&config.autonomy,
&config.workspace_dir,
));
crate::health::mark_component_ok("scheduler");
@ -33,20 +39,7 @@ pub async fn run(config: Config) -> Result<()> {
}
};
for job in jobs {
crate::health::mark_component_ok("scheduler");
warn_if_high_frequency_agent_job(&job);
let started_at = Utc::now();
let (success, output) = execute_job_with_retry(&config, &security, &job).await;
let finished_at = Utc::now();
let success =
persist_job_result(&config, &job, success, &output, started_at, finished_at).await;
if !success {
crate::health::mark_component_error("scheduler", format!("job {} failed", job.id));
}
}
process_due_jobs(&config, &security, jobs).await;
}
}
@ -90,6 +83,38 @@ async fn execute_job_with_retry(
(false, last_output)
}
/// Runs every due job through `execute_and_persist_job` with bounded concurrency.
///
/// At most `config.scheduler.max_concurrent` jobs (never fewer than one) are
/// polled at the same time. Results are consumed as the individual jobs
/// finish, and every job that reports failure marks the `"scheduler"` health
/// component as errored.
async fn process_due_jobs(config: &Config, security: &Arc<SecurityPolicy>, jobs: Vec<CronJob>) {
    let concurrency_limit = config.scheduler.max_concurrent.max(1);

    // Each future owns its own `Config` clone and `Arc` handle, so it does not
    // borrow from this function's arguments while it is buffered.
    let pending = jobs.into_iter().map(|job| {
        let job_config = config.clone();
        let job_security = Arc::clone(security);
        async move { execute_and_persist_job(&job_config, job_security.as_ref(), &job).await }
    });

    let mut completed = stream::iter(pending).buffer_unordered(concurrency_limit);
    while let Some((job_id, success)) = completed.next().await {
        if success {
            continue;
        }
        crate::health::mark_component_error("scheduler", format!("job {job_id} failed"));
    }
}
/// Executes a single job (with retries) and persists its result.
///
/// Marks the `"scheduler"` health component as OK and emits the
/// high-frequency agent-job warning check before running. The wall-clock
/// start and finish times captured around the run are handed to
/// `persist_job_result` together with the run's outcome and output.
///
/// Returns the job's id paired with the success flag produced by
/// `persist_job_result`.
async fn execute_and_persist_job(
    config: &Config,
    security: &SecurityPolicy,
    job: &CronJob,
) -> (String, bool) {
    crate::health::mark_component_ok("scheduler");
    warn_if_high_frequency_agent_job(job);

    let started_at = Utc::now();
    let (run_ok, run_output) = execute_job_with_retry(config, security, job).await;
    let finished_at = Utc::now();

    // The flag reported to the caller is the one returned by persistence,
    // not the raw execution outcome.
    let persisted_ok =
        persist_job_result(config, job, run_ok, &run_output, started_at, finished_at).await;

    (job.id.clone(), persisted_ok)
}
async fn run_agent_job(config: &Config, job: &CronJob) -> (bool, String) {
let name = job.name.clone().unwrap_or_else(|| "cron-job".to_string());
let prompt = job.prompt.clone().unwrap_or_default();
@ -346,6 +371,21 @@ async fn run_job_command(
config: &Config,
security: &SecurityPolicy,
job: &CronJob,
) -> (bool, String) {
run_job_command_with_timeout(
config,
security,
job,
Duration::from_secs(SHELL_JOB_TIMEOUT_SECS),
)
.await
}
async fn run_job_command_with_timeout(
config: &Config,
security: &SecurityPolicy,
job: &CronJob,
timeout: Duration,
) -> (bool, String) {
if !security.can_act() {
return (
@ -385,15 +425,19 @@ async fn run_job_command(
);
}
let output = Command::new("sh")
let child = match Command::new("sh")
.arg("-lc")
.arg(&job.command)
.current_dir(&config.workspace_dir)
.output()
.await;
.kill_on_drop(true)
.spawn()
{
Ok(child) => child,
Err(e) => return (false, format!("spawn error: {e}")),
};
match output {
Ok(output) => {
match time::timeout(timeout, child.wait_with_output()).await {
Ok(Ok(output)) => {
let stdout = String::from_utf8_lossy(&output.stdout);
let stderr = String::from_utf8_lossy(&output.stderr);
let combined = format!(
@ -404,7 +448,11 @@ async fn run_job_command(
);
(output.status.success(), combined)
}
Err(e) => (false, format!("spawn error: {e}")),
Ok(Err(e)) => (false, format!("spawn error: {e}")),
Err(_) => (
false,
format!("job timed out after {}s", timeout.as_secs_f64()),
),
}
}
@ -478,6 +526,20 @@ mod tests {
assert!(output.contains("status=exit status:"));
}
/// A `sleep 1` shell job run with a 50 ms timeout must be reported as failed
/// with a "job timed out after" message.
#[tokio::test]
async fn run_job_command_times_out() {
    let workspace = TempDir::new().unwrap();
    let mut config = test_config(&workspace);
    // Allow `sleep` so the job reaches the spawn/timeout path.
    config.autonomy.allowed_commands = vec!["sleep".into()];
    let security = SecurityPolicy::from_config(&config.autonomy, &config.workspace_dir);
    let sleeping_job = test_job("sleep 1");
    let short_timeout = Duration::from_millis(50);

    let (succeeded, message) =
        run_job_command_with_timeout(&config, &security, &sleeping_job, short_timeout).await;

    assert!(!succeeded);
    assert!(message.contains("job timed out after"));
}
#[tokio::test]
async fn run_job_command_blocks_disallowed_command() {
let tmp = TempDir::new().unwrap();