fix(cron): add timeout and bounded execution for due jobs
This commit is contained in:
parent
5f5cb27690
commit
7de052c7d2
2 changed files with 172 additions and 26 deletions
|
|
@ -9,15 +9,21 @@ use crate::cron::{
|
|||
use crate::security::SecurityPolicy;
|
||||
use anyhow::Result;
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures_util::{stream, StreamExt};
|
||||
use std::sync::Arc;
|
||||
use tokio::process::Command;
|
||||
use tokio::time::{self, Duration};
|
||||
|
||||
const MIN_POLL_SECONDS: u64 = 5;
|
||||
const SHELL_JOB_TIMEOUT_SECS: u64 = 120;
|
||||
|
||||
pub async fn run(config: Config) -> Result<()> {
|
||||
let poll_secs = config.reliability.scheduler_poll_secs.max(MIN_POLL_SECONDS);
|
||||
let mut interval = time::interval(Duration::from_secs(poll_secs));
|
||||
let security = SecurityPolicy::from_config(&config.autonomy, &config.workspace_dir);
|
||||
let security = Arc::new(SecurityPolicy::from_config(
|
||||
&config.autonomy,
|
||||
&config.workspace_dir,
|
||||
));
|
||||
|
||||
crate::health::mark_component_ok("scheduler");
|
||||
|
||||
|
|
@ -33,20 +39,7 @@ pub async fn run(config: Config) -> Result<()> {
|
|||
}
|
||||
};
|
||||
|
||||
for job in jobs {
|
||||
crate::health::mark_component_ok("scheduler");
|
||||
warn_if_high_frequency_agent_job(&job);
|
||||
|
||||
let started_at = Utc::now();
|
||||
let (success, output) = execute_job_with_retry(&config, &security, &job).await;
|
||||
let finished_at = Utc::now();
|
||||
let success =
|
||||
persist_job_result(&config, &job, success, &output, started_at, finished_at).await;
|
||||
|
||||
if !success {
|
||||
crate::health::mark_component_error("scheduler", format!("job {} failed", job.id));
|
||||
}
|
||||
}
|
||||
process_due_jobs(&config, &security, jobs).await;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -90,6 +83,38 @@ async fn execute_job_with_retry(
|
|||
(false, last_output)
|
||||
}
|
||||
|
||||
async fn process_due_jobs(config: &Config, security: &Arc<SecurityPolicy>, jobs: Vec<CronJob>) {
|
||||
let max_concurrent = config.scheduler.max_concurrent.max(1);
|
||||
let mut in_flight = stream::iter(jobs.into_iter().map(|job| {
|
||||
let config = config.clone();
|
||||
let security = Arc::clone(security);
|
||||
async move { execute_and_persist_job(&config, security.as_ref(), &job).await }
|
||||
}))
|
||||
.buffer_unordered(max_concurrent);
|
||||
|
||||
while let Some((job_id, success)) = in_flight.next().await {
|
||||
if !success {
|
||||
crate::health::mark_component_error("scheduler", format!("job {job_id} failed"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn execute_and_persist_job(
|
||||
config: &Config,
|
||||
security: &SecurityPolicy,
|
||||
job: &CronJob,
|
||||
) -> (String, bool) {
|
||||
crate::health::mark_component_ok("scheduler");
|
||||
warn_if_high_frequency_agent_job(job);
|
||||
|
||||
let started_at = Utc::now();
|
||||
let (success, output) = execute_job_with_retry(config, security, job).await;
|
||||
let finished_at = Utc::now();
|
||||
let success = persist_job_result(config, job, success, &output, started_at, finished_at).await;
|
||||
|
||||
(job.id.clone(), success)
|
||||
}
|
||||
|
||||
async fn run_agent_job(config: &Config, job: &CronJob) -> (bool, String) {
|
||||
let name = job.name.clone().unwrap_or_else(|| "cron-job".to_string());
|
||||
let prompt = job.prompt.clone().unwrap_or_default();
|
||||
|
|
@ -346,6 +371,21 @@ async fn run_job_command(
|
|||
config: &Config,
|
||||
security: &SecurityPolicy,
|
||||
job: &CronJob,
|
||||
) -> (bool, String) {
|
||||
run_job_command_with_timeout(
|
||||
config,
|
||||
security,
|
||||
job,
|
||||
Duration::from_secs(SHELL_JOB_TIMEOUT_SECS),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn run_job_command_with_timeout(
|
||||
config: &Config,
|
||||
security: &SecurityPolicy,
|
||||
job: &CronJob,
|
||||
timeout: Duration,
|
||||
) -> (bool, String) {
|
||||
if !security.can_act() {
|
||||
return (
|
||||
|
|
@ -385,15 +425,19 @@ async fn run_job_command(
|
|||
);
|
||||
}
|
||||
|
||||
let output = Command::new("sh")
|
||||
let child = match Command::new("sh")
|
||||
.arg("-lc")
|
||||
.arg(&job.command)
|
||||
.current_dir(&config.workspace_dir)
|
||||
.output()
|
||||
.await;
|
||||
.kill_on_drop(true)
|
||||
.spawn()
|
||||
{
|
||||
Ok(child) => child,
|
||||
Err(e) => return (false, format!("spawn error: {e}")),
|
||||
};
|
||||
|
||||
match output {
|
||||
Ok(output) => {
|
||||
match time::timeout(timeout, child.wait_with_output()).await {
|
||||
Ok(Ok(output)) => {
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
let combined = format!(
|
||||
|
|
@ -404,7 +448,11 @@ async fn run_job_command(
|
|||
);
|
||||
(output.status.success(), combined)
|
||||
}
|
||||
Err(e) => (false, format!("spawn error: {e}")),
|
||||
Ok(Err(e)) => (false, format!("spawn error: {e}")),
|
||||
Err(_) => (
|
||||
false,
|
||||
format!("job timed out after {}s", timeout.as_secs_f64()),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -478,6 +526,20 @@ mod tests {
|
|||
assert!(output.contains("status=exit status:"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn run_job_command_times_out() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let mut config = test_config(&tmp);
|
||||
config.autonomy.allowed_commands = vec!["sleep".into()];
|
||||
let job = test_job("sleep 1");
|
||||
let security = SecurityPolicy::from_config(&config.autonomy, &config.workspace_dir);
|
||||
|
||||
let (success, output) =
|
||||
run_job_command_with_timeout(&config, &security, &job, Duration::from_millis(50)).await;
|
||||
assert!(!success);
|
||||
assert!(output.contains("job timed out after"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn run_job_command_blocks_disallowed_command() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue