diff --git a/config.example.toml b/config.example.toml index cc755ac..c830ec9 100644 --- a/config.example.toml +++ b/config.example.toml @@ -5,6 +5,25 @@ # Working directory (default: current directory) # working_dir = "/path/to/project" +# Cache keep-alive interval in seconds (default: 0 = disabled). +# Sends a minimal API request to keep Anthropic's prompt cache warm during +# idle periods, preventing expensive cache re-creation when you return to +# a session after being away. +# +# How it works: +# - Anthropic caches your conversation prefix (system prompt + message history) +# - Default cache TTL is 5 minutes; unused caches expire and must be re-written +# - Keep-alive sends a 1-token request using your existing prefix to refresh the TTL +# - Cache reads cost 10% of normal input pricing (e.g., $0.50/MTok vs $5 for Opus) +# - Without keep-alive, returning after 5+ min idle costs full cache write (1.25x input) +# +# Recommended values: +# 240 = ping every 4 min (safe margin for 5-min default TTL) +# 3300 = ping every 55 min (if using 1-hour extended TTL) +# 0 = disabled (default) +# +# cache_keepalive_secs = 240 + [agents.foreground] # Model to use (default: claude-opus-4-6) model = "claude-opus-4-6" diff --git a/src/app.rs b/src/app.rs index 916ef21..844a857 100644 --- a/src/app.rs +++ b/src/app.rs @@ -361,6 +361,12 @@ impl App { self.chat.render(&mut self.terminal); self.draw(); + // Cache keep-alive timer: check every 30s if a keep-alive ping is needed. + // The actual interval is configured via `cache_keepalive_secs` in config; + // this tick rate just controls how often we check. + let mut keepalive_tick = tokio::time::interval(Duration::from_secs(30)); + keepalive_tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // CANCEL-SAFETY: When one branch of tokio::select! completes, all other // futures are dropped (not paused). 
Any async fn polled here must store // its state on `self`, not in local variables, so it can resume correctly @@ -398,6 +404,29 @@ impl App { self.draw(); } } + // Cache keep-alive: send minimal ping to refresh Anthropic prompt cache TTL + _ = keepalive_tick.tick() => { + if let Some(agent_mutex) = self.agents.primary() { + let mut agent = agent_mutex.lock().await; + if agent.needs_cache_keepalive() { + match agent.send_cache_keepalive().await { + Ok(usage) => { + if usage.cache_read_tokens > 0 { + tracing::info!( + "Cache keep-alive refreshed {} cached tokens", + usage.cache_read_tokens + ); + } else { + tracing::debug!("Cache keep-alive sent (no cached tokens in response)"); + } + } + Err(e) => { + tracing::debug!("Cache keep-alive failed: {}", e); + } + } + } + } + } } if self.should_quit { diff --git a/src/config.rs b/src/config.rs index 22a02a5..7989453 100644 --- a/src/config.rs +++ b/src/config.rs @@ -50,6 +50,7 @@ pub const CORRECTIONS_FILENAME: &str = "corrections.json"; /// max_retries: 5, /// compaction_thinking_budget: 8_000, /// fast_mode: false, +/// cache_keepalive_secs: 0, /// }; /// ``` #[derive(Debug, Clone)] @@ -62,6 +63,10 @@ pub struct AgentRuntimeConfig { /// Enable fast mode (research preview) for lower-latency responses. /// Only effective with opus-4-6 models. pub fast_mode: bool, + /// Interval in seconds for cache keep-alive pings. 0 = disabled. + /// Sends a minimal API request to refresh Anthropic's prompt cache TTL + /// before it expires, avoiding expensive cache re-creation on idle sessions. 
+ pub cache_keepalive_secs: u64, } impl Default for AgentRuntimeConfig { @@ -73,6 +78,7 @@ impl Default for AgentRuntimeConfig { max_retries: 5, compaction_thinking_budget: 8_000, fast_mode: false, + cache_keepalive_secs: 0, } } } @@ -92,6 +98,7 @@ impl AgentRuntimeConfig { max_retries: config.general.max_retries, compaction_thinking_budget: config.general.compaction_thinking_budget, fast_mode: config.agents.foreground.fast_mode, + cache_keepalive_secs: config.general.cache_keepalive_secs, } } @@ -104,6 +111,7 @@ impl AgentRuntimeConfig { max_retries: config.general.max_retries, compaction_thinking_budget: config.general.compaction_thinking_budget, fast_mode: config.agents.background.fast_mode, + cache_keepalive_secs: config.general.cache_keepalive_secs, } } } @@ -225,6 +233,10 @@ pub struct GeneralConfig { pub compaction_threshold: u32, /// Thinking budget for compaction requests (default: 8,000) pub compaction_thinking_budget: u32, + /// Interval in seconds for cache keep-alive pings. 0 = disabled (default). + /// Sends a minimal API request to keep Anthropic's prompt cache warm during + /// idle periods. Recommended: 240 (4 min) for 5-min TTL, 3300 (55 min) for 1-hour TTL. + pub cache_keepalive_secs: u64, } #[cfg(feature = "cli")] @@ -235,6 +247,7 @@ impl Default for GeneralConfig { max_retries: 5, compaction_threshold: 192_000, compaction_thinking_budget: 8_000, + cache_keepalive_secs: 0, } } } diff --git a/src/llm/agent.rs b/src/llm/agent.rs index 819ad1e..b1919e8 100644 --- a/src/llm/agent.rs +++ b/src/llm/agent.rs @@ -200,6 +200,9 @@ pub struct Agent { /// Retry attempt counter, persists across calls to exec_chat_with_retry. /// Reset on successful request or new user message. retry_attempt: u32, + + /// Timestamp of the last successful API request (for cache keep-alive scheduling). 
+ last_request_time: Option<Instant>, } impl Agent { @@ -233,6 +236,7 @@ impl Agent { fast_mode_cooldown_until: None, retry_attempt: 0, + last_request_time: None, } } @@ -270,6 +274,7 @@ impl Agent { fast_mode_cooldown_until: None, retry_attempt: 0, + last_request_time: None, } } @@ -400,6 +405,101 @@ impl Agent { self.active_stream = None; } + /// Returns true if a cache keep-alive ping is needed. + /// + /// Checks whether: keep-alive is enabled, the agent is idle (no active stream), + /// and enough time has elapsed since the last API request. + pub fn needs_cache_keepalive(&self) -> bool { + let interval = self.config.cache_keepalive_secs; + if interval == 0 { + return false; + } + // Don't send keep-alive if the agent is actively streaming + if self.state.is_some() { + return false; + } + // Need at least one prior request to have something cached + let Some(last) = self.last_request_time else { + return false; + }; + // Don't keep-alive on OpenRouter (cache is Anthropic-specific) + if is_openrouter_model(&self.config.model) { + return false; + } + last.elapsed() >= Duration::from_secs(interval) + } + + /// Send a minimal API request to keep the Anthropic prompt cache warm. + /// + /// Uses the existing message history as the cache prefix, appends a + /// disposable keep-alive message, sends with max_tokens=1 and no tools, + /// then discards the response. The keep-alive message is NOT added to + /// the agent's conversation history. 
+ pub async fn send_cache_keepalive(&mut self) -> Result<Usage> { + info!("Sending cache keep-alive ping"); + + // Build messages: clone current history + cache_control on last real message + let mut messages = self.messages.clone(); + if let Some(last_msg) = messages.last_mut() { + last_msg.options = Some(CacheControl::Ephemeral.into()); + } + // Append a disposable keep-alive user message (not stored in self.messages) + messages.push(ChatMessage::user("[cache-keepalive] Respond with: ack")); + + let request = ChatRequest::new(messages); + + // Build minimal headers (same auth as normal requests) + let headers = if let Some(ref oauth) = self.oauth { + Headers::from([ + ( + "authorization".to_string(), + format!("Bearer {}", oauth.access_token), + ), + ("anthropic-beta".to_string(), ANTHROPIC_BETA_HEADER.to_string()), + ("user-agent".to_string(), ANTHROPIC_USER_AGENT.to_string()), + ]) + } else { + Headers::from([("anthropic-beta".to_string(), "interleaved-thinking-2025-05-14".to_string())]) + }; + + // Minimal options: 1 output token, no tools, no thinking + let chat_options = ChatOptions::default() + .with_max_tokens(1) + .with_capture_usage(true) + .with_extra_headers(headers); + + match self + .client + .exec_chat_stream(&self.config.model, request, Some(&chat_options)) + .await + { + Ok(resp) => { + // Drain the stream to complete the request + let mut stream = Box::pin(resp.stream); + let mut turn_usage = Usage::default(); + while let Some(event) = stream.next().await { + if let Ok(ChatStreamEvent::End(end)) = event { + if let Some(ref genai_usage) = end.captured_usage { + turn_usage = Self::extract_turn_usage(genai_usage); + } + } + } + self.last_request_time = Some(Instant::now()); + info!( + "Cache keep-alive complete: {} cached tokens refreshed", + turn_usage.cache_read_tokens + ); + Ok(turn_usage) + } + Err(e) => { + warn!("Cache keep-alive failed: {:#}", e); + // Still update timestamp to avoid hammering on repeated failures + self.last_request_time = 
Some(Instant::now()); + Err(anyhow::anyhow!("Cache keep-alive failed: {:#}", e)) + } + } + } + /// Refresh OAuth token if expired. Returns true if refresh was needed and succeeded. #[allow(dead_code)] pub async fn refresh_oauth_if_needed(&mut self) -> Result<bool> { @@ -626,6 +726,7 @@ impl Agent { Ok(resp) => { info!("Chat request successful"); self.retry_attempt = 0; + self.last_request_time = Some(Instant::now()); Ok(resp) }, Err(e) => {