Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions config.example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,25 @@
# Working directory (default: current directory)
# working_dir = "/path/to/project"

# Cache keep-alive interval in seconds (default: 0 = disabled).
# Sends a minimal API request to keep Anthropic's prompt cache warm during
# idle periods, preventing expensive cache re-creation when you return to
# a session after being away.
#
# How it works:
# - Anthropic caches your conversation prefix (system prompt + message history)
# - Default cache TTL is 5 minutes; unused caches expire and must be re-written
# - Keep-alive sends a 1-token request using your existing prefix to refresh the TTL
# - Cache reads cost 10% of normal input pricing (e.g., $0.50/MTok vs $5 for Opus)
# - Without keep-alive, returning after 5+ min idle costs full cache write (1.25x input)
#
# Recommended values:
# 240 = ping every 4 min (safe margin for 5-min default TTL)
# 3300 = ping every 55 min (if using 1-hour extended TTL)
# 0 = disabled (default)
#
# cache_keepalive_secs = 240

[agents.foreground]
# Model to use (default: claude-opus-4-6)
model = "claude-opus-4-6"
Expand Down
29 changes: 29 additions & 0 deletions src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,12 @@ impl App {
self.chat.render(&mut self.terminal);
self.draw();

// Cache keep-alive timer: check every 30s if a keep-alive ping is needed.
// The actual interval is configured via `cache_keepalive_secs` in config;
// this tick rate just controls how often we check.
let mut keepalive_tick = tokio::time::interval(Duration::from_secs(30));
keepalive_tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

// CANCEL-SAFETY: When one branch of tokio::select! completes, all other
// futures are dropped (not paused). Any async fn polled here must store
// its state on `self`, not in local variables, so it can resume correctly
Expand Down Expand Up @@ -398,6 +404,29 @@ impl App {
self.draw();
}
}
// Cache keep-alive: send minimal ping to refresh Anthropic prompt cache TTL
_ = keepalive_tick.tick() => {
if let Some(agent_mutex) = self.agents.primary() {
let mut agent = agent_mutex.lock().await;
if agent.needs_cache_keepalive() {
match agent.send_cache_keepalive().await {
Ok(usage) => {
if usage.cache_read_tokens > 0 {
tracing::info!(
"Cache keep-alive refreshed {} cached tokens",
usage.cache_read_tokens
);
} else {
tracing::debug!("Cache keep-alive sent (no cached tokens in response)");
}
}
Err(e) => {
tracing::debug!("Cache keep-alive failed: {}", e);
}
}
}
}
}
}

if self.should_quit {
Expand Down
13 changes: 13 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ pub const CORRECTIONS_FILENAME: &str = "corrections.json";
/// max_retries: 5,
/// compaction_thinking_budget: 8_000,
/// fast_mode: false,
/// cache_keepalive_secs: 0,
/// };
/// ```
#[derive(Debug, Clone)]
Expand All @@ -62,6 +63,10 @@ pub struct AgentRuntimeConfig {
/// Enable fast mode (research preview) for lower-latency responses.
/// Only effective with opus-4-6 models.
pub fast_mode: bool,
/// Interval in seconds for cache keep-alive pings. 0 = disabled.
/// Sends a minimal API request to refresh Anthropic's prompt cache TTL
/// before it expires, avoiding expensive cache re-creation on idle sessions.
pub cache_keepalive_secs: u64,
}

impl Default for AgentRuntimeConfig {
Expand All @@ -73,6 +78,7 @@ impl Default for AgentRuntimeConfig {
max_retries: 5,
compaction_thinking_budget: 8_000,
fast_mode: false,
cache_keepalive_secs: 0,
}
}
}
Expand All @@ -92,6 +98,7 @@ impl AgentRuntimeConfig {
max_retries: config.general.max_retries,
compaction_thinking_budget: config.general.compaction_thinking_budget,
fast_mode: config.agents.foreground.fast_mode,
cache_keepalive_secs: config.general.cache_keepalive_secs,
}
}

Expand All @@ -104,6 +111,7 @@ impl AgentRuntimeConfig {
max_retries: config.general.max_retries,
compaction_thinking_budget: config.general.compaction_thinking_budget,
fast_mode: config.agents.background.fast_mode,
cache_keepalive_secs: config.general.cache_keepalive_secs,
}
}
}
Expand Down Expand Up @@ -225,6 +233,10 @@ pub struct GeneralConfig {
pub compaction_threshold: u32,
/// Thinking budget for compaction requests (default: 8,000)
pub compaction_thinking_budget: u32,
/// Interval in seconds for cache keep-alive pings. 0 = disabled (default).
/// Sends a minimal API request to keep Anthropic's prompt cache warm during
/// idle periods. Recommended: 240 (4 min) for 5-min TTL, 3300 (55 min) for 1-hour TTL.
pub cache_keepalive_secs: u64,
}

#[cfg(feature = "cli")]
Expand All @@ -235,6 +247,7 @@ impl Default for GeneralConfig {
max_retries: 5,
compaction_threshold: 192_000,
compaction_thinking_budget: 8_000,
cache_keepalive_secs: 0,
}
}
}
Expand Down
101 changes: 101 additions & 0 deletions src/llm/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,9 @@ pub struct Agent {
/// Retry attempt counter, persists across calls to exec_chat_with_retry.
/// Reset on successful request or new user message.
retry_attempt: u32,

/// Timestamp of the last successful API request (for cache keep-alive scheduling).
last_request_time: Option<Instant>,
}

impl Agent {
Expand Down Expand Up @@ -233,6 +236,7 @@ impl Agent {

fast_mode_cooldown_until: None,
retry_attempt: 0,
last_request_time: None,
}
}

Expand Down Expand Up @@ -270,6 +274,7 @@ impl Agent {

fast_mode_cooldown_until: None,
retry_attempt: 0,
last_request_time: None,
}
}

Expand Down Expand Up @@ -400,6 +405,101 @@ impl Agent {
self.active_stream = None;
}

/// Returns true if a cache keep-alive ping is needed.
///
/// A ping is warranted only when all of the following hold:
/// - keep-alive is enabled (`cache_keepalive_secs` > 0),
/// - the agent is idle (no active streaming state),
/// - the configured model is not routed through OpenRouter
///   (prompt caching here is Anthropic-specific),
/// - at least one prior request exists (so there is a cached prefix), and
/// - the configured interval has elapsed since the last request.
pub fn needs_cache_keepalive(&self) -> bool {
    let secs = self.config.cache_keepalive_secs;
    // Feature disabled, or the agent is mid-stream: never ping.
    if secs == 0 || self.state.is_some() {
        return false;
    }
    // The Anthropic prompt cache does not apply to OpenRouter-routed models.
    if is_openrouter_model(&self.config.model) {
        return false;
    }
    // Without a prior request there is nothing cached to refresh.
    match self.last_request_time {
        Some(last) => last.elapsed() >= Duration::from_secs(secs),
        None => false,
    }
}

/// Send a minimal API request to keep the Anthropic prompt cache warm.
///
/// Uses the existing message history as the cache prefix, appends a
/// disposable keep-alive message, sends with max_tokens=1 and no tools,
/// then discards the response. The keep-alive message is NOT added to
/// the agent's conversation history.
///
/// Returns the provider-reported [`Usage`] for the ping on success.
/// On both success and failure, `last_request_time` is advanced so a
/// failing ping does not immediately retry on the next scheduler tick.
pub async fn send_cache_keepalive(&mut self) -> Result<Usage> {
    info!("Sending cache keep-alive ping");

    // Build messages: clone current history + cache_control on last real message.
    // Working on a clone keeps `self.messages` (the real history) untouched.
    let mut messages = self.messages.clone();
    if let Some(last_msg) = messages.last_mut() {
        // NOTE(review): this replaces any options already present on the cloned
        // last message — presumably cache_control is the only option that matters
        // for the ping; confirm against how normal requests set message options.
        last_msg.options = Some(CacheControl::Ephemeral.into());
    }
    // Append a disposable keep-alive user message (not stored in self.messages)
    messages.push(ChatMessage::user("[cache-keepalive] Respond with: ack"));

    let request = ChatRequest::new(messages);

    // Build minimal headers (same auth as normal requests)
    let headers = if let Some(ref oauth) = self.oauth {
        Headers::from([
            (
                "authorization".to_string(),
                format!("Bearer {}", oauth.access_token),
            ),
            ("anthropic-beta".to_string(), ANTHROPIC_BETA_HEADER.to_string()),
            ("user-agent".to_string(), ANTHROPIC_USER_AGENT.to_string()),
        ])
    } else {
        // API-key path: only a beta header is added here; the client supplies
        // its own auth. NOTE(review): this hard-coded beta value differs from
        // ANTHROPIC_BETA_HEADER used on the OAuth path — confirm intentional.
        Headers::from([("anthropic-beta".to_string(), "interleaved-thinking-2025-05-14".to_string())])
    };

    // Minimal options: 1 output token, no tools, no thinking
    let chat_options = ChatOptions::default()
        .with_max_tokens(1)
        .with_capture_usage(true)
        .with_extra_headers(headers);

    match self
        .client
        .exec_chat_stream(&self.config.model, request, Some(&chat_options))
        .await
    {
        Ok(resp) => {
            // Drain the stream to complete the request; only the final End
            // event's captured usage is kept — intermediate events are discarded.
            let mut stream = Box::pin(resp.stream);
            let mut turn_usage = Usage::default();
            while let Some(event) = stream.next().await {
                if let Ok(ChatStreamEvent::End(end)) = event {
                    if let Some(ref genai_usage) = end.captured_usage {
                        turn_usage = Self::extract_turn_usage(genai_usage);
                    }
                }
            }
            self.last_request_time = Some(Instant::now());
            info!(
                "Cache keep-alive complete: {} cached tokens refreshed",
                turn_usage.cache_read_tokens
            );
            Ok(turn_usage)
        }
        Err(e) => {
            warn!("Cache keep-alive failed: {:#}", e);
            // Still update timestamp to avoid hammering on repeated failures
            self.last_request_time = Some(Instant::now());
            Err(anyhow::anyhow!("Cache keep-alive failed: {:#}", e))
        }
    }
}

/// Refresh OAuth token if expired. Returns true if refresh was needed and succeeded.
#[allow(dead_code)]
pub async fn refresh_oauth_if_needed(&mut self) -> Result<bool> {
Expand Down Expand Up @@ -626,6 +726,7 @@ impl Agent {
Ok(resp) => {
info!("Chat request successful");
self.retry_attempt = 0;
self.last_request_time = Some(Instant::now());
Ok(resp)
},
Err(e) => {
Expand Down