Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/bitvm2-ga/src/operator/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -966,7 +966,7 @@ pub fn take2_timelocks(network: Network) -> (u32, u32) {
+ if network == Network::Regtest { 6 } else { 0 }, // Regtest extra delay
num_blocks_per_network(network, CONNECTOR_D_TIMELOCK)
+ if network == Network::Testnet { 6 } else { 0 } // Testnet extra delay
+ if network == Network::Testnet4 { 100 } else { 0 } // Testnet4 extra delay
+ if network == Network::Testnet4 { 60 } else { 0 } // Testnet4 extra delay
+ if network == Network::Regtest { 6 } else { 0 }, // Regtest extra delay
)
}
Expand Down
1 change: 0 additions & 1 deletion node/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -847,7 +847,6 @@ bitvm2-noded \
| `GOAT_ADDRESS` | Conditional | GOAT address (required for Operator/Challenger) | - |
| `ENABLE_RELAYER` | No | Enable relayer mode for Committee nodes | `false` |
| `BTC_CHAIN_URL` | No | Bitcoin Esplora API endpoint | Public Esplora |
| `MARA_SLIPSTREAM_API_URL` | No | MARA slipstream API base URL (used for non-standard tx broadcast) | mainnet: `https://slipstream.mara.com/api`; testnet4: `https://teststream.mara.com/api` |
| `GOAT_PROOF_BUILD_URL` | No | Proof Builder RPC endpoint | - |
| `NODE_NAME` | No | Node display name | `ZKM` |
| `OPERATOR_NODE_SERVICE_FEE` | No | Operator service fee rate | `0.001` |
Expand Down
16 changes: 0 additions & 16 deletions node/src/env.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@ pub const ENV_GOAT_SEQUENCER_SET_MULTI_SIG_VERIFIER_ADDRESS: &str =
pub const ENV_ENABLE_RELAYER: &str = "ENABLE_RELAYER";
pub const ENV_ENABLE_UPDATE_SPV_CONTRACT: &str = "ENABLE_UPDATE_SPV_CONTRACT";
pub const ENV_BTC_BLOCK_CONFIRMS: &str = "BTC_BLOCK_CONFIRMS";
pub const ENV_MARA_SLIPSTREAM_API_URL: &str = "MARA_SLIPSTREAM_API_URL";
pub const DEFAULT_MARA_SLIPSTREAM_MAINNET_API_URL: &str = "https://slipstream.mara.com/api";
pub const DEFAULT_MARA_SLIPSTREAM_TESTNET4_API_URL: &str = "https://teststream.mara.com/api";

pub const ENV_GOAT_PRIVATE_KEY: &str = "GOAT_PRIVATE_KEY";

Expand Down Expand Up @@ -352,19 +349,6 @@ pub fn get_btc_url_from_env() -> Option<String> {
std::env::var(ENV_BTC_CHAIN_URL).ok()
}

pub fn get_mara_slipstream_api_base_url(network: Network) -> String {
if let Ok(url) = std::env::var(ENV_MARA_SLIPSTREAM_API_URL)
&& !url.trim().is_empty()
{
return url;
}

match network {
Network::Bitcoin => DEFAULT_MARA_SLIPSTREAM_MAINNET_API_URL.to_string(),
_ => DEFAULT_MARA_SLIPSTREAM_TESTNET4_API_URL.to_string(),
}
}

pub fn get_sequencer_set_monitor_start_cosmos_block_from_env() -> Option<u64> {
std::env::var(ENV_SEQUENCER_SET_MONITOR_START_COSMOS_BLOCK)
.ok()
Expand Down
1 change: 1 addition & 0 deletions node/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ mod scheduled_tasks;
pub mod utils;
pub use scheduled_tasks::{
run_maintenance_tasks, run_sequencer_set_hash_monitor_task, run_watch_event_task,
run_watchdog_task,
};
mod error;

Expand Down
17 changes: 17 additions & 0 deletions node/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use bitvm2_noded::utils::{
};
use bitvm2_noded::{
rpc_service, run_maintenance_tasks, run_sequencer_set_hash_monitor_task, run_watch_event_task,
run_watchdog_task,
};

use anyhow::Result;
Expand Down Expand Up @@ -169,6 +170,7 @@ async fn main() -> Result<(), Box<dyn Error>> {
let local_db_clone2 = local_db.clone();
let local_db_clone3 = local_db.clone();
let local_db_clone4 = local_db.clone();
let local_db_clone5 = local_db.clone();
let opt_rpc_addr = opt.rpc_addr.clone();
let peer_id_string_clone = peer_id_string.clone();
let metric_registry_clone = Arc::new(Mutex::new(metric_registry));
Expand Down Expand Up @@ -275,6 +277,21 @@ async fn main() -> Result<(), Box<dyn Error>> {
}
}));

if actor == Actor::Challenger || actor == Actor::Watchtower {
let cancel_token_clone = cancellation_token.clone();
task_handles.push(tokio::spawn(async move {
let btc_client =
Arc::new(BTCClient::new(get_network(), get_btc_url_from_env().as_deref()));
match run_watchdog_task(local_db_clone5, btc_client, 60, cancel_token_clone).await {
Ok(tag) => Ok(tag),
Err(e) => {
tracing::error!("Watchdog task error: {}", e);
Err("watchdog_error".to_string())
}
}
}));
}

let swarm_actor = actor.clone();
let cancel_token_clone = cancellation_token.clone();
task_handles.push(tokio::spawn(async move {
Expand Down
2 changes: 2 additions & 0 deletions node/src/scheduled_tasks/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub mod instance_maintenance_tasks;
mod node_maintenance_tasks;
mod sequencer_set_hash_monitor_task;
mod spv_maintenance_tasks;
pub mod watchdog;

use crate::action::GOATMessageContent;
use crate::env::{get_maintenance_run_timeout_secs, is_enable_update_spv_contract, is_relayer};
Expand All @@ -21,6 +22,7 @@ use client::btc_chain::BTCClient;
use client::goat_chain::GOATClient;
pub use event_watch_task::{is_processing_gateway_history_events, run_watch_event_task};
pub use sequencer_set_hash_monitor_task::run_sequencer_set_hash_monitor_task;
pub use watchdog::run_watchdog_task;
use std::sync::Arc;
use std::time::{Duration, Instant};
use store::localdb::{LocalDB, StorageProcessor};
Expand Down
103 changes: 103 additions & 0 deletions node/src/scheduled_tasks/watchdog/alert.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
use super::monitor::{AlertType, LivenessAlert};
use reqwest::Client;
use serde_json::json;
use tracing::warn;

pub struct AlertConfig {
pub webhook_url: Option<String>,
pub telegram_bot_token: Option<String>,
pub telegram_chat_id: Option<String>,
pub pagerduty_routing_key: Option<String>,
}

impl AlertConfig {
pub fn from_env() -> Self {
Self {
webhook_url: std::env::var("WATCHDOG_WEBHOOK_URL").ok(),
telegram_bot_token: std::env::var("WATCHDOG_TELEGRAM_TOKEN").ok(),
telegram_chat_id: std::env::var("WATCHDOG_TELEGRAM_CHAT_ID").ok(),
pagerduty_routing_key: std::env::var("WATCHDOG_PAGERDUTY_KEY").ok(),
}
}

}

pub async fn dispatch_alerts(alerts: Vec<LivenessAlert>, config: &AlertConfig) {
for alert in &alerts {
let msg = format_alert_message(alert);

if let Some(url) = &config.webhook_url {
send_webhook(url, &msg).await;
}
if let (Some(token), Some(chat_id)) = (&config.telegram_bot_token, &config.telegram_chat_id)
{
send_telegram(token, chat_id, &msg).await;
}
if let Some(key) = &config.pagerduty_routing_key {
send_pagerduty(key, alert).await;
}
}
}

fn format_alert_message(a: &LivenessAlert) -> String {
let kind = match a.alert_type {
AlertType::AssertInitMissing => "AssertInitMissing",
AlertType::AssertCommitStall => "AssertCommitStall",
};
format!(
"[GOAT Watchdog] Challenger Liveness Alert\n\
Type: {kind}\n\
Instance: {}\n\
Graph: {}\n\
Kickoff TX: {}\n\
Blocks Until Timeout: {}\n\
Action Required: Check challenger node immediately!",
a.instance_id, a.graph_id, a.kickoff_txid, a.blocks_until_timeout
)
}

async fn send_webhook(url: &str, message: &str) {
let client = Client::new();
if let Err(e) = client.post(url).json(&json!({ "text": message })).send().await {
warn!("watchdog: webhook send failed: {e}");
}
}

async fn send_telegram(token: &str, chat_id: &str, message: &str) {
let url = format!("https://api.telegram.org/bot{token}/sendMessage");
let client = Client::new();
if let Err(e) =
client.post(&url).json(&json!({ "chat_id": chat_id, "text": message })).send().await
{
warn!("watchdog: telegram send failed: {e}");
}
}

async fn send_pagerduty(routing_key: &str, alert: &LivenessAlert) {
let kind = match alert.alert_type {
AlertType::AssertInitMissing => "AssertInitMissing",
AlertType::AssertCommitStall => "AssertCommitStall",
};
let client = Client::new();
if let Err(e) = client
.post("https://events.pagerduty.com/v2/enqueue")
.json(&json!({
"routing_key": routing_key,
"event_action": "trigger",
"payload": {
"summary": format!("Challenger offline - {kind}"),
"severity": "critical",
"source": "goat-bitvm2-watchdog",
"custom_details": {
"graph_id": alert.graph_id.to_string(),
"kickoff_txid": alert.kickoff_txid,
"blocks_until_timeout": alert.blocks_until_timeout,
}
}
}))
.send()
.await
{
warn!("watchdog: pagerduty send failed: {e}");
}
}
40 changes: 40 additions & 0 deletions node/src/scheduled_tasks/watchdog/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
pub mod alert;
pub mod monitor;

use alert::{dispatch_alerts, AlertConfig};
use client::btc_chain::BTCClient;
use monitor::scan_stale_challenges;
use std::sync::Arc;
use std::time::Duration;
use store::localdb::LocalDB;
use tokio_util::sync::CancellationToken;
use tracing::{info, warn};

pub async fn run_watchdog_task(
local_db: LocalDB,
btc_client: Arc<BTCClient>,
interval_secs: u64,
cancellation_token: CancellationToken,
) -> anyhow::Result<String> {
let alert_config = AlertConfig::from_env();
info!("watchdog: starting challenger liveness monitor (interval={}s)", interval_secs);

loop {
tokio::select! {
_ = tokio::time::sleep(Duration::from_secs(interval_secs)) => {
match scan_stale_challenges(&local_db, &btc_client).await {
Ok(alerts) if !alerts.is_empty() => {
warn!("watchdog: {} liveness alert(s) detected", alerts.len());
dispatch_alerts(alerts, &alert_config).await;
}
Ok(_) => info!("watchdog: all challenge graphs healthy"),
Err(e) => warn!("watchdog: scan error: {e}"),
}
}
_ = cancellation_token.cancelled() => {
info!("watchdog: received shutdown signal");
return Ok("watchdog_shutdown".to_string());
}
}
}
}
123 changes: 123 additions & 0 deletions node/src/scheduled_tasks/watchdog/monitor.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
use crate::env::get_network;
use crate::scheduled_tasks::graph_maintenance_tasks::{AssertCommitStatus, ChallengeSubStatus};
use client::btc_chain::BTCClient;
use store::localdb::{GraphQuery, LocalDB};
use store::GraphStatus;
use tracing::{info, warn};
use uuid::Uuid;

use bitvm2_lib::challenger::assert_commit_timeout_timelock;
use bitvm2_lib::operator::watchtower_challenge_timeout_timelock;

#[derive(Debug, Clone)]
pub struct LivenessAlert {
pub graph_id: Uuid,
pub instance_id: Uuid,
pub alert_type: AlertType,
pub blocks_until_timeout: i64,
pub kickoff_txid: String,
}

#[derive(Debug, Clone)]
pub enum AlertType {
/// Kickoff confirmed but Assert-Init not detected, approaching WT challenge timeout
AssertInitMissing,
/// Assert-Init detected but Assert-Commit not complete, approaching assert timeout
AssertCommitStall,
}

fn alert_margin_blocks() -> i64 {
std::env::var("WATCHDOG_ALERT_MARGIN_BLOCKS").ok().and_then(|v| v.parse().ok()).unwrap_or(6)
}

pub async fn scan_stale_challenges(
local_db: &LocalDB,
btc_client: &BTCClient,
) -> anyhow::Result<Vec<LivenessAlert>> {
let current_height = btc_client.get_height().await? as i64;
let assert_timelock = assert_commit_timeout_timelock(get_network()) as i64;
let wt_timelock = watchtower_challenge_timeout_timelock(get_network()) as i64;
let margin = alert_margin_blocks();

let mut storage = local_db.acquire().await?;
let (graphs, _) = storage
.find_graphs(GraphQuery::default().with_status(GraphStatus::Challenge.to_string()))
.await?;

info!("watchdog: scanning {} challenge graph(s) at height {}", graphs.len(), current_height);

let mut alerts = vec![];

for graph in graphs {
let sub_status: ChallengeSubStatus =
serde_json::from_str(&graph.sub_status).unwrap_or_default();

// Already resolved — skip
if sub_status.is_disproved() || sub_status.is_all_commit_success() {
continue;
}

let kickoff_txid_str =
graph.kickoff_txid.as_ref().map(|t| t.0.to_string()).unwrap_or_default();

// Check: kickoff confirmed but assert-init not yet seen
if graph.assert_init_txid.is_none() {
if let Some(ref txid) = graph.kickoff_txid {
match btc_client.get_tx_status(&txid.0).await {
Ok(status) if status.confirmed => {
if let Some(height) = status.block_height {
let blocks_left = (height as i64 + wt_timelock) - current_height;
if blocks_left < margin {
alerts.push(LivenessAlert {
graph_id: graph.graph_id,
instance_id: graph.instance_id,
alert_type: AlertType::AssertInitMissing,
blocks_until_timeout: blocks_left,
kickoff_txid: kickoff_txid_str.clone(),
});
}
}
}
Ok(_) => {} // not confirmed yet
Err(e) => {
warn!(
"watchdog: failed to get kickoff tx status for {}: {e}",
graph.graph_id
)
}
}
}
}

// Check: assert-init seen but assert-commit stalled
if sub_status.assert_commit_status == AssertCommitStatus::OperatorInit {
if let Some(ref txid) = graph.assert_init_txid {
match btc_client.get_tx_status(&txid.0).await {
Ok(status) if status.confirmed => {
if let Some(height) = status.block_height {
let blocks_left = (height as i64 + assert_timelock) - current_height;
if blocks_left < margin {
alerts.push(LivenessAlert {
graph_id: graph.graph_id,
instance_id: graph.instance_id,
alert_type: AlertType::AssertCommitStall,
blocks_until_timeout: blocks_left,
kickoff_txid: kickoff_txid_str,
});
}
}
}
Ok(_) => {}
Err(e) => {
warn!(
"watchdog: failed to get assert_init tx status for {}: {e}",
graph.graph_id
)
}
}
}
}
}

Ok(alerts)
}
Loading