From f88d44003f017a8ac147b9ecd391418b71b151ba Mon Sep 17 00:00:00 2001 From: fullzer4 Date: Sun, 22 Feb 2026 16:26:30 -0300 Subject: [PATCH 1/6] refactor: replace namespace isolation with Landlock v5 Remove user, PID, network, mount, UTS, and IPC namespaces. Landlock v5 is now the primary isolation mechanism for filesystem, network, signal, and IPC control. Plain fork() replaces clone() with CLONE_NEW* flags. Minimum kernel raised to 6.12. - Add rlimits module (RLIMIT_DATA instead of RLIMIT_AS) - Add seccomp user notify support (SECCOMP_RET_USER_NOTIF) - Update security tests for new isolation model --- .cargo/config.toml | 13 - Cargo.lock | 22 +- Cargo.toml | 6 +- crates/evalbox-sandbox/Cargo.toml | 2 +- crates/evalbox-sandbox/src/executor.rs | 403 +++++++++--------- .../evalbox-sandbox/src/isolation/lockdown.rs | 185 +++++--- crates/evalbox-sandbox/src/isolation/mod.rs | 14 +- .../src/isolation/namespace.rs | 83 ---- .../evalbox-sandbox/src/isolation/rlimits.rs | 71 +++ .../evalbox-sandbox/src/isolation/rootfs.rs | 267 ------------ crates/evalbox-sandbox/src/lib.rs | 20 +- crates/evalbox-sandbox/src/notify/mod.rs | 17 + .../evalbox-sandbox/src/notify/scm_rights.rs | 150 +++++++ .../evalbox-sandbox/src/notify/supervisor.rs | 270 ++++++++++++ .../evalbox-sandbox/src/notify/virtual_fs.rs | 139 ++++++ crates/evalbox-sandbox/src/plan.rs | 35 +- crates/evalbox-sandbox/src/resolve.rs | 3 +- crates/evalbox-sandbox/src/workspace.rs | 72 +--- crates/evalbox-sandbox/tests/common/mod.rs | 27 -- crates/evalbox-sandbox/tests/security/cve.rs | 72 +--- .../tests/security/filesystem.rs | 120 ++---- .../evalbox-sandbox/tests/security/network.rs | 34 +- .../tests/security/resources.rs | 33 +- .../evalbox-sandbox/tests/security/seccomp.rs | 65 +-- crates/evalbox-sys/src/check.rs | 64 +-- crates/evalbox-sys/src/landlock.rs | 21 + crates/evalbox-sys/src/lib.rs | 10 +- crates/evalbox-sys/src/seccomp.rs | 106 ++++- crates/evalbox-sys/src/seccomp_notify.rs | 247 +++++++++++ 29 files 
changed, 1522 insertions(+), 1049 deletions(-) delete mode 100644 .cargo/config.toml delete mode 100644 crates/evalbox-sandbox/src/isolation/namespace.rs create mode 100644 crates/evalbox-sandbox/src/isolation/rlimits.rs delete mode 100644 crates/evalbox-sandbox/src/isolation/rootfs.rs create mode 100644 crates/evalbox-sandbox/src/notify/mod.rs create mode 100644 crates/evalbox-sandbox/src/notify/scm_rights.rs create mode 100644 crates/evalbox-sandbox/src/notify/supervisor.rs create mode 100644 crates/evalbox-sandbox/src/notify/virtual_fs.rs create mode 100644 crates/evalbox-sys/src/seccomp_notify.rs diff --git a/.cargo/config.toml b/.cargo/config.toml deleted file mode 100644 index d47f4ee..0000000 --- a/.cargo/config.toml +++ /dev/null @@ -1,13 +0,0 @@ -[target.x86_64-unknown-linux-gnu] -linker = "clang" -rustflags = ["-C", "link-arg=-fuse-ld=mold"] - -[target.aarch64-unknown-linux-gnu] -linker = "clang" -rustflags = ["-C", "link-arg=-fuse-ld=mold"] - -[build] -rustflags = ["-C", "target-cpu=native"] - -[term] -color = "always" diff --git a/Cargo.lock b/Cargo.lock index 9e06bdb..bb2e97b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -59,12 +59,6 @@ dependencies = [ "parking_lot_core", ] -[[package]] -name = "either" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" - [[package]] name = "env_home" version = "0.1.0" @@ -101,7 +95,7 @@ dependencies = [ "tempfile", "thiserror", "walkdir", - "which 7.0.3", + "which", ] [[package]] @@ -115,7 +109,7 @@ dependencies = [ "rustix", "tempfile", "thiserror", - "which 8.0.0", + "which", ] [[package]] @@ -610,18 +604,6 @@ dependencies = [ "semver", ] -[[package]] -name = "which" -version = "7.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d643ce3fd3e5b54854602a080f34fb10ab75e0b813ee32d00ca2b44fa74762" -dependencies = [ - "either", - "env_home", - "rustix", - "winsafe", -] - 
[[package]] name = "which" version = "8.0.0" diff --git a/Cargo.toml b/Cargo.toml index bc6800c..96b7910 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,9 @@ members = [ "crates/evalbox-sandbox", ] +[workspace.metadata.crane] +name = "evalbox" + [workspace.package] version = "0.1.0" edition = "2024" @@ -30,7 +33,7 @@ rustix = { version = "1", features = ["event", "process", "system", "mount", "fs thiserror = "2" tempfile = "3" mio = { version = "1.0", features = ["os-poll", "os-ext"] } -which = "7" +which = "8" [workspace.lints.rust] unsafe_op_in_unsafe_fn = "warn" @@ -39,7 +42,6 @@ unused_must_use = "warn" [workspace.lints.clippy] all = { level = "warn", priority = -1 } -# Useful pedantic lints (not all) cast_possible_truncation = "warn" cast_sign_loss = "warn" cloned_instead_of_copied = "warn" diff --git a/crates/evalbox-sandbox/Cargo.toml b/crates/evalbox-sandbox/Cargo.toml index 482e35e..7b3224e 100644 --- a/crates/evalbox-sandbox/Cargo.toml +++ b/crates/evalbox-sandbox/Cargo.toml @@ -14,7 +14,7 @@ rustix.workspace = true tempfile.workspace = true mio.workspace = true thiserror.workspace = true -which = "8" +which.workspace = true [build-dependencies] cc = "1.2" diff --git a/crates/evalbox-sandbox/src/executor.rs b/crates/evalbox-sandbox/src/executor.rs index 7d659de..fc96168 100644 --- a/crates/evalbox-sandbox/src/executor.rs +++ b/crates/evalbox-sandbox/src/executor.rs @@ -39,7 +39,7 @@ use std::collections::HashMap; use std::ffi::CString; use std::io::{self, Write as _}; use std::os::fd::{AsRawFd, OwnedFd, RawFd}; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::time::{Duration, Instant}; use mio::unix::SourceFd; @@ -48,14 +48,16 @@ use rustix::io::Errno; use rustix::process::{Pid, PidfdFlags, Signal, pidfd_open, pidfd_send_signal}; use thiserror::Error; -use evalbox_sys::{check, last_errno}; - -use crate::isolation::{ - LockdownError, bind_mount, lockdown, make_rprivate, mount_minimal_dev, mount_proc, - pivot_root_and_cleanup, 
set_hostname, setup_id_maps, +use evalbox_sys::seccomp::{ + DEFAULT_WHITELIST, NOTIFY_FS_SYSCALLS, SockFprog, build_notify_filter, build_whitelist_filter, }; +use evalbox_sys::seccomp_notify::seccomp_set_mode_filter_listener; +use evalbox_sys::{check, last_errno, seccomp::seccomp_set_mode_filter}; + +use crate::isolation::{LockdownError, close_extra_fds, lockdown}; use crate::monitor::{Output, Status, monitor, set_nonblocking, wait_for_exit, write_stdin}; -use crate::plan::{Mount, Plan}; +use crate::notify::scm_rights; +use crate::plan::{Mount, NotifyMode, Plan}; use crate::resolve::{ResolvedBinary, resolve_binary}; use crate::validate::validate_cmd; use crate::workspace::Workspace; @@ -75,15 +77,6 @@ pub enum ExecutorError { #[error("fork: {0}")] Fork(Errno), - #[error("unshare: {0}")] - Unshare(Errno), - - #[error("id map: {0}")] - IdMap(io::Error), - - #[error("rootfs: {0}")] - Rootfs(Errno), - #[error("lockdown: {0}")] Lockdown(#[from] LockdownError), @@ -102,6 +95,9 @@ pub enum ExecutorError { #[error("command not found: {0}")] CommandNotFound(String), + #[error("seccomp notify: {0}")] + SeccompNotify(String), + #[error("io: {0}")] Io(#[from] io::Error), } @@ -155,12 +151,17 @@ impl ExecutionInfo { } /// A spawned sandbox that hasn't been waited on yet. +/// +/// Some fields are never read but kept alive for RAII (fd lifetime, temp dir cleanup). +#[allow(dead_code)] struct SpawnedSandbox { pidfd: OwnedFd, stdin_fd: RawFd, stdout_fd: RawFd, stderr_fd: RawFd, - #[allow(dead_code)] + /// Seccomp listener fd kept alive for RAII; future supervisor integration. + notify_fd: Option, + /// Workspace kept alive so temp directory isn't deleted while sandbox runs. 
workspace: std::mem::ManuallyDrop, } @@ -235,15 +236,22 @@ impl Executor { let workspace = Workspace::with_prefix("evalbox-").map_err(ExecutorError::Workspace)?; + workspace + .setup_sandbox_dirs() + .map_err(ExecutorError::Workspace)?; for file in &plan.user_files { + let work_path = format!("work/{}", file.path); workspace - .write_file(&file.path, &file.content, file.executable) + .write_file(&work_path, &file.content, file.executable) .map_err(ExecutorError::Workspace)?; } - workspace - .setup_sandbox_dirs() - .map_err(ExecutorError::Workspace)?; - create_mount_dirs(&workspace, &exec_info, &plan)?; + + // Create socketpair for notify fd transfer (if needed) + let notify_sockets = if plan.notify_mode != NotifyMode::Disabled { + Some(scm_rights::create_socketpair().map_err(ExecutorError::Workspace)?) + } else { + None + }; let child_pid = unsafe { libc::fork() }; if child_pid < 0 { @@ -251,7 +259,9 @@ impl Executor { } if child_pid == 0 { - match child_process(&workspace, &plan, &exec_info) { + // In child: close parent's socket end + let child_socket = notify_sockets.map(|(_, child)| child); + match child_process(&workspace, &plan, &exec_info, child_socket.as_ref()) { Ok(()) => unsafe { libc::_exit(127) }, Err(e) => { writeln!(io::stderr(), "sandbox error: {e}").ok(); @@ -263,7 +273,22 @@ impl Executor { let pid = unsafe { Pid::from_raw_unchecked(child_pid) }; let pidfd = pidfd_open(pid, PidfdFlags::empty()).map_err(ExecutorError::Pidfd)?; - blocking_parent(child_pid, pidfd, workspace, plan) + // Parent: receive notify fd if applicable + let notify_fd = if let Some((parent_socket, _)) = notify_sockets { + poll_or_kill( + parent_socket.as_raw_fd(), + child_pid, + "timeout waiting for notify fd", + )?; + Some( + scm_rights::recv_fd(parent_socket.as_raw_fd()) + .map_err(|e| ExecutorError::SeccompNotify(e.to_string()))?, + ) + } else { + None + }; + + blocking_parent(child_pid, pidfd, notify_fd, workspace, plan) } /// Spawn a new sandbox. 
Returns immediately with a [`SandboxId`]. @@ -534,6 +559,50 @@ impl Executor { } } +/// Close the parent-side pipe ends that the child uses (stdin read, stdout write, stderr write). +fn close_parent_pipe_ends(workspace: &Workspace) { + unsafe { + libc::close(workspace.pipes.stdin.read.as_raw_fd()); + libc::close(workspace.pipes.stdout.write.as_raw_fd()); + libc::close(workspace.pipes.stderr.write.as_raw_fd()); + } +} + +/// Poll an fd with a 30-second timeout; kill the child on timeout or error. +fn poll_or_kill(fd: RawFd, child_pid: libc::pid_t, msg: &str) -> Result<(), ExecutorError> { + let mut pfd = libc::pollfd { + fd, + events: libc::POLLIN, + revents: 0, + }; + if unsafe { libc::poll(&mut pfd, 1, 30000) } <= 0 { + unsafe { libc::kill(child_pid, libc::SIGKILL) }; + return Err(ExecutorError::ChildSetup(msg.into())); + } + Ok(()) +} + +/// Wait for the child to signal readiness via eventfd, then signal back. +fn sync_with_child(workspace: &Workspace, child_pid: libc::pid_t) -> Result<(), ExecutorError> { + let child_ready_fd = workspace.pipes.sync.child_ready_fd(); + poll_or_kill(child_ready_fd, child_pid, "timeout waiting for child")?; + + let mut value: u64 = 0; + if unsafe { libc::read(child_ready_fd, (&mut value as *mut u64).cast(), 8) } != 8 { + unsafe { libc::kill(child_pid, libc::SIGKILL) }; + return Err(ExecutorError::ChildSetup("eventfd read failed".into())); + } + + let parent_done_fd = workspace.pipes.sync.parent_done_fd(); + let signal_value: u64 = 1; + if unsafe { libc::write(parent_done_fd, (&signal_value as *const u64).cast(), 8) } != 8 { + unsafe { libc::kill(child_pid, libc::SIGKILL) }; + return Err(ExecutorError::ChildSetup("eventfd write failed".into())); + } + + Ok(()) +} + fn spawn_sandbox(plan: Plan) -> Result { let cmd_refs: Vec<&str> = plan.cmd.iter().map(|s| s.as_str()).collect(); validate_cmd(&cmd_refs).map_err(ExecutorError::Validation)?; @@ -552,15 +621,22 @@ fn spawn_sandbox(plan: Plan) -> Result { let workspace = 
Workspace::with_prefix("evalbox-").map_err(ExecutorError::Workspace)?; + workspace + .setup_sandbox_dirs() + .map_err(ExecutorError::Workspace)?; for file in &plan.user_files { + let work_path = format!("work/{}", file.path); workspace - .write_file(&file.path, &file.content, file.executable) + .write_file(&work_path, &file.content, file.executable) .map_err(ExecutorError::Workspace)?; } - workspace - .setup_sandbox_dirs() - .map_err(ExecutorError::Workspace)?; - create_mount_dirs(&workspace, &exec_info, &plan)?; + + // Create socketpair for notify fd transfer (if needed) + let notify_sockets = if plan.notify_mode != NotifyMode::Disabled { + Some(scm_rights::create_socketpair().map_err(ExecutorError::Workspace)?) + } else { + None + }; let child_pid = unsafe { libc::fork() }; if child_pid < 0 { @@ -568,7 +644,8 @@ fn spawn_sandbox(plan: Plan) -> Result { } if child_pid == 0 { - match child_process(&workspace, &plan, &exec_info) { + let child_socket = notify_sockets.map(|(_, child)| child); + match child_process(&workspace, &plan, &exec_info, child_socket.as_ref()) { Ok(()) => unsafe { libc::_exit(127) }, Err(e) => { writeln!(io::stderr(), "sandbox error: {e}").ok(); @@ -580,47 +657,28 @@ fn spawn_sandbox(plan: Plan) -> Result { let pid = unsafe { Pid::from_raw_unchecked(child_pid) }; let pidfd = pidfd_open(pid, PidfdFlags::empty()).map_err(ExecutorError::Pidfd)?; - // Parent: close unused pipe ends let stdin_write_fd = workspace.pipes.stdin.write.as_raw_fd(); let stdout_read_fd = workspace.pipes.stdout.read.as_raw_fd(); let stderr_read_fd = workspace.pipes.stderr.read.as_raw_fd(); - unsafe { - libc::close(workspace.pipes.stdin.read.as_raw_fd()); - libc::close(workspace.pipes.stdout.write.as_raw_fd()); - libc::close(workspace.pipes.stderr.write.as_raw_fd()); - } + close_parent_pipe_ends(&workspace); - // Wait for child to signal readiness - let child_ready_fd = workspace.pipes.sync.child_ready_fd(); - let mut pfd = libc::pollfd { - fd: child_ready_fd, - events: 
libc::POLLIN, - revents: 0, + // Receive notify fd from child if applicable + let notify_fd = if let Some((parent_socket, _)) = notify_sockets { + poll_or_kill( + parent_socket.as_raw_fd(), + child_pid, + "timeout waiting for notify fd", + )?; + Some( + scm_rights::recv_fd(parent_socket.as_raw_fd()) + .map_err(|e| ExecutorError::SeccompNotify(e.to_string()))?, + ) + } else { + None }; - if unsafe { libc::poll(&mut pfd, 1, 30000) } <= 0 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup( - "timeout waiting for child".into(), - )); - } - - let mut value: u64 = 0; - if unsafe { libc::read(child_ready_fd, (&mut value as *mut u64).cast(), 8) } != 8 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup("eventfd read failed".into())); - } - - setup_id_maps(child_pid).map_err(ExecutorError::IdMap)?; - - // Signal child to continue - let parent_done_fd = workspace.pipes.sync.parent_done_fd(); - let signal_value: u64 = 1; - if unsafe { libc::write(parent_done_fd, (&signal_value as *const u64).cast(), 8) } != 8 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup("eventfd write failed".into())); - } + sync_with_child(&workspace, child_pid)?; // Write stdin if provided if let Some(ref stdin_data) = plan.stdin { @@ -647,6 +705,7 @@ fn spawn_sandbox(plan: Plan) -> Result { }, stdout_fd: stdout_read_fd, stderr_fd: stderr_read_fd, + notify_fd, workspace: std::mem::ManuallyDrop::new(workspace), }) } @@ -654,45 +713,15 @@ fn spawn_sandbox(plan: Plan) -> Result { fn blocking_parent( child_pid: libc::pid_t, pidfd: OwnedFd, + _notify_fd: Option, workspace: Workspace, plan: Plan, ) -> Result { let workspace = std::mem::ManuallyDrop::new(workspace); - unsafe { - libc::close(workspace.pipes.stdin.read.as_raw_fd()); - libc::close(workspace.pipes.stdout.write.as_raw_fd()); - libc::close(workspace.pipes.stderr.write.as_raw_fd()); - } - - let child_ready_fd = 
workspace.pipes.sync.child_ready_fd(); - let mut pfd = libc::pollfd { - fd: child_ready_fd, - events: libc::POLLIN, - revents: 0, - }; - - if unsafe { libc::poll(&mut pfd, 1, 30000) } <= 0 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup( - "timeout waiting for child".into(), - )); - } - - let mut value: u64 = 0; - if unsafe { libc::read(child_ready_fd, (&mut value as *mut u64).cast(), 8) } != 8 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup("eventfd read failed".into())); - } - - setup_id_maps(child_pid).map_err(ExecutorError::IdMap)?; + close_parent_pipe_ends(&workspace); - let parent_done_fd = workspace.pipes.sync.parent_done_fd(); - let signal_value: u64 = 1; - if unsafe { libc::write(parent_done_fd, (&signal_value as *const u64).cast(), 8) } != 8 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup("eventfd write failed".into())); - } + sync_with_child(&workspace, child_pid)?; if let Some(ref stdin_data) = plan.stdin { write_stdin(&workspace, stdin_data).map_err(ExecutorError::Monitor)?; @@ -711,100 +740,93 @@ fn blocking_parent( result } +/// Child process flow (runs after fork in the child). +/// +/// 1. Close parent pipe ends +/// 2. Setup stdio (dup2 stdin/stdout/stderr) +/// 3. chdir(workspace/work) +/// 4. Landlock v5 + rlimits + securebits + drop caps (lockdown) +/// 5. If `notify_mode` != Disabled: install notify filter, send listener fd +/// 6. Install kill seccomp filter (whitelist) +/// 7. Signal parent readiness +/// 8. Wait for parent signal +/// 9. `close_range(3, MAX, 0)` +/// 10. execve fn child_process( workspace: &Workspace, plan: &Plan, exec_info: &ExecutionInfo, + notify_socket: Option<&OwnedFd>, ) -> Result<(), ExecutorError> { + // 1. 
Close parent pipe ends unsafe { libc::close(workspace.pipes.stdin.write.as_raw_fd()); libc::close(workspace.pipes.stdout.read.as_raw_fd()); libc::close(workspace.pipes.stderr.read.as_raw_fd()); } - if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 { - return Err(ExecutorError::Unshare(last_errno())); + // 2. Setup stdio + setup_stdio(workspace)?; + + // 3. chdir to workspace/work + let work_dir = workspace.root().join("work"); + let work_cstr = + CString::new(work_dir.to_string_lossy().as_bytes()).map_err(|_| ExecutorError::Exec(Errno::INVAL))?; + if unsafe { libc::chdir(work_cstr.as_ptr()) } != 0 { + return Err(ExecutorError::Exec(last_errno())); + } + + // 4. Apply lockdown (Landlock v5 + rlimits + securebits + drop caps) + let extra_paths: Vec<&str> = exec_info + .extra_mounts + .iter() + .filter_map(|m| m.source.to_str()) + .collect(); + lockdown(plan, workspace.root(), &extra_paths).map_err(ExecutorError::Lockdown)?; + + // 5. If notify mode != Disabled: install notify seccomp filter, send listener fd + if plan.notify_mode != NotifyMode::Disabled { + let notify_filter = build_notify_filter(NOTIFY_FS_SYSCALLS); + let fprog = SockFprog { + len: notify_filter.len() as u16, + filter: notify_filter.as_ptr(), + }; + let listener_fd = unsafe { seccomp_set_mode_filter_listener(&fprog) }.map_err(|e| { + ExecutorError::SeccompNotify(format!("failed to install notify filter: {e}")) + })?; + + // Send listener fd to parent via SCM_RIGHTS + if let Some(sock) = notify_socket { + scm_rights::send_fd(sock.as_raw_fd(), listener_fd.as_raw_fd()).map_err(|e| { + ExecutorError::SeccompNotify(format!("failed to send listener fd: {e}")) + })?; + } } + // 6. Install kill seccomp filter (whitelist) + apply_seccomp(plan)?; + + // 7. 
Signal parent readiness let child_ready_fd = workspace.pipes.sync.child_ready_fd(); let signal_value: u64 = 1; if unsafe { libc::write(child_ready_fd, (&signal_value as *const u64).cast(), 8) } != 8 { return Err(ExecutorError::ChildSetup("eventfd write failed".into())); } + // 8. Wait for parent signal let parent_done_fd = workspace.pipes.sync.parent_done_fd(); let mut value: u64 = 0; if unsafe { libc::read(parent_done_fd, (&mut value as *mut u64).cast(), 8) } != 8 { return Err(ExecutorError::ChildSetup("eventfd read failed".into())); } - if unsafe { libc::unshare(libc::CLONE_NEWNS | libc::CLONE_NEWUTS | libc::CLONE_NEWIPC) } != 0 { - return Err(ExecutorError::Unshare(last_errno())); - } - - setup_rootfs(workspace, plan, exec_info)?; - setup_stdio(workspace)?; - - let extra_paths: Vec<&str> = exec_info - .extra_mounts - .iter() - .filter_map(|m| m.target.to_str()) - .collect(); - lockdown(plan, None, &extra_paths).map_err(ExecutorError::Lockdown)?; - - let cwd = CString::new(plan.cwd.as_bytes()).map_err(|_| ExecutorError::Exec(Errno::INVAL))?; - if unsafe { libc::chdir(cwd.as_ptr()) } != 0 { - return Err(ExecutorError::Exec(last_errno())); - } + // 9. Close all fds except 0,1,2 + close_extra_fds(); + // 10. 
execve exec_command(plan, exec_info) } -fn setup_rootfs( - workspace: &Workspace, - plan: &Plan, - exec_info: &ExecutionInfo, -) -> Result<(), ExecutorError> { - let sandbox_root = workspace.root(); - - make_rprivate().map_err(ExecutorError::Rootfs)?; - - for mount in &exec_info.extra_mounts { - let target = sandbox_root.join(mount.target.strip_prefix("/").unwrap_or(&mount.target)); - if mount.source.exists() { - bind_mount(&mount.source, &target, !mount.writable).map_err(ExecutorError::Rootfs)?; - } - } - - for mount in &plan.mounts { - let target = sandbox_root.join(mount.target.strip_prefix("/").unwrap_or(&mount.target)); - if let Some(parent) = target.parent() { - std::fs::create_dir_all(parent).map_err(ExecutorError::Workspace)?; - } - std::fs::create_dir_all(&target).map_err(ExecutorError::Workspace)?; - if mount.source.exists() { - bind_mount(&mount.source, &target, !mount.writable).map_err(ExecutorError::Rootfs)?; - } - } - - mount_proc(&sandbox_root.join("proc")).map_err(ExecutorError::Rootfs)?; - mount_minimal_dev(&sandbox_root.join("dev")).map_err(ExecutorError::Rootfs)?; - - for file in &plan.user_files { - let target_path = if file.path.starts_with('/') { - file.path[1..].to_string() - } else { - format!("work/{}", file.path) - }; - workspace - .write_file(&target_path, &file.content, file.executable) - .map_err(ExecutorError::Workspace)?; - } - - set_hostname("sandbox").map_err(ExecutorError::Rootfs)?; - pivot_root_and_cleanup(sandbox_root).map_err(ExecutorError::Rootfs) -} - fn setup_stdio(workspace: &Workspace) -> Result<(), ExecutorError> { let stdin_fd = workspace.pipes.stdin.read.as_raw_fd(); let stdout_fd = workspace.pipes.stdout.write.as_raw_fd(); @@ -827,6 +849,33 @@ fn setup_stdio(workspace: &Workspace) -> Result<(), ExecutorError> { Ok(()) } +fn apply_seccomp(plan: &Plan) -> Result<(), ExecutorError> { + let whitelist: Vec = if let Some(ref syscalls) = plan.syscalls { + let mut wl: Vec = DEFAULT_WHITELIST + .iter() + .copied() + .filter(|s| 
!syscalls.denied.contains(s)) + .collect(); + for s in &syscalls.allowed { + if !wl.contains(s) { + wl.push(*s); + } + } + wl + } else { + DEFAULT_WHITELIST.to_vec() + }; + + let filter = build_whitelist_filter(&whitelist); + let fprog = SockFprog { + len: filter.len() as u16, + filter: filter.as_ptr(), + }; + unsafe { seccomp_set_mode_filter(&fprog) } + .map_err(|e| ExecutorError::Lockdown(LockdownError::Seccomp(e)))?; + Ok(()) +} + fn exec_command(plan: &Plan, exec_info: &ExecutionInfo) -> Result<(), ExecutorError> { let cmd_path = CString::new(exec_info.binary_path.to_string_lossy().as_bytes()) .map_err(|_| ExecutorError::Exec(Errno::INVAL))?; @@ -861,36 +910,6 @@ fn exec_command(plan: &Plan, exec_info: &ExecutionInfo) -> Result<(), ExecutorEr Err(ExecutorError::Exec(last_errno())) } -fn create_mount_dirs( - workspace: &Workspace, - exec_info: &ExecutionInfo, - plan: &Plan, -) -> Result<(), ExecutorError> { - for mount in &exec_info.extra_mounts { - create_mount_dir(workspace, &mount.target)?; - } - for mount in &plan.mounts { - create_mount_dir(workspace, &mount.target)?; - } - Ok(()) -} - -fn create_mount_dir(workspace: &Workspace, target: &Path) -> Result<(), ExecutorError> { - if let Some(parent) = target.parent() { - if parent != Path::new("/") { - let target_dir = workspace - .root() - .join(parent.strip_prefix("/").unwrap_or(parent)); - std::fs::create_dir_all(&target_dir).map_err(ExecutorError::Workspace)?; - } - } - let mount_point = workspace - .root() - .join(target.strip_prefix("/").unwrap_or(target)); - std::fs::create_dir_all(&mount_point).map_err(ExecutorError::Workspace)?; - Ok(()) -} - #[cfg(test)] mod tests { use super::*; diff --git a/crates/evalbox-sandbox/src/isolation/lockdown.rs b/crates/evalbox-sandbox/src/isolation/lockdown.rs index 227725a..d9c96ff 100644 --- a/crates/evalbox-sandbox/src/isolation/lockdown.rs +++ b/crates/evalbox-sandbox/src/isolation/lockdown.rs @@ -1,41 +1,43 @@ //! Security lockdown for sandboxed processes. //! -//! 
Applies all security restrictions to the child process after `pivot_root`. +//! Applies all security restrictions to the child process. //! The order of operations is critical for security: //! -//! 1. **Landlock** - Filesystem and network access control (ABI 4+) -//! 2. **Seccomp** - Syscall whitelist filter (BPF) -//! 3. **Rlimits** - Resource limits (memory, CPU, files, processes) -//! 4. **Capabilities** - Drop all capabilities, set `NO_NEW_PRIVS` -//! 5. **Close FDs** - Close all file descriptors except stdin/stdout/stderr +//! 0. **`NO_NEW_PRIVS`** - Required before Landlock and seccomp +//! 1. **Landlock v5** - Filesystem, network, signal, and IPC access control +//! 2. **Rlimits** - Resource limits (memory, CPU, files, processes) +//! 3. **Securebits** - Lock capability state permanently +//! 4. **Capabilities** - Drop all capabilities +//! +//! Note: Seccomp filters and fd closing are handled separately in `child_process()` +//! because the notify filter must return a listener fd that gets sent to the parent. //! //! After lockdown, the process cannot: //! - Access files outside allowed paths -//! - Make network connections (if landlock ABI >= 4) -//! - Call restricted syscalls (ptrace, mount, reboot, etc.) +//! - Make network connections (if network blocked, requires Landlock ABI 4+) +//! - Send signals to processes outside the sandbox (Landlock ABI 5+) +//! - Connect to abstract unix sockets outside the sandbox (Landlock ABI 5+) //! - Exceed resource limits //! 
- Gain new privileges use std::ffi::CString; -use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd}; +use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; use std::os::unix::ffi::OsStrExt; use std::path::Path; use evalbox_sys::landlock::{ - self, LANDLOCK_ACCESS_FS_EXECUTE, LANDLOCK_ACCESS_FS_MAKE_DIR, LANDLOCK_ACCESS_FS_MAKE_REG, - LANDLOCK_ACCESS_FS_READ_DIR, LANDLOCK_ACCESS_FS_READ_FILE, LANDLOCK_ACCESS_FS_REMOVE_DIR, - LANDLOCK_ACCESS_FS_REMOVE_FILE, LANDLOCK_ACCESS_FS_TRUNCATE, LANDLOCK_ACCESS_FS_WRITE_FILE, - LandlockPathBeneathAttr, LandlockRulesetAttr, fs_access_for_abi, landlock_add_rule_path, - landlock_create_ruleset, landlock_restrict_self, net_access_for_abi, + self, LANDLOCK_ACCESS_FS_EXECUTE, LANDLOCK_ACCESS_FS_MAKE_DIR, LANDLOCK_ACCESS_FS_MAKE_FIFO, + LANDLOCK_ACCESS_FS_MAKE_REG, LANDLOCK_ACCESS_FS_MAKE_SYM, LANDLOCK_ACCESS_FS_READ_DIR, + LANDLOCK_ACCESS_FS_READ_FILE, LANDLOCK_ACCESS_FS_REMOVE_DIR, LANDLOCK_ACCESS_FS_REMOVE_FILE, + LANDLOCK_ACCESS_FS_TRUNCATE, LANDLOCK_ACCESS_FS_WRITE_FILE, LandlockPathBeneathAttr, + LandlockRulesetAttr, fs_access_for_abi, landlock_add_rule_path, landlock_create_ruleset, + landlock_restrict_self, net_access_for_abi, scope_for_abi, }; use evalbox_sys::last_errno; -use evalbox_sys::seccomp::{ - DEFAULT_WHITELIST, SockFprog, build_whitelist_filter, seccomp_set_mode_filter, -}; use rustix::io::Errno; use thiserror::Error; -use super::rootfs::apply_rlimits; +use super::rlimits::apply_rlimits; use crate::plan::Plan; /// Error during security lockdown. @@ -53,26 +55,34 @@ pub enum LockdownError { #[error("capability: {0}")] Capability(Errno), - #[error("close fds: {0}")] - CloseFds(Errno), + #[error("securebits: {0}")] + Securebits(Errno), } +/// Apply security lockdown to the current process. +/// +/// `workspace_root` is the real absolute path to the workspace directory +/// (no `pivot_root`, so we use real paths). 
+/// +/// `extra_readonly_paths` are additional paths that should be readable +/// (e.g., resolved binary mount paths). pub fn lockdown( plan: &Plan, - workspace_path: Option<&Path>, + workspace_root: &Path, extra_readonly_paths: &[&str], ) -> Result<(), LockdownError> { - apply_landlock(plan, workspace_path, extra_readonly_paths)?; - apply_seccomp()?; + // NO_NEW_PRIVS must be set before landlock_restrict_self and seccomp. + set_no_new_privs()?; + apply_landlock_v5(plan, workspace_root, extra_readonly_paths)?; apply_rlimits(plan).map_err(LockdownError::Rlimit)?; + apply_securebits()?; drop_all_caps()?; - close_extra_fds()?; Ok(()) } -fn apply_landlock( +fn apply_landlock_v5( plan: &Plan, - workspace_path: Option<&Path>, + workspace_root: &Path, extra_readonly_paths: &[&str], ) -> Result<(), LockdownError> { let abi = match landlock::landlock_abi_version() { @@ -80,16 +90,22 @@ fn apply_landlock( Err(_) => return Ok(()), // Landlock not available }; + if abi < 5 { + eprintln!("warning: landlock ABI {abi} < 5, signal/IPC scoping unavailable"); + } + let fs_access = fs_access_for_abi(abi); let net_access = if plan.network_blocked && abi >= 4 { net_access_for_abi(abi) } else { 0 }; + let scoped = scope_for_abi(abi); let attr = LandlockRulesetAttr { handled_access_fs: fs_access, handled_access_net: net_access, + scoped, }; let ruleset_fd = landlock_create_ruleset(&attr).map_err(LockdownError::Landlock)?; @@ -99,11 +115,13 @@ fn apply_landlock( | LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_MAKE_REG | LANDLOCK_ACCESS_FS_MAKE_DIR + | LANDLOCK_ACCESS_FS_MAKE_SYM + | LANDLOCK_ACCESS_FS_MAKE_FIFO | LANDLOCK_ACCESS_FS_REMOVE_FILE | LANDLOCK_ACCESS_FS_REMOVE_DIR | LANDLOCK_ACCESS_FS_TRUNCATE; - // Read-only paths from plan.mounts (pre-computed by evalbox, includes system paths) + // Read-only mounts from plan (system paths computed by evalbox or user-specified) for mount in &plan.mounts { if !mount.writable { let access = if mount.executable { @@ -111,32 +129,41 @@ fn 
apply_landlock( } else { read_access & !LANDLOCK_ACCESS_FS_EXECUTE }; - add_path_rule(&ruleset_fd, &mount.target, access); + add_path_rule(&ruleset_fd, &mount.source, access); } } + // Extra readonly paths (resolved binary mounts) for path in extra_readonly_paths { add_path_rule(&ruleset_fd, path, read_access); } - // Pre-pivot_root workspace path - if let Some(ws_path) = workspace_path { - add_path_rule(&ruleset_fd, ws_path, write_access); + // Writable workspace paths (real absolute paths, no pivot_root) + add_path_rule(&ruleset_fd, workspace_root.join("work"), write_access); + add_path_rule(&ruleset_fd, workspace_root.join("tmp"), write_access); + add_path_rule(&ruleset_fd, workspace_root.join("home"), write_access); + + // System paths (read-only with execute) + for path in ["/usr", "/bin", "/lib", "/lib64", "/etc"] { + add_path_rule(&ruleset_fd, path, read_access); } - // Writable paths - for path in ["/work", "/tmp", "/home"] { - add_path_rule(&ruleset_fd, path, write_access); + // NixOS store + if Path::new("/nix/store").exists() { + add_path_rule(&ruleset_fd, "/nix/store", read_access); + } + if Path::new("/run/current-system").exists() { + add_path_rule(&ruleset_fd, "/run/current-system", read_access); } // Proc (read-only) - add_path_rule(&ruleset_fd, "/proc", read_access); + add_path_rule(&ruleset_fd, "/proc", read_access & !LANDLOCK_ACCESS_FS_EXECUTE); // Dev (read + write for /dev/null etc.) add_path_rule( &ruleset_fd, "/dev", - read_access | LANDLOCK_ACCESS_FS_WRITE_FILE, + (read_access & !LANDLOCK_ACCESS_FS_EXECUTE) | LANDLOCK_ACCESS_FS_WRITE_FILE, ); landlock_restrict_self(&ruleset_fd).map_err(LockdownError::Landlock) @@ -144,9 +171,9 @@ fn apply_landlock( /// Add a path rule to the Landlock ruleset. /// -/// Errors are logged to stderr but not propagated - the path simply won't be -/// accessible in the sandbox. This is intentional: missing paths (like /nix/store -/// on non-NixOS) should not prevent sandbox creation. 
+/// Errors are logged but not propagated - the path simply won't be +/// accessible in the sandbox. Missing paths (like /nix/store on non-NixOS) +/// should not prevent sandbox creation. fn add_path_rule(ruleset_fd: &OwnedFd, path: impl AsRef, access: u64) { let path = path.as_ref(); let fd = match open_path(path) { @@ -159,7 +186,6 @@ fn add_path_rule(ruleset_fd: &OwnedFd, path: impl AsRef, access: u64) { parent_fd: fd.as_raw_fd(), }; if let Err(e) = landlock_add_rule_path(ruleset_fd, &rule) { - // Log but don't fail - path won't be accessible in sandbox eprintln!("warning: landlock rule for {path:?} failed: {e}"); } } @@ -175,13 +201,45 @@ fn open_path(path: impl AsRef) -> Result { } } -fn apply_seccomp() -> Result<(), LockdownError> { - let filter = build_whitelist_filter(DEFAULT_WHITELIST); - let fprog = SockFprog { - len: filter.len() as u16, - filter: filter.as_ptr(), - }; - unsafe { seccomp_set_mode_filter(&fprog) }.map_err(LockdownError::Seccomp) +// Securebits constants (from ) +const SECBIT_NOROOT: u64 = 1 << 0; +const SECBIT_NOROOT_LOCKED: u64 = 1 << 1; +const SECBIT_NO_SETUID_FIXUP: u64 = 1 << 2; +const SECBIT_NO_SETUID_FIXUP_LOCKED: u64 = 1 << 3; +const SECBIT_KEEP_CAPS_LOCKED: u64 = 1 << 5; +const SECBIT_NO_CAP_AMBIENT_RAISE: u64 = 1 << 6; +const SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED: u64 = 1 << 7; + +/// Apply securebits to lock capability state permanently. +/// +/// This prevents the process from ever regaining capabilities through +/// any mechanism (exec of setuid, ambient capabilities, etc.). +fn apply_securebits() -> Result<(), LockdownError> { + let bits = SECBIT_NOROOT + | SECBIT_NOROOT_LOCKED + | SECBIT_NO_SETUID_FIXUP + | SECBIT_NO_SETUID_FIXUP_LOCKED + | SECBIT_KEEP_CAPS_LOCKED + | SECBIT_NO_CAP_AMBIENT_RAISE + | SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED; + + let ret = unsafe { libc::prctl(libc::PR_SET_SECUREBITS, bits, 0, 0, 0) }; + if ret != 0 { + // Not fatal — securebits may require capabilities we don't have. 
+ // The important thing is NO_NEW_PRIVS + dropping all caps. + eprintln!("warning: PR_SET_SECUREBITS failed: {}", last_errno()); + } + Ok(()) +} + +/// Set `PR_SET_NO_NEW_PRIVS` — required before `landlock_restrict_self` and seccomp. +fn set_no_new_privs() -> Result<(), LockdownError> { + let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) }; + if ret != 0 { + Err(LockdownError::Capability(last_errno())) + } else { + Ok(()) + } } fn drop_all_caps() -> Result<(), LockdownError> { @@ -197,33 +255,18 @@ fn drop_all_caps() -> Result<(), LockdownError> { libc::prctl(libc::PR_CAPBSET_DROP, cap, 0, 0, 0); } } - - let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) }; - if ret != 0 { - Err(LockdownError::Capability(last_errno())) - } else { - Ok(()) - } + Ok(()) } -fn close_extra_fds() -> Result<(), LockdownError> { - let mut fds_to_close = Vec::new(); - - if let Ok(entries) = std::fs::read_dir("/proc/self/fd") { - for entry in entries.flatten() { - if let Ok(fd) = entry.file_name().to_string_lossy().parse::() { - if fd > 2 { - fds_to_close.push(fd); - } - } - } - } - - for fd in fds_to_close { - unsafe { libc::close(fd) }; +/// Close all file descriptors > 2 using `close_range` syscall. +/// +/// This is called separately from lockdown because it must happen after +/// seccomp filter installation and listener fd transfer. +pub fn close_extra_fds() { + // close_range(3, MAX, 0) — close all fds from 3 to MAX + unsafe { + libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, 0u32); } - - Ok(()) } #[cfg(test)] diff --git a/crates/evalbox-sandbox/src/isolation/mod.rs b/crates/evalbox-sandbox/src/isolation/mod.rs index 4b1c334..4e69f46 100644 --- a/crates/evalbox-sandbox/src/isolation/mod.rs +++ b/crates/evalbox-sandbox/src/isolation/mod.rs @@ -2,16 +2,10 @@ //! //! This module contains all the security isolation layers: //! -//! - **namespace** - User namespace and ID mapping setup -//! 
- **rootfs** - Filesystem setup (bind mounts, `pivot_root`, rlimits) -//! - **lockdown** - Security restrictions (Landlock, seccomp, capabilities) +//! - **lockdown** - Security restrictions (Landlock v5, seccomp, securebits, capabilities) +//! - **rlimits** - Resource limits (memory, CPU, files, processes) mod lockdown; -mod namespace; -mod rootfs; +pub mod rlimits; -pub use lockdown::{LockdownError, lockdown}; -pub use namespace::setup_id_maps; -pub use rootfs::{ - bind_mount, make_rprivate, mount_minimal_dev, mount_proc, pivot_root_and_cleanup, set_hostname, -}; +pub use lockdown::{LockdownError, close_extra_fds, lockdown}; diff --git a/crates/evalbox-sandbox/src/isolation/namespace.rs b/crates/evalbox-sandbox/src/isolation/namespace.rs deleted file mode 100644 index 23ad83b..0000000 --- a/crates/evalbox-sandbox/src/isolation/namespace.rs +++ /dev/null @@ -1,83 +0,0 @@ -//! User namespace and ID mapping setup. -//! -//! Sets up UID/GID mappings so the sandboxed process runs as root (UID 0) -//! inside the namespace, but maps to the real user outside. -//! -//! ## How It Works -//! -//! ```text -//! Outside namespace: uid=1000 (real user) -//! │ -//! ┌─────▼─────┐ -//! │ uid_map │ "0 1000 1" -//! └─────┬─────┘ -//! │ -//! Inside namespace: uid=0 (appears as root) -//! ``` -//! -//! ## Security -//! -//! - `deny_setgroups` must be called BEFORE writing `gid_map` (kernel requirement) -//! - The process appears as root inside but has no real privileges -//! - This enables `pivot_root` and mount operations inside the namespace - -use std::fs; -use std::io; - -/// Write UID mapping for a process in a user namespace. -/// -/// Maps `inside_uid` (seen inside namespace) to `outside_uid` (real UID). -/// The "1" at the end means we map exactly one UID. 
-pub fn write_uid_map(pid: libc::pid_t, inside_uid: u32, outside_uid: u32) -> io::Result<()> { - fs::write( - format!("/proc/{pid}/uid_map"), - format!("{inside_uid} {outside_uid} 1\n"), - ) -} - -/// Write GID mapping for a process in a user namespace. -/// -/// Maps `inside_gid` (seen inside namespace) to `outside_gid` (real GID). -pub fn write_gid_map(pid: libc::pid_t, inside_gid: u32, outside_gid: u32) -> io::Result<()> { - fs::write( - format!("/proc/{pid}/gid_map"), - format!("{inside_gid} {outside_gid} 1\n"), - ) -} - -/// Deny setgroups syscall for a process. -/// -/// # Safety Order -/// -/// MUST be called before `write_gid_map`. The kernel requires this to prevent -/// privilege escalation via group manipulation. -pub fn deny_setgroups(pid: libc::pid_t) -> io::Result<()> { - fs::write(format!("/proc/{pid}/setgroups"), "deny\n") -} - -/// Set up complete ID mappings for a child process. -/// -/// Maps UID 0 and GID 0 inside the namespace to the current user's -/// real UID/GID outside. This allows the sandboxed process to appear -/// as root while having no actual privileges. -pub fn setup_id_maps(child_pid: libc::pid_t) -> io::Result<()> { - // SAFETY: getuid/getgid are always safe to call - let uid = unsafe { libc::getuid() }; - let gid = unsafe { libc::getgid() }; - - // SAFETY: deny_setgroups MUST come before write_gid_map - deny_setgroups(child_pid)?; - write_uid_map(child_pid, 0, uid)?; - write_gid_map(child_pid, 0, gid) -} - -#[cfg(test)] -mod tests { - #[test] - fn current_uid_gid() { - // SAFETY: getuid/getgid are always safe - let uid = unsafe { libc::getuid() }; - let gid = unsafe { libc::getgid() }; - assert!(uid > 0 || gid > 0); - } -} diff --git a/crates/evalbox-sandbox/src/isolation/rlimits.rs b/crates/evalbox-sandbox/src/isolation/rlimits.rs new file mode 100644 index 0000000..67d97b8 --- /dev/null +++ b/crates/evalbox-sandbox/src/isolation/rlimits.rs @@ -0,0 +1,71 @@ +//! Resource limits for sandboxed processes. +//! +//! 
Sets kernel-enforced resource limits to prevent denial-of-service. +//! +//! ## Limits Applied +//! +//! | Limit | Purpose | Default | +//! |-------|---------|---------| +//! | `RLIMIT_DATA` | Memory usage | 256 MiB | +//! | `RLIMIT_CPU` | CPU time | timeout * 2 + 60s | +//! | `RLIMIT_FSIZE` | Output file size | 16 MiB | +//! | `RLIMIT_NOFILE` | Open file descriptors | 256 | +//! | `RLIMIT_NPROC` | Max processes | 64 | +//! | `RLIMIT_CORE` | Core dump size | 0 (disabled) | +//! | `RLIMIT_STACK` | Stack size | 8 MiB | +//! +//! ## Note on `RLIMIT_AS` +//! +//! We intentionally do NOT set `RLIMIT_AS` (virtual address space). +//! Modern runtimes like Go, Java, and V8 pre-allocate large virtual address +//! ranges but only commit small portions. `RLIMIT_AS` would break these +//! runtimes. `RLIMIT_DATA` limits actual memory and is more appropriate. + +use evalbox_sys::last_errno; +use rustix::io::Errno; + +use crate::plan::Plan; + +/// Apply resource limits based on the sandbox plan. +pub fn apply_rlimits(plan: &Plan) -> Result<(), Errno> { + let cpu_secs = plan.timeout.as_secs().saturating_mul(2).saturating_add(60); + + set_rlimit(libc::RLIMIT_DATA, plan.memory_limit)?; + set_rlimit(libc::RLIMIT_CPU, cpu_secs)?; + set_rlimit(libc::RLIMIT_FSIZE, plan.max_output)?; + set_rlimit(libc::RLIMIT_NOFILE, 256)?; + set_rlimit(libc::RLIMIT_NPROC, u64::from(plan.max_pids))?; + set_rlimit(libc::RLIMIT_CORE, 0)?; + set_rlimit(libc::RLIMIT_STACK, 8 * 1024 * 1024)?; + Ok(()) +} + +#[inline] +fn set_rlimit(resource: libc::__rlimit_resource_t, limit: u64) -> Result<(), Errno> { + let rlim = libc::rlimit { + rlim_cur: limit, + rlim_max: limit, + }; + // SAFETY: rlim is valid, resource is a valid constant. 
+ if unsafe { libc::setrlimit(resource, &rlim) } != 0 { + Err(last_errno()) + } else { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + #[test] + fn get_current_nofile() { + let mut rlim = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + assert_eq!( + unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) }, + 0 + ); + assert!(rlim.rlim_cur > 0); + } +} diff --git a/crates/evalbox-sandbox/src/isolation/rootfs.rs b/crates/evalbox-sandbox/src/isolation/rootfs.rs deleted file mode 100644 index 09892ba..0000000 --- a/crates/evalbox-sandbox/src/isolation/rootfs.rs +++ /dev/null @@ -1,267 +0,0 @@ -//! Rootfs setup and resource limits for sandboxed processes. -//! -//! This module handles: -//! - Bind mounts for the sandbox filesystem -//! - Pivot root to isolate the filesystem -//! - Resource limits (rlimits) -//! -//! ## Filesystem Layout (after `pivot_root`) -//! -//! ```text -//! / -//! ├── bin/ → bind mount from /bin (read-only) -//! ├── dev/ → bind mounts: null, zero, urandom, random + symlinks -//! ├── etc/ → bind mount from /etc (read-only) -//! ├── home/ → empty, writable -//! ├── lib/ → bind mount from /lib (read-only) -//! ├── lib64/ → bind mount from /lib64 if exists (read-only) -//! ├── nix/ → bind mount from /nix/store on NixOS (read-only) -//! ├── proc/ → bind mount from /proc (read-only) -//! ├── tmp/ → empty, writable -//! ├── usr/ → bind mount from /usr (read-only) -//! └── work/ → user code directory, writable -//! ``` - -use std::ffi::CString; -use std::os::unix::ffi::OsStrExt; -use std::path::Path; - -use evalbox_sys::last_errno; -use rustix::io::Errno; -use rustix::process::pivot_root; - -use crate::plan::Plan; - -/// Make all mounts private recursively. 
-pub fn make_rprivate() -> Result<(), Errno> { - let ret = unsafe { - libc::mount( - std::ptr::null(), - c"/".as_ptr(), - std::ptr::null(), - libc::MS_REC | libc::MS_PRIVATE, - std::ptr::null(), - ) - }; - if ret != 0 { Err(last_errno()) } else { Ok(()) } -} - -/// Mount proc filesystem (bind-mounted read-only from host). -pub fn mount_proc(target: &Path) -> Result<(), Errno> { - let target_c = path_to_cstring(target)?; - - let ret = unsafe { - libc::mount( - c"/proc".as_ptr(), - target_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND | libc::MS_REC, - std::ptr::null(), - ) - }; - if ret != 0 { - return Err(last_errno()); - } - - let ret = unsafe { - libc::mount( - std::ptr::null(), - target_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND - | libc::MS_REMOUNT - | libc::MS_RDONLY - | libc::MS_NOSUID - | libc::MS_NODEV - | libc::MS_NOEXEC, - std::ptr::null(), - ) - }; - if ret != 0 { Err(last_errno()) } else { Ok(()) } -} - -/// Create minimal /dev with null, zero, urandom (bind-mounted from host). 
-pub fn mount_minimal_dev(target: &Path) -> Result<(), Errno> { - for dev in ["null", "zero", "urandom", "random"] { - bind_mount_dev(target, dev)?; - } - - let fd_path = target.join("fd"); - let fd_c = path_to_cstring(&fd_path)?; - if unsafe { libc::symlink(c"/proc/self/fd".as_ptr(), fd_c.as_ptr()) } != 0 { - return Err(last_errno()); - } - - for (name, num) in [("stdin", 0), ("stdout", 1), ("stderr", 2)] { - let link_path = target.join(name); - let link_c = path_to_cstring(&link_path)?; - let target_str = CString::new(format!("/proc/self/fd/{num}")).map_err(|_| Errno::INVAL)?; - if unsafe { libc::symlink(target_str.as_ptr(), link_c.as_ptr()) } != 0 { - return Err(last_errno()); - } - } - - Ok(()) -} - -fn bind_mount_dev(target_dev: &Path, name: &str) -> Result<(), Errno> { - let source = Path::new("/dev").join(name); - let target = target_dev.join(name); - let target_c = path_to_cstring(&target)?; - let source_c = path_to_cstring(&source)?; - - let fd = unsafe { libc::open(target_c.as_ptr(), libc::O_CREAT | libc::O_WRONLY, 0o644) }; - if fd < 0 { - return Err(last_errno()); - } - unsafe { libc::close(fd) }; - - let ret = unsafe { - libc::mount( - source_c.as_ptr(), - target_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND, - std::ptr::null(), - ) - }; - if ret != 0 { Err(last_errno()) } else { Ok(()) } -} - -/// Bind mount a path. 
-pub fn bind_mount(source: &Path, target: &Path, readonly: bool) -> Result<(), Errno> { - let source_c = path_to_cstring(source)?; - let target_c = path_to_cstring(target)?; - - let ret = unsafe { - libc::mount( - source_c.as_ptr(), - target_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND | libc::MS_REC, - std::ptr::null(), - ) - }; - if ret != 0 { - return Err(last_errno()); - } - - if readonly { - let ret = unsafe { - libc::mount( - std::ptr::null(), - target_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND | libc::MS_REMOUNT | libc::MS_RDONLY, - std::ptr::null(), - ) - }; - if ret != 0 { - return Err(last_errno()); - } - } - - Ok(()) -} - -/// Perform `pivot_root` and clean up the old root. -pub fn pivot_root_and_cleanup(new_root: &Path) -> Result<(), Errno> { - let new_root_c = path_to_cstring(new_root)?; - - let ret = unsafe { - libc::mount( - new_root_c.as_ptr(), - new_root_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND | libc::MS_REC, - std::ptr::null(), - ) - }; - if ret != 0 { - return Err(last_errno()); - } - - let old_root = new_root.join(".old_root"); - let old_root_c = path_to_cstring(&old_root)?; - unsafe { libc::mkdir(old_root_c.as_ptr(), 0o700) }; - - let new_root_cstr = CString::new(new_root_c.as_bytes()).map_err(|_| Errno::INVAL)?; - let old_root_cstr = CString::new(old_root_c.as_bytes()).map_err(|_| Errno::INVAL)?; - pivot_root(new_root_cstr.as_c_str(), old_root_cstr.as_c_str())?; - - unsafe { - libc::chdir(c"/".as_ptr()); - libc::umount2(c"/.old_root".as_ptr(), libc::MNT_DETACH); - libc::rmdir(c"/.old_root".as_ptr()); - } - - Ok(()) -} - -/// Set the hostname. -pub fn set_hostname(name: &str) -> Result<(), Errno> { - let ret = unsafe { libc::sethostname(name.as_ptr().cast::(), name.len()) }; - if ret != 0 { Err(last_errno()) } else { Ok(()) } -} - -#[inline] -fn path_to_cstring(path: &Path) -> Result { - CString::new(path.as_os_str().as_bytes()).map_err(|_| Errno::INVAL) -} - -/// Apply resource limits based on the sandbox plan. 
-pub fn apply_rlimits(plan: &Plan) -> Result<(), Errno> { - let cpu_secs = plan.timeout.as_secs().saturating_mul(2).saturating_add(60); - - // Note: We intentionally do NOT set RLIMIT_AS (address space). - // RLIMIT_AS limits virtual memory, which can be much larger than actual usage. - // Modern runtimes like Go, Java, and V8 pre-allocate large virtual address ranges - // but only commit (use) small portions. RLIMIT_AS would break these runtimes. - // RLIMIT_DATA limits the data segment and is more appropriate for real memory control. - set_rlimit(libc::RLIMIT_DATA, plan.memory_limit)?; - set_rlimit(libc::RLIMIT_CPU, cpu_secs)?; - set_rlimit(libc::RLIMIT_FSIZE, plan.max_output)?; - set_rlimit(libc::RLIMIT_NOFILE, 256)?; - set_rlimit(libc::RLIMIT_NPROC, u64::from(plan.max_pids))?; - set_rlimit(libc::RLIMIT_CORE, 0)?; - set_rlimit(libc::RLIMIT_STACK, 8 * 1024 * 1024)?; - Ok(()) -} - -#[inline] -fn set_rlimit(resource: libc::__rlimit_resource_t, limit: u64) -> Result<(), Errno> { - let rlim = libc::rlimit { - rlim_cur: limit, - rlim_max: limit, - }; - // SAFETY: rlim is valid, resource is a valid constant. - if unsafe { libc::setrlimit(resource, &rlim) } != 0 { - Err(last_errno()) - } else { - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn path_to_cstring_valid() { - let cstr = path_to_cstring(Path::new("/tmp/test")).unwrap(); - assert_eq!(cstr.as_bytes(), b"/tmp/test"); - } - - #[test] - fn get_current_nofile() { - let mut rlim = libc::rlimit { - rlim_cur: 0, - rlim_max: 0, - }; - assert_eq!( - unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) }, - 0 - ); - assert!(rlim.rlim_cur > 0); - } -} diff --git a/crates/evalbox-sandbox/src/lib.rs b/crates/evalbox-sandbox/src/lib.rs index bfd4bfb..006b14a 100644 --- a/crates/evalbox-sandbox/src/lib.rs +++ b/crates/evalbox-sandbox/src/lib.rs @@ -3,12 +3,13 @@ //! This crate provides secure sandboxed execution of untrusted code on Linux. //! 
It combines multiple isolation mechanisms for defense in depth: //! -//! - **User namespaces** - Unprivileged containers, UID 0 inside = real user outside -//! - **Mount namespaces** - Private filesystem view with minimal bind mounts -//! - **Pivot root** - Change root directory, unmount host filesystem -//! - **Landlock** - Filesystem and network access control (kernel 5.13+) +//! - **Landlock v5** - Filesystem, network, signal, and IPC access control //! - **Seccomp-BPF** - Syscall whitelist (~40 allowed syscalls) +//! - **Seccomp User Notify** - Optional syscall interception for FS virtualization //! - **Rlimits** - Resource limits (memory, CPU, files, processes) +//! - **Capabilities** - All capabilities dropped, `NO_NEW_PRIVS` enforced +//! +//! No user namespaces required — works inside Docker with default seccomp profile. //! //! ## Quick Start //! @@ -22,8 +23,7 @@ //! //! ## Requirements //! -//! - Linux kernel 5.13+ (for Landlock ABI 1+) -//! - User namespaces enabled (`/proc/sys/kernel/unprivileged_userns_clone = 1`) +//! - Linux kernel 6.12+ (for Landlock ABI 5) //! - Seccomp enabled in kernel #![allow(clippy::cast_possible_truncation)] @@ -32,6 +32,7 @@ pub mod executor; pub mod isolation; pub mod monitor; +pub mod notify; pub mod plan; pub mod resolve; pub mod sysinfo; @@ -40,10 +41,5 @@ pub mod workspace; pub use executor::{Event, Executor, ExecutorError, SandboxId}; pub use monitor::{Output, Status}; -pub use plan::{Landlock, Mount, Plan, Syscalls, UserFile}; +pub use plan::{Landlock, Mount, NotifyMode, Plan, Syscalls, UserFile}; pub use resolve::{ResolveError, ResolvedBinary, resolve_binary}; - -// Backwards compatibility -#[allow(deprecated)] -#[doc(hidden)] -pub use plan::SandboxPlan; diff --git a/crates/evalbox-sandbox/src/notify/mod.rs b/crates/evalbox-sandbox/src/notify/mod.rs new file mode 100644 index 0000000..d4912f2 --- /dev/null +++ b/crates/evalbox-sandbox/src/notify/mod.rs @@ -0,0 +1,17 @@ +//! Seccomp user notification support. +//! 
+//! This module provides the supervisor side of seccomp user notification, +//! enabling syscall interception without Linux user namespaces. +//! +//! ## Modules +//! +//! - **supervisor** - Main notification loop that handles intercepted syscalls +//! - **`virtual_fs`** - Path translation for filesystem virtualization +//! - **`scm_rights`** - Unix socket fd passing (child → parent listener fd transfer) + +pub mod scm_rights; +pub mod supervisor; +pub mod virtual_fs; + +pub use supervisor::{NotifyEvent, Supervisor}; +pub use virtual_fs::VirtualFs; diff --git a/crates/evalbox-sandbox/src/notify/scm_rights.rs b/crates/evalbox-sandbox/src/notify/scm_rights.rs new file mode 100644 index 0000000..7637613 --- /dev/null +++ b/crates/evalbox-sandbox/src/notify/scm_rights.rs @@ -0,0 +1,150 @@ +//! Unix socket fd passing via `SCM_RIGHTS`. +//! +//! After the child installs its seccomp notify filter, it receives a listener fd. +//! This fd must be passed to the parent process so the parent can handle +//! notifications. We use `SCM_RIGHTS` over an `AF_UNIX` socketpair to transfer +//! the fd across the fork boundary. + +use std::io; +use std::os::fd::{FromRawFd, OwnedFd, RawFd}; + +/// Create an `AF_UNIX SOCK_STREAM` socketpair. +/// +/// Returns `(parent_sock, child_sock)`. After fork, parent closes `child_sock` +/// and child closes `parent_sock`. +pub fn create_socketpair() -> io::Result<(OwnedFd, OwnedFd)> { + let mut fds = [0i32; 2]; + let ret = unsafe { + libc::socketpair( + libc::AF_UNIX, + libc::SOCK_STREAM | libc::SOCK_CLOEXEC, + 0, + fds.as_mut_ptr(), + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + Ok(unsafe { (OwnedFd::from_raw_fd(fds[0]), OwnedFd::from_raw_fd(fds[1])) }) +} + +/// Send a file descriptor over a unix socket using `SCM_RIGHTS`. 
+pub fn send_fd(socket: RawFd, fd: RawFd) -> io::Result<()> { + let data = [0u8; 1]; + let iov = libc::iovec { + iov_base: data.as_ptr() as *mut libc::c_void, + iov_len: 1, + }; + + // cmsg buffer: header + one fd + let cmsg_space = unsafe { libc::CMSG_SPACE(size_of::() as u32) } as usize; + let mut cmsg_buf = vec![0u8; cmsg_space]; + + let mut msg: libc::msghdr = unsafe { std::mem::zeroed() }; + msg.msg_iov = &iov as *const _ as *mut _; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf.as_mut_ptr().cast(); + msg.msg_controllen = cmsg_space; + + // Fill control message + let cmsg = unsafe { libc::CMSG_FIRSTHDR(&msg) }; + if cmsg.is_null() { + return Err(io::Error::other("CMSG_FIRSTHDR null")); + } + unsafe { + (*cmsg).cmsg_level = libc::SOL_SOCKET; + (*cmsg).cmsg_type = libc::SCM_RIGHTS; + (*cmsg).cmsg_len = libc::CMSG_LEN(size_of::() as u32) as usize; + let data_ptr = libc::CMSG_DATA(cmsg); + std::ptr::copy_nonoverlapping((&fd as *const RawFd).cast::(), data_ptr, size_of::()); + } + + let ret = unsafe { libc::sendmsg(socket, &msg, 0) }; + if ret < 0 { + Err(io::Error::last_os_error()) + } else { + Ok(()) + } +} + +/// Receive a file descriptor from a unix socket using `SCM_RIGHTS`. 
+pub fn recv_fd(socket: RawFd) -> io::Result { + let mut data = [0u8; 1]; + let mut iov = libc::iovec { + iov_base: data.as_mut_ptr().cast(), + iov_len: 1, + }; + + let cmsg_space = unsafe { libc::CMSG_SPACE(size_of::() as u32) } as usize; + let mut cmsg_buf = vec![0u8; cmsg_space]; + + let mut msg: libc::msghdr = unsafe { std::mem::zeroed() }; + msg.msg_iov = &mut iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf.as_mut_ptr().cast(); + msg.msg_controllen = cmsg_space; + + let ret = unsafe { libc::recvmsg(socket, &mut msg, 0) }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + + let cmsg = unsafe { libc::CMSG_FIRSTHDR(&msg) }; + if cmsg.is_null() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "no control message received", + )); + } + + unsafe { + if (*cmsg).cmsg_level != libc::SOL_SOCKET || (*cmsg).cmsg_type != libc::SCM_RIGHTS { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "unexpected control message type", + )); + } + let mut fd: RawFd = 0; + let data_ptr = libc::CMSG_DATA(cmsg); + std::ptr::copy_nonoverlapping(data_ptr, (&mut fd as *mut RawFd).cast::(), size_of::()); + Ok(OwnedFd::from_raw_fd(fd)) + } +} + +#[cfg(test)] +mod tests { + use std::os::fd::AsRawFd; + + use super::*; + + #[test] + fn socketpair_creation() { + let (a, b) = create_socketpair().unwrap(); + assert!(a.as_raw_fd() >= 0); + assert!(b.as_raw_fd() >= 0); + assert_ne!(a.as_raw_fd(), b.as_raw_fd()); + } + + #[test] + fn send_recv_fd() { + let (parent, child) = create_socketpair().unwrap(); + + // Create a pipe and send its read end + let mut pipe_fds = [0i32; 2]; + unsafe { libc::pipe(pipe_fds.as_mut_ptr()) }; + let pipe_read = pipe_fds[0]; + let pipe_write = pipe_fds[1]; + + send_fd(child.as_raw_fd(), pipe_read).unwrap(); + let received = recv_fd(parent.as_raw_fd()).unwrap(); + + // The received fd should be valid and different from the original + assert!(received.as_raw_fd() >= 0); + + // Clean up + unsafe { + libc::close(pipe_read); + 
libc::close(pipe_write); + } + } +} diff --git a/crates/evalbox-sandbox/src/notify/supervisor.rs b/crates/evalbox-sandbox/src/notify/supervisor.rs new file mode 100644 index 0000000..13a77fb --- /dev/null +++ b/crates/evalbox-sandbox/src/notify/supervisor.rs @@ -0,0 +1,270 @@ +//! Seccomp notification supervisor. +//! +//! Runs in the parent process, handling intercepted syscalls from the sandboxed child. +//! The supervisor receives notifications via the seccomp listener fd and decides +//! how to respond based on the configured [`NotifyMode`]. +//! +//! ## Modes +//! +//! - **Monitor**: Log syscall and return `SECCOMP_USER_NOTIF_FLAG_CONTINUE` +//! - **Virtualize**: Translate filesystem paths via [`VirtualFs`], inject fds via `ADDFD` + +use std::fs::File; +use std::io::{self, Read, Seek, SeekFrom}; +use std::os::fd::{AsRawFd, OwnedFd, RawFd}; + +use evalbox_sys::seccomp_notify::{ + SECCOMP_ADDFD_FLAG_SEND, SECCOMP_USER_NOTIF_FLAG_CONTINUE, SeccompNotif, SeccompNotifAddfd, + SeccompNotifResp, notif_addfd, notif_id_valid, notif_recv, notif_send, +}; + +use super::virtual_fs::VirtualFs; +use crate::plan::NotifyMode; + +/// Events emitted by the supervisor for future user-facing notifications. +#[derive(Debug)] +pub enum NotifyEvent { + /// A syscall was intercepted and handled. + SyscallHandled { + /// PID of the process that made the syscall. + pid: u32, + /// Syscall number. + syscall_nr: i32, + /// Whether the syscall was allowed. + allowed: bool, + }, +} + +/// Seccomp notification supervisor. +pub struct Supervisor { + listener_fd: OwnedFd, + mode: NotifyMode, + vfs: VirtualFs, +} + +impl Supervisor { + /// Create a new supervisor. + pub fn new(listener_fd: OwnedFd, mode: NotifyMode, vfs: VirtualFs) -> Self { + Self { + listener_fd, + mode, + vfs, + } + } + + /// Get the raw fd for registering with poll/mio. + pub fn fd(&self) -> RawFd { + self.listener_fd.as_raw_fd() + } + + /// Handle a notification event. Call when the listener fd is readable. 
+ /// + /// Returns `Some(NotifyEvent)` on success, `None` if the notification was + /// stale (child died or already handled). + pub fn handle_event(&self) -> io::Result> { + let mut notif = SeccompNotif::default(); + + if let Err(e) = notif_recv(self.listener_fd.as_raw_fd(), &mut notif) { + // ENOENT means the target process died before we could receive + if e == rustix::io::Errno::NOENT { + return Ok(None); + } + return Err(io::Error::from_raw_os_error(e.raw_os_error())); + } + + match self.mode { + NotifyMode::Disabled => { + debug_assert!(false, "supervisor received notification with NotifyMode::Disabled"); + self.respond_continue(¬if)?; + Ok(None) + } + NotifyMode::Monitor => self.handle_monitor(¬if), + NotifyMode::Virtualize => self.handle_virtualize(¬if), + } + } + + fn handle_monitor(&self, notif: &SeccompNotif) -> io::Result> { + let syscall_name = syscall_name(notif.data.nr); + eprintln!( + "[notify] pid={} syscall={}({}) args=[{:#x}, {:#x}, {:#x}]", + notif.pid, + syscall_name, + notif.data.nr, + notif.data.args[0], + notif.data.args[1], + notif.data.args[2], + ); + + self.respond_continue(notif)?; + + Ok(Some(NotifyEvent::SyscallHandled { + pid: notif.pid, + syscall_nr: notif.data.nr, + allowed: true, + })) + } + + fn handle_virtualize(&self, notif: &SeccompNotif) -> io::Result> { + let syscall_nr = notif.data.nr; + + // For openat-family syscalls, args[1] is the pathname pointer + // For open/creat, args[0] is the pathname pointer + let path_addr = if syscall_nr == libc::SYS_openat as i32 + || syscall_nr == libc::SYS_newfstatat as i32 + || syscall_nr == libc::SYS_faccessat as i32 + || syscall_nr == libc::SYS_faccessat2 as i32 + || syscall_nr == libc::SYS_readlinkat as i32 + { + notif.data.args[1] + } else { + notif.data.args[0] + }; + + // Read path from child's memory + let path = match self.read_child_string(notif.pid, path_addr) { + Ok(p) => p, + Err(_) => { + // Can't read memory, let syscall proceed + self.respond_continue(notif)?; + return 
Ok(None); + } + }; + + // TOCTOU check: verify notification is still valid after reading memory + if notif_id_valid(self.listener_fd.as_raw_fd(), notif.id).is_err() { + return Ok(None); // Notification is stale + } + + // Try to translate path + if let Some(real_path) = self.vfs.translate(&path) { + // For openat: open the file ourselves and inject the fd + if syscall_nr == libc::SYS_openat as i32 + || syscall_nr == libc::SYS_open as i32 + || syscall_nr == libc::SYS_creat as i32 + { + let flags = if syscall_nr == libc::SYS_openat as i32 { + notif.data.args[2] as i32 + } else { + notif.data.args[1] as i32 + }; + + match self.open_and_inject(notif, &real_path, flags) { + Ok(()) => { + return Ok(Some(NotifyEvent::SyscallHandled { + pid: notif.pid, + syscall_nr, + allowed: true, + })); + } + Err(_) => { + // Fall through to continue + } + } + } + } + + // No translation or non-open syscall: let it proceed as-is + self.respond_continue(notif)?; + Ok(Some(NotifyEvent::SyscallHandled { + pid: notif.pid, + syscall_nr, + allowed: true, + })) + } + + fn respond_continue(&self, notif: &SeccompNotif) -> io::Result<()> { + let resp = SeccompNotifResp { + id: notif.id, + val: 0, + error: 0, + flags: SECCOMP_USER_NOTIF_FLAG_CONTINUE, + }; + notif_send(self.listener_fd.as_raw_fd(), &resp) + .map_err(|e| io::Error::from_raw_os_error(e.raw_os_error())) + } + + fn open_and_inject( + &self, + notif: &SeccompNotif, + real_path: &std::path::Path, + flags: i32, + ) -> io::Result<()> { + use std::ffi::CString; + use std::os::unix::ffi::OsStrExt; + + let path_c = CString::new(real_path.as_os_str().as_bytes()) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "invalid path"))?; + + // Open the file at the translated path + let fd = unsafe { libc::open(path_c.as_ptr(), flags & !libc::O_CLOEXEC, 0o666) }; + if fd < 0 { + return Err(io::Error::last_os_error()); + } + + // Inject the fd into the child and atomically respond + let addfd = SeccompNotifAddfd { + id: notif.id, + flags: 
SECCOMP_ADDFD_FLAG_SEND, + srcfd: fd as u32, + newfd: 0, + newfd_flags: 0, + }; + + let result = notif_addfd(self.listener_fd.as_raw_fd(), &addfd) + .map_err(|e| io::Error::from_raw_os_error(e.raw_os_error())); + + // Close our copy of the fd + unsafe { libc::close(fd) }; + + result.map(|_| ()) + } + + /// Read a null-terminated string from the child's memory via `/proc/pid/mem`. + fn read_child_string(&self, pid: u32, addr: u64) -> io::Result { + let mem_path = format!("/proc/{pid}/mem"); + let mut file = File::open(&mem_path)?; + file.seek(SeekFrom::Start(addr))?; + + let mut buf = vec![0u8; 4096]; + let n = file.read(&mut buf)?; + buf.truncate(n); + + // Find null terminator + if let Some(nul_pos) = buf.iter().position(|&b| b == 0) { + buf.truncate(nul_pos); + } + + String::from_utf8(buf).map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid UTF-8 in path")) + } +} + +/// Map syscall number to name for logging. +fn syscall_name(nr: i32) -> &'static str { + match nr as i64 { + libc::SYS_openat => "openat", + libc::SYS_open => "open", + libc::SYS_creat => "creat", + libc::SYS_access => "access", + libc::SYS_faccessat => "faccessat", + libc::SYS_faccessat2 => "faccessat2", + libc::SYS_stat => "stat", + libc::SYS_lstat => "lstat", + libc::SYS_newfstatat => "newfstatat", + libc::SYS_statx => "statx", + libc::SYS_readlink => "readlink", + libc::SYS_readlinkat => "readlinkat", + _ => "unknown", + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn syscall_names() { + assert_eq!(syscall_name(libc::SYS_openat as i32), "openat"); + assert_eq!(syscall_name(libc::SYS_stat as i32), "stat"); + assert_eq!(syscall_name(9999), "unknown"); + } +} diff --git a/crates/evalbox-sandbox/src/notify/virtual_fs.rs b/crates/evalbox-sandbox/src/notify/virtual_fs.rs new file mode 100644 index 0000000..7c7df5b --- /dev/null +++ b/crates/evalbox-sandbox/src/notify/virtual_fs.rs @@ -0,0 +1,139 @@ +//! Virtual filesystem path translation. +//! +//! 
Maps paths from the child's perspective to real paths on the host. +//! Used by the supervisor in `Virtualize` mode to translate filesystem +//! syscalls to the correct workspace paths. +//! +//! ## Default Mappings +//! +//! | Child sees | Host path | +//! |-----------|-----------| +//! | `/work` | `{workspace}/work` | +//! | `/tmp` | `{workspace}/tmp` | +//! | `/home` | `{workspace}/home` | + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; + +/// Virtual filesystem with path translation. +#[derive(Debug, Clone)] +pub struct VirtualFs { + /// Maps virtual prefix → real prefix. + mappings: HashMap, +} + +impl VirtualFs { + /// Create a new `VirtualFs` with default mappings for the given workspace root. + pub fn new(workspace_root: &Path) -> Self { + let mut mappings = HashMap::new(); + mappings.insert(PathBuf::from("/work"), workspace_root.join("work")); + mappings.insert(PathBuf::from("/tmp"), workspace_root.join("tmp")); + mappings.insert(PathBuf::from("/home"), workspace_root.join("home")); + Self { mappings } + } + + /// Create an empty `VirtualFs` with no mappings. + pub fn empty() -> Self { + Self { + mappings: HashMap::new(), + } + } + + /// Add a path mapping. + pub fn add_mapping(&mut self, virtual_path: impl Into, real_path: impl Into) { + self.mappings.insert(virtual_path.into(), real_path.into()); + } + + /// Translate a path from child's view to host's view. + /// + /// Returns `Some(real_path)` if the path matches a mapping, + /// `None` if the path should be accessed as-is (passthrough). + pub fn translate(&self, path: &str) -> Option { + let path = Path::new(path); + for (virtual_prefix, real_prefix) in &self.mappings { + if let Ok(suffix) = path.strip_prefix(virtual_prefix) { + return Some(real_prefix.join(suffix)); + } + } + None + } + + /// Check if a path is within any allowed scope. + /// + /// In `Virtualize` mode, only paths within mappings or system paths are allowed. 
+ pub fn is_allowed(&self, path: &str) -> bool { + let path = Path::new(path); + + // Check virtual mappings + for virtual_prefix in self.mappings.keys() { + if path.starts_with(virtual_prefix) { + return true; + } + } + + // Allow common system paths (read-only, handled by Landlock) + let system_prefixes = ["/usr", "/bin", "/lib", "/lib64", "/etc", "/proc", "/dev"]; + for prefix in &system_prefixes { + if path.starts_with(prefix) { + return true; + } + } + + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn default_mappings() { + let vfs = VirtualFs::new(Path::new("/tmp/evalbox-abc123")); + + assert_eq!( + vfs.translate("/work/main.py"), + Some(PathBuf::from("/tmp/evalbox-abc123/work/main.py")) + ); + assert_eq!( + vfs.translate("/tmp/output.txt"), + Some(PathBuf::from("/tmp/evalbox-abc123/tmp/output.txt")) + ); + assert_eq!( + vfs.translate("/home/.bashrc"), + Some(PathBuf::from("/tmp/evalbox-abc123/home/.bashrc")) + ); + } + + #[test] + fn no_translation_for_system_paths() { + let vfs = VirtualFs::new(Path::new("/tmp/evalbox-abc123")); + assert_eq!(vfs.translate("/usr/bin/python3"), None); + assert_eq!(vfs.translate("/etc/passwd"), None); + } + + #[test] + fn is_allowed_checks() { + let vfs = VirtualFs::new(Path::new("/tmp/evalbox-abc123")); + + assert!(vfs.is_allowed("/work/test.py")); + assert!(vfs.is_allowed("/tmp/output")); + assert!(vfs.is_allowed("/usr/bin/python3")); + assert!(vfs.is_allowed("/etc/passwd")); + assert!(vfs.is_allowed("/proc/self/status")); + assert!(!vfs.is_allowed("/root/.ssh/id_rsa")); + assert!(!vfs.is_allowed("/var/log/syslog")); + } + + #[test] + fn custom_mapping() { + let mut vfs = VirtualFs::empty(); + vfs.add_mapping("/data", "/mnt/shared/data"); + + assert_eq!( + vfs.translate("/data/file.csv"), + Some(PathBuf::from("/mnt/shared/data/file.csv")) + ); + assert_eq!(vfs.translate("/work/test"), None); + } +} diff --git a/crates/evalbox-sandbox/src/plan.rs b/crates/evalbox-sandbox/src/plan.rs index 
31dc2a6..62a31ae 100644 --- a/crates/evalbox-sandbox/src/plan.rs +++ b/crates/evalbox-sandbox/src/plan.rs @@ -41,6 +41,22 @@ use std::collections::{HashMap, HashSet}; use std::path::PathBuf; use std::time::Duration; +/// Seccomp user notification mode. +/// +/// Controls how the supervisor handles intercepted syscalls from the sandboxed child. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum NotifyMode { + /// No seccomp notify filter installed. Zero overhead. Default. + #[default] + Disabled, + /// Supervisor logs syscalls and returns `SECCOMP_USER_NOTIF_FLAG_CONTINUE`. + /// Minimal overhead. For debugging/auditing. + Monitor, + /// Supervisor intercepts FS syscalls, translates paths via `VirtualFs`, + /// opens files at translated paths, injects fd via `SECCOMP_IOCTL_NOTIF_ADDFD`. + Virtualize, +} + /// Mount point configuration. /// /// This is the canonical Mount type used throughout evalbox. @@ -273,7 +289,7 @@ impl UserFile { #[derive(Debug, Clone)] pub struct Plan { pub cmd: Vec, - /// Pre-resolved binary path. If set, sandbox uses this instead of resolving `cmd\[0\]`. + /// Pre-resolved binary path. If set, sandbox uses this instead of resolving `cmd[0]`. /// This allows evalbox to do binary resolution before calling sandbox. pub binary_path: Option, pub env: HashMap, @@ -291,12 +307,10 @@ pub struct Plan { pub syscalls: Option, /// Custom Landlock configuration. pub landlock: Option, + /// Seccomp user notification mode. + pub notify_mode: NotifyMode, } -/// Type alias for backwards compatibility. -#[deprecated(since = "0.2.0", note = "Use `Plan` instead")] -pub type SandboxPlan = Plan; - impl Default for Plan { fn default() -> Self { Self { @@ -315,6 +329,7 @@ impl Default for Plan { network_blocked: true, syscalls: None, landlock: None, + notify_mode: NotifyMode::Disabled, } } } @@ -425,6 +440,16 @@ impl Plan { self } + /// Set the seccomp user notification mode. + /// + /// - `Disabled` (default): No notify filter, zero overhead. 
+ /// - `Monitor`: Log intercepted syscalls for debugging. + /// - `Virtualize`: Full filesystem virtualization via path translation. + pub fn notify_mode(mut self, mode: NotifyMode) -> Self { + self.notify_mode = mode; + self + } + /// Execute this plan (convenience method). /// /// Equivalent to `Executor::run(self)`. diff --git a/crates/evalbox-sandbox/src/resolve.rs b/crates/evalbox-sandbox/src/resolve.rs index 1a7b757..7fd934d 100644 --- a/crates/evalbox-sandbox/src/resolve.rs +++ b/crates/evalbox-sandbox/src/resolve.rs @@ -101,7 +101,8 @@ mod tests { let sys_paths = &*SYSTEM_PATHS; let mounts = detect_mounts(Path::new("/usr/bin/echo"), sys_paths); - if sys_paths.system_type == SystemType::Fhs { + // Only check for /usr mount if we're on an actual FHS system with /usr + if sys_paths.system_type == SystemType::Fhs && Path::new("/usr").exists() { assert!(mounts.iter().any(|m| m.source == Path::new("/usr"))); } } diff --git a/crates/evalbox-sandbox/src/workspace.rs b/crates/evalbox-sandbox/src/workspace.rs index 68b2001..dcf5d42 100644 --- a/crates/evalbox-sandbox/src/workspace.rs +++ b/crates/evalbox-sandbox/src/workspace.rs @@ -1,14 +1,14 @@ //! Workspace and pipe management for sandboxed execution. //! -//! The workspace is a temporary directory that becomes the sandbox root after `pivot_root`. -//! It contains all the pipes for parent-child communication. +//! The workspace is a temporary directory containing the sandbox's writable areas +//! and all the pipes for parent-child communication. //! //! ## Pipes //! //! - **stdin**: Parent writes → Child reads //! - **stdout**: Child writes → Parent reads //! - **stderr**: Child writes → Parent reads -//! - **sync**: Eventfd pair for parent-child synchronization (UID map setup) +//! - **sync**: Eventfd for parent-child synchronization //! //! ## Important: Pipe Hygiene //! @@ -59,6 +59,9 @@ impl Pipe { } /// Eventfd-based parent-child synchronization. 
+/// +/// Used when `NotifyMode::Disabled` — the child signals readiness via eventfd +/// after completing setup, and the parent writes back to let it proceed to exec. #[derive(Debug)] pub struct SyncPair { pub child_ready: OwnedFd, @@ -150,7 +153,6 @@ impl Workspace { fs::write(&full, content)?; if executable { - // Set executable permission (rwxr-xr-x) fs::set_permissions(&full, fs::Permissions::from_mode(0o755))?; } @@ -163,57 +165,14 @@ impl Workspace { Ok(full) } + /// Create standard sandbox directories. + /// + /// Only creates the writable workspace directories (work, tmp, home). + /// No rootfs directories (proc, dev, etc.) needed since we don't use `pivot_root`. pub fn setup_sandbox_dirs(&self) -> io::Result<()> { - for dir in [ - "proc", "dev", "tmp", "home", "work", "usr", "bin", "lib", "lib64", "etc", - ] { + for dir in ["work", "tmp", "home"] { self.create_dir(dir)?; } - self.setup_minimal_etc()?; - Ok(()) - } - - /// Create minimal /etc files to prevent information leakage. - /// - /// Instead of mounting the host's /etc (which contains sensitive info like - /// /etc/passwd, /etc/shadow), we create a minimal /etc with only essential files. 
- pub fn setup_minimal_etc(&self) -> io::Result<()> { - let etc = self.root.join("etc"); - - // Minimal /etc/passwd - just nobody user - fs::write( - etc.join("passwd"), - "nobody:x:65534:65534:Unprivileged user:/nonexistent:/usr/sbin/nologin\n", - )?; - - // Minimal /etc/group - just nobody group - fs::write(etc.join("group"), "nogroup:x:65534:\n")?; - - // Minimal /etc/hosts - localhost only - fs::write(etc.join("hosts"), "127.0.0.1 localhost\n::1 localhost\n")?; - - // Minimal /etc/nsswitch.conf - required for name resolution - fs::write( - etc.join("nsswitch.conf"), - "passwd: files\ngroup: files\nhosts: files dns\n", - )?; - - // Copy /etc/ld.so.cache from host if it exists (needed for dynamic linking) - let host_ldcache = Path::new("/etc/ld.so.cache"); - if host_ldcache.exists() { - if let Ok(content) = fs::read(host_ldcache) { - fs::write(etc.join("ld.so.cache"), content)?; - } - } - - // Create /etc/ssl directory for certificates - let ssl_dir = etc.join("ssl"); - fs::create_dir_all(&ssl_dir)?; - - // Minimal /etc/resolv.conf - empty (network is blocked by default) - // When network is enabled, Landlock will allow DNS - fs::write(etc.join("resolv.conf"), "# DNS disabled in sandbox\n")?; - Ok(()) } } @@ -252,4 +211,13 @@ mod tests { let perms = std::fs::metadata(&path).unwrap().permissions(); assert_eq!(perms.mode() & 0o777, 0o755); } + + #[test] + fn workspace_sandbox_dirs() { + let ws = Workspace::new().unwrap(); + ws.setup_sandbox_dirs().unwrap(); + assert!(ws.root().join("work").exists()); + assert!(ws.root().join("tmp").exists()); + assert!(ws.root().join("home").exists()); + } } diff --git a/crates/evalbox-sandbox/tests/common/mod.rs b/crates/evalbox-sandbox/tests/common/mod.rs index d863f1b..cad5812 100644 --- a/crates/evalbox-sandbox/tests/common/mod.rs +++ b/crates/evalbox-sandbox/tests/common/mod.rs @@ -63,33 +63,6 @@ fn find_payload(name: &str) -> Option { None } -/// Check if we have permission to create user namespaces. 
-pub fn can_create_namespaces() -> bool { - // Check kernel parameter - if let Ok(content) = std::fs::read_to_string("/proc/sys/kernel/unprivileged_userns_clone") { - if content.trim() == "0" { - return false; - } - } - - // Try to actually create a namespace - let result = std::process::Command::new("unshare") - .args(["--user", "--map-root-user", "true"]) - .output(); - - result.map(|o| o.status.success()).unwrap_or(false) -} - -/// Skip test if namespaces aren't available. Call at start of test. -pub fn skip_if_no_namespaces() -> bool { - if !can_create_namespaces() { - eprintln!("Skipping: Cannot create user namespaces"); - true - } else { - false - } -} - /// SIGSYS signal number (seccomp violation). pub const SIGSYS: i32 = 31; diff --git a/crates/evalbox-sandbox/tests/security/cve.rs b/crates/evalbox-sandbox/tests/security/cve.rs index a46007e..933c0a9 100644 --- a/crates/evalbox-sandbox/tests/security/cve.rs +++ b/crates/evalbox-sandbox/tests/security/cve.rs @@ -7,7 +7,7 @@ use std::time::Duration; use evalbox_sandbox::{Executor, Plan}; -use crate::common::{SIGSYS, payload, skip_if_no_namespaces}; +use crate::common::{SIGSYS, payload}; // ============================================================================= // CVE-2024-1086: nf_tables Use-After-Free @@ -23,13 +23,10 @@ use crate::common::{SIGSYS, payload, skip_if_no_namespaces}; #[test] #[ignore] fn test_cve_2024_1086_nftables_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("cve_2024_1086_nftables")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -57,13 +54,10 @@ fn test_cve_2024_1086_nftables_blocked() { #[test] #[ignore] fn test_cve_2022_0185_fsconfig_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", 
payload("cve_2022_0185_fsconfig")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -91,13 +85,10 @@ fn test_cve_2022_0185_fsconfig_blocked() { #[test] #[ignore] fn test_cve_2017_5226_tiocsti_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("cve_2017_5226_tiocsti")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -125,13 +116,10 @@ fn test_cve_2017_5226_tiocsti_blocked() { #[test] #[ignore] fn test_cve_2022_0492_cgroups_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("cve_2022_0492_cgroups")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -159,13 +147,10 @@ fn test_cve_2022_0492_cgroups_blocked() { #[test] #[ignore] fn test_fileless_memfd_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("fileless_memfd")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -190,13 +175,10 @@ fn test_fileless_memfd_blocked() { #[test] #[ignore] fn test_ioctl_tioclinux_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("ioctl_tioclinux")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -218,13 +200,10 @@ fn test_ioctl_tioclinux_blocked() { #[test] #[ignore] fn test_ioctl_tiocsetd_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) 
.executable("payload", payload("ioctl_tiocsetd")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -250,13 +229,10 @@ fn test_ioctl_tiocsetd_blocked() { #[test] #[ignore] fn test_userns_creation_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("userns_escape")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -284,13 +260,10 @@ fn test_userns_creation_blocked() { #[test] #[ignore] fn test_ptrace_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("ptrace_escape")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -317,13 +290,10 @@ fn test_ptrace_blocked() { #[test] #[ignore] fn test_cve_2019_10063_ioctl_bypass_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("cve_2019_10063_ioctl_bypass")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); diff --git a/crates/evalbox-sandbox/tests/security/filesystem.rs b/crates/evalbox-sandbox/tests/security/filesystem.rs index 93ed59d..0f9be09 100644 --- a/crates/evalbox-sandbox/tests/security/filesystem.rs +++ b/crates/evalbox-sandbox/tests/security/filesystem.rs @@ -1,23 +1,20 @@ //! Filesystem isolation tests. //! //! These tests verify that sandboxed processes cannot access -//! files outside their allowed mounts. +//! files outside their Landlock-allowed paths. +//! +//! Without `pivot_root`, the child process chdir's to `{workspace}/work`. +//! Landlock restricts filesystem access to only allowed paths. 
use std::time::Duration; use evalbox_sandbox::{Executor, Plan}; -use crate::common::skip_if_no_namespaces; - /// Test that /etc/shadow is not accessible. -/// This file contains password hashes and should never be readable. +/// Landlock only grants read access to /etc, and /etc/shadow requires root. #[test] #[ignore] fn test_cannot_read_etc_shadow() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run(Plan::new(["cat", "/etc/shadow"]).timeout(Duration::from_secs(5))) .expect("Executor should run"); @@ -31,13 +28,10 @@ fn test_cannot_read_etc_shadow() { } /// Test that /etc/passwd cannot be written to. +/// Landlock grants read-only access to /etc, so writes should be blocked. #[test] #[ignore] fn test_cannot_write_etc_passwd() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new(["sh", "-c", "echo 'hacked:x:0:0::/:/bin/sh' >> /etc/passwd"]) .timeout(Duration::from_secs(5)), @@ -48,143 +42,111 @@ fn test_cannot_write_etc_passwd() { } /// Test that /root is not accessible. +/// Landlock has no rule for /root, so access should be denied. #[test] #[ignore] fn test_cannot_access_root_home() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run(Plan::new(["ls", "/root"]).timeout(Duration::from_secs(5))) .expect("Executor should run"); assert!(!output.success(), "/root should not be accessible"); } -/// Test that the work directory is writable. +/// Test that the work directory (CWD) is writable. +/// The child chdir's to {workspace}/work, which Landlock grants read/write access to. 
#[test] #[ignore] fn test_work_dir_is_writable() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new([ "sh", "-c", - "echo 'test content' > /work/test.txt && cat /work/test.txt", + "echo 'test content' > ./test.txt && cat ./test.txt", ]) .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); - assert!(output.success(), "Should be able to write to /work"); + assert!(output.success(), "Should be able to write to CWD (work dir)"); assert_eq!(output.stdout_str().trim(), "test content"); } -/// Test that /tmp is writable. +/// Test that the workspace tmp directory is writable. +/// The workspace tmp dir is at ../tmp relative to CWD ({workspace}/work). +/// Landlock grants read/write access to the workspace root. #[test] #[ignore] fn test_tmp_is_writable() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new([ "sh", "-c", - "echo 'temp data' > /tmp/test.txt && cat /tmp/test.txt", + "echo 'temp data' > ../tmp/test.txt && cat ../tmp/test.txt", ]) .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); - assert!(output.success(), "Should be able to write to /tmp"); + assert!( + output.success(), + "Should be able to write to workspace tmp (../tmp)" + ); assert_eq!(output.stdout_str().trim(), "temp data"); } -/// Test that path traversal attempts are blocked. +/// Test that path traversal attempts are blocked by Landlock. /// -/// The sandbox creates a minimal /etc with only essential files (passwd, group, hosts). -/// Path traversal should only see the sandbox's minimal /etc, not the host's. +/// Without `pivot_root`, path traversal from CWD goes up the real filesystem. +/// Landlock should block access to paths outside the allowed set. 
#[test] #[ignore] fn test_path_traversal_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["cat", "/work/../../../etc/passwd"]).timeout(Duration::from_secs(5)), + Plan::new(["cat", "../../../etc/shadow"]).timeout(Duration::from_secs(5)), ) .expect("Executor should run"); - // The path resolves to /etc/passwd which is the sandbox's minimal passwd - if output.success() { - let content = output.stdout_str(); - - // Verify this is NOT the real host passwd - // Real passwd would have many entries (root, daemon, bin, etc.) - let line_count = content.lines().count(); - let has_nixbld = content.contains("nixbld"); // NixOS specific - let has_root = content.contains("root:"); - let has_real_users = content.contains("daemon:") || content.contains("bin:"); - - assert!( - !has_nixbld && !has_real_users && line_count <= 5, - "Path traversal should not leak real /etc/passwd.\n\ - Expected minimal sandbox passwd, got {line_count} lines:\n{content}" - ); - - // If there's root: it should be the sandbox's nobody-only passwd - if has_root { - panic!("Path traversal leaked real /etc/passwd with root entry:\n{content}"); - } - } + // Landlock should block access to /etc/shadow (no read on shadow, even via traversal) + assert!( + !output.success(), + "Path traversal to /etc/shadow should be blocked by Landlock" + ); } /// Test that symlink attacks are prevented. +/// Landlock controls access at the kernel level, so symlinks to restricted +/// paths should still be blocked. 
#[test] #[ignore] fn test_symlink_escape_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new([ "sh", "-c", - "ln -s /etc/shadow /work/shadow && cat /work/shadow", + "ln -s /etc/shadow ./shadow && cat ./shadow", ]) .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); - // Either symlink creation fails or reading it fails + // Either symlink creation fails or reading the target fails due to Landlock assert!(!output.success(), "Symlink escape should be blocked"); } -/// Test that /proc/self/exe cannot be used to escape. +/// Test that /proc/self/exe is safe. +/// Without `pivot_root`, /proc/self/exe reveals the real binary path on the host. +/// This is expected behavior -- we just verify the sandbox doesn't crash. #[test] #[ignore] fn test_proc_self_exe_safe() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run(Plan::new(["readlink", "/proc/self/exe"]).timeout(Duration::from_secs(5))) .expect("Executor should run"); - // Should not reveal host paths - if output.success() { - let exe_path = output.stdout_str(); - assert!( - !exe_path.contains("/home/") && !exe_path.contains("/usr/"), - "/proc/self/exe should not reveal host paths: {exe_path}" - ); - } + // Without pivot_root, /proc/self/exe will show the real host path. + // This is expected -- just verify the command runs without crashing. + assert!( + output.exit_code.is_some(), + "/proc/self/exe readlink should complete without crashing" + ); } diff --git a/crates/evalbox-sandbox/tests/security/network.rs b/crates/evalbox-sandbox/tests/security/network.rs index ef884cf..2603c6a 100644 --- a/crates/evalbox-sandbox/tests/security/network.rs +++ b/crates/evalbox-sandbox/tests/security/network.rs @@ -7,17 +7,11 @@ use std::time::Duration; use evalbox_sandbox::{Executor, Plan}; -use crate::common::skip_if_no_namespaces; - /// Test that network is blocked by default. /// curl should fail to connect to any external host. 
#[test] #[ignore] fn test_network_blocked_by_default() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new(["sh", "-c", "curl -s --connect-timeout 2 http://example.com || wget -q -O- --timeout=2 http://example.com"]) .timeout(Duration::from_secs(5)), @@ -31,29 +25,21 @@ fn test_network_blocked_by_default() { #[test] #[ignore] fn test_localhost_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new(["sh", "-c", "echo test | nc -w1 127.0.0.1 80 2>/dev/null"]) .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); - // Should fail - network namespace isolates us + // Should fail - seccomp blocks socket creation assert!(!output.success(), "Localhost should not be reachable"); } /// Test that external DNS resolution fails when network is blocked. -/// Note: /etc/hosts lookups may still work since the file exists in sandbox. +/// Note: /etc/hosts lookups may still work since the file exists on the host. #[test] #[ignore] fn test_external_dns_blocked() { - if skip_if_no_namespaces() { - return; - } - // Use a domain that definitely isn't in /etc/hosts let output = Executor::run( Plan::new([ @@ -83,10 +69,6 @@ fn test_external_dns_blocked() { #[test] #[ignore] fn test_network_flag_enabled() { - if skip_if_no_namespaces() { - return; - } - // Just verify that enabling network doesn't break sandbox execution let output = Executor::run( Plan::new(["sh", "-c", "echo 'network flag test'"]) @@ -109,10 +91,6 @@ fn test_network_flag_enabled() { #[test] #[ignore] fn test_loopback_isolated() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new([ "sh", @@ -123,14 +101,14 @@ fn test_loopback_isolated() { ) .expect("Executor should run"); - // The loopback might exist in the network namespace but be isolated - // This is more of a sanity check + // Without network namespaces, the host loopback is visible but + // seccomp blocks socket creation so it can't be used 
to connect. + // This is more of a sanity check that the command runs. if output.success() { - // If lo exists, verify it's the sandbox's own interface let stdout = output.stdout_str(); assert!( stdout.contains("lo") || stdout.contains("127.0.0.1"), - "Loopback should be visible if network namespace is active" + "Loopback should be visible" ); } } diff --git a/crates/evalbox-sandbox/tests/security/resources.rs b/crates/evalbox-sandbox/tests/security/resources.rs index 2075527..fa51dd0 100644 --- a/crates/evalbox-sandbox/tests/security/resources.rs +++ b/crates/evalbox-sandbox/tests/security/resources.rs @@ -7,16 +7,12 @@ use std::time::Duration; use evalbox_sandbox::{Executor, Plan, Status}; -use crate::common::{payload, skip_if_no_namespaces}; +use crate::common::payload; /// Test that timeout is enforced. #[test] #[ignore] fn test_timeout_enforced() { - if skip_if_no_namespaces() { - return; - } - let start = std::time::Instant::now(); let output = Executor::run(Plan::new(["sleep", "60"]).timeout(Duration::from_millis(500))) @@ -36,10 +32,6 @@ fn test_timeout_enforced() { #[test] #[ignore] fn test_infinite_loop_timeout() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new(["sh", "-c", "while true; do :; done"]).timeout(Duration::from_millis(500)), ) @@ -53,13 +45,10 @@ fn test_infinite_loop_timeout() { #[test] #[ignore] fn test_max_pids_enforced() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("fork_bomb")) + .binary_path("./payload") .max_pids(10) .timeout(Duration::from_secs(5)), ) @@ -80,10 +69,6 @@ fn test_max_pids_enforced() { #[test] #[ignore] fn test_output_limit_enforced() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new(["sh", "-c", "yes | head -c 100000"]) // 100KB of 'y' .max_output(1024) // 1KB limit @@ -113,10 +98,6 @@ fn test_output_limit_enforced() { #[test] 
#[ignore] fn test_memory_limit_set() { - if skip_if_no_namespaces() { - return; - } - // Check that the memory rlimit is set correctly let output = Executor::run( Plan::new([ @@ -146,10 +127,6 @@ fn test_memory_limit_set() { #[test] #[ignore] fn test_fd_limit_set() { - if skip_if_no_namespaces() { - return; - } - // Check the fd limit using ulimit let output = Executor::run(Plan::new(["sh", "-c", "ulimit -n"]).timeout(Duration::from_secs(5))) @@ -173,10 +150,6 @@ fn test_fd_limit_set() { #[test] #[ignore] fn test_cpu_intensive_timeout() { - if skip_if_no_namespaces() { - return; - } - let start = std::time::Instant::now(); // CPU-intensive work that doesn't sleep diff --git a/crates/evalbox-sandbox/tests/security/seccomp.rs b/crates/evalbox-sandbox/tests/security/seccomp.rs index bbfdc00..937dd68 100644 --- a/crates/evalbox-sandbox/tests/security/seccomp.rs +++ b/crates/evalbox-sandbox/tests/security/seccomp.rs @@ -7,20 +7,17 @@ use std::time::Duration; use evalbox_sandbox::{Executor, Plan}; -use crate::common::{SIGSYS, payload, skip_if_no_namespaces}; +use crate::common::{SIGSYS, payload}; /// Test that a simple payload can execute successfully. /// This is a control test to verify the sandbox is working. 
#[test] #[ignore] fn test_payload_can_execute() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("success")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -39,13 +36,10 @@ fn test_payload_can_execute() { #[test] #[ignore] fn test_ptrace_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_ptrace")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -64,13 +58,10 @@ fn test_ptrace_blocked() { #[test] #[ignore] fn test_mount_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_mount")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -89,13 +80,10 @@ fn test_mount_blocked() { #[test] #[ignore] fn test_reboot_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_reboot")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -114,13 +102,10 @@ fn test_reboot_blocked() { #[test] #[ignore] fn test_clone_newuser_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_clone_ns")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -139,13 +124,10 @@ fn test_clone_newuser_blocked() { #[test] #[ignore] fn test_socket_netlink_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = 
Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("socket_netlink")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -164,13 +146,10 @@ fn test_socket_netlink_blocked() { #[test] #[ignore] fn test_socket_raw_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("socket_raw")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -190,13 +169,10 @@ fn test_socket_raw_blocked() { #[test] #[ignore] fn test_keyctl_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_keyctl")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -216,13 +192,10 @@ fn test_keyctl_blocked() { #[test] #[ignore] fn test_bpf_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_bpf")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); diff --git a/crates/evalbox-sys/src/check.rs b/crates/evalbox-sys/src/check.rs index f88e3c2..6076036 100644 --- a/crates/evalbox-sys/src/check.rs +++ b/crates/evalbox-sys/src/check.rs @@ -7,9 +7,8 @@ //! //! | Feature | Minimum | Check Method | //! |---------|---------|--------------| -//! | Kernel | 5.13 | `uname` syscall | -//! | Landlock | ABI 1 | `landlock_create_ruleset` with VERSION flag | -//! | User NS | enabled | `/proc/sys/kernel/unprivileged_userns_clone` or fork+unshare test | +//! | Kernel | 6.12 | `uname` syscall | +//! | Landlock | ABI 5 | `landlock_create_ruleset` with VERSION flag | //! 
| Seccomp | enabled | `prctl(PR_GET_SECCOMP)` | //! //! ## Usage @@ -20,13 +19,6 @@ //! Err(e) => eprintln!("System not supported: {}", e), //! } //! ``` -//! -//! ## User Namespaces -//! -//! User namespace support varies by distribution: -//! - **Debian/Ubuntu**: `/proc/sys/kernel/unprivileged_userns_clone` -//! - **NixOS/Fedora**: `/proc/sys/user/max_user_namespaces` -//! - **Fallback**: Fork + unshare test use std::sync::OnceLock; @@ -41,7 +33,6 @@ use crate::seccomp; pub struct SystemInfo { pub kernel_version: (u32, u32, u32), pub landlock_abi: u32, - pub user_ns_enabled: bool, pub seccomp_enabled: bool, } @@ -57,8 +48,8 @@ pub enum CheckError { #[error("landlock is not available")] LandlockNotAvailable, - #[error("user namespaces are disabled")] - UserNamespacesDisabled, + #[error("landlock ABI {found} is too old, need at least ABI {required}")] + LandlockAbiTooOld { required: u32, found: u32 }, #[error("seccomp is not available")] SeccompNotAvailable, @@ -67,8 +58,9 @@ pub enum CheckError { KernelVersionReadFailed, } -// Minimum kernel version: 5.13 (first with Landlock) -const MIN_KERNEL_VERSION: (u32, u32, u32) = (5, 13, 0); +// Minimum kernel version: 6.12 (Landlock ABI 5 with SCOPE_SIGNAL + SCOPE_ABSTRACT_UNIX_SOCKET) +const MIN_KERNEL_VERSION: (u32, u32, u32) = (6, 12, 0); +const MIN_LANDLOCK_ABI: u32 = 5; static SYSTEM_INFO: OnceLock> = OnceLock::new(); @@ -93,10 +85,11 @@ fn check_impl() -> Result { if landlock_abi == 0 { return Err(CheckError::LandlockNotAvailable); } - - let user_ns_enabled = check_user_namespaces(); - if !user_ns_enabled { - return Err(CheckError::UserNamespacesDisabled); + if landlock_abi < MIN_LANDLOCK_ABI { + return Err(CheckError::LandlockAbiTooOld { + required: MIN_LANDLOCK_ABI, + found: landlock_abi, + }); } let seccomp_enabled = seccomp::seccomp_available(); @@ -107,7 +100,6 @@ fn check_impl() -> Result { Ok(SystemInfo { kernel_version, landlock_abi, - user_ns_enabled, seccomp_enabled, }) } @@ -145,36 +137,6 @@ fn 
parse_kernel_version(release: &str) -> Result<(u32, u32, u32), CheckError> { Ok((major, minor, patch)) } -fn check_user_namespaces() -> bool { - // Check sysctl first (Debian/Ubuntu) - if let Ok(content) = std::fs::read_to_string("/proc/sys/kernel/unprivileged_userns_clone") { - return content.trim() == "1"; - } - - // Check max_user_namespaces (NixOS and others) - if let Ok(content) = std::fs::read_to_string("/proc/sys/user/max_user_namespaces") - && content.trim().parse::().unwrap_or(0) > 0 - { - return true; - } - - // Last resort: fork + unshare test (must fork to avoid polluting parent) - // SAFETY: fork/unshare/waitpid are safe when used correctly. Child exits immediately. - unsafe { - let pid = libc::fork(); - if pid < 0 { - return false; - } - if pid == 0 { - let ret = libc::unshare(libc::CLONE_NEWUSER); - libc::_exit(if ret == 0 { 0 } else { 1 }); - } - let mut status: i32 = 0; - libc::waitpid(pid, &mut status, 0); - libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0 - } -} - #[cfg(test)] mod tests { use super::*; @@ -187,6 +149,7 @@ mod tests { parse_kernel_version("5.4.0-150-generic").unwrap(), (5, 4, 0) ); + assert_eq!(parse_kernel_version("6.12.0").unwrap(), (6, 12, 0)); } #[test] @@ -195,7 +158,6 @@ mod tests { Ok(info) => { println!("Kernel version: {:?}", info.kernel_version); println!("Landlock ABI: {}", info.landlock_abi); - println!("User NS enabled: {}", info.user_ns_enabled); println!("Seccomp enabled: {}", info.seccomp_enabled); } Err(e) => { diff --git a/crates/evalbox-sys/src/landlock.rs b/crates/evalbox-sys/src/landlock.rs index 6a7599d..c862cd4 100644 --- a/crates/evalbox-sys/src/landlock.rs +++ b/crates/evalbox-sys/src/landlock.rs @@ -12,6 +12,7 @@ //! | 2 | 5.19 | `REFER` (cross-directory rename/link) | //! | 3 | 6.2 | `TRUNCATE` (file truncation) | //! | 4 | 6.7 | `IOCTL_DEV`, TCP network access | +//! | 5 | 6.12 | `SCOPE_SIGNAL`, `SCOPE_ABSTRACT_UNIX_SOCKET` | //! //! ## Usage //! 
@@ -75,11 +76,19 @@ pub const LANDLOCK_ACCESS_FS_IOCTL_DEV: u64 = 1 << 15; pub const LANDLOCK_ACCESS_NET_BIND_TCP: u64 = 1 << 0; pub const LANDLOCK_ACCESS_NET_CONNECT_TCP: u64 = 1 << 1; +// ABI v5 - Scoped restrictions +/// Block abstract unix socket connections outside the sandbox. +pub const LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET: u64 = 1 << 0; +/// Block signals to processes outside the sandbox. +pub const LANDLOCK_SCOPE_SIGNAL: u64 = 1 << 1; + #[repr(C)] #[derive(Debug, Default)] pub struct LandlockRulesetAttr { pub handled_access_fs: u64, pub handled_access_net: u64, + /// ABI 5+: Scoped restrictions (signal and abstract unix socket isolation). + pub scoped: u64, } #[repr(C)] @@ -205,6 +214,18 @@ pub fn net_access_for_abi(abi: u32) -> u64 { } } +/// Returns the scoped restriction flags for the given ABI version. +/// +/// ABI 5+ supports signal isolation and abstract unix socket isolation, +/// replacing the need for PID and IPC namespaces. +pub fn scope_for_abi(abi: u32) -> u64 { + if abi >= 5 { + LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET | LANDLOCK_SCOPE_SIGNAL + } else { + 0 + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/evalbox-sys/src/lib.rs b/crates/evalbox-sys/src/lib.rs index 59a11d7..78a4688 100644 --- a/crates/evalbox-sys/src/lib.rs +++ b/crates/evalbox-sys/src/lib.rs @@ -5,8 +5,9 @@ //! //! ## Modules //! -//! - **landlock** - Landlock LSM for filesystem/network access control (kernel 5.13+) +//! - **landlock** - Landlock LSM for filesystem/network/scope access control (kernel 5.13+) //! - **seccomp** - Seccomp-BPF syscall filtering +//! - **`seccomp_notify`** - Seccomp user notification (`SECCOMP_RET_USER_NOTIF`) //! - **check** - Runtime system capability detection //! //! ## Landlock @@ -16,6 +17,7 @@ //! - ABI 2: File truncation (kernel 5.19) //! - ABI 3: File permission changes (kernel 6.2) //! - ABI 4: Network TCP access control (kernel 6.7) +//! - ABI 5: Scoped signals and abstract unix sockets (kernel 6.12) //! //! 
## Seccomp-BPF //! @@ -23,6 +25,11 @@ //! a whitelist-based filter that allows ~40 safe syscalls and kills the process //! on any other syscall. //! +//! ## Seccomp User Notify +//! +//! Seccomp user notification allows a supervisor process to intercept syscalls +//! from a sandboxed child, enabling filesystem virtualization without namespaces. +//! //! # Safety //! //! This crate contains raw syscall wrappers. Casts between integer types @@ -34,6 +41,7 @@ pub mod check; pub mod landlock; pub mod seccomp; +pub mod seccomp_notify; pub use check::{CheckError, SystemInfo, check}; diff --git a/crates/evalbox-sys/src/seccomp.rs b/crates/evalbox-sys/src/seccomp.rs index b2b57d6..1fd50d4 100644 --- a/crates/evalbox-sys/src/seccomp.rs +++ b/crates/evalbox-sys/src/seccomp.rs @@ -47,14 +47,14 @@ //! - `memfd_create` + `execveat` - Enables fileless execution (bypass Landlock) //! - `setresuid`/`setresgid` - No reason to change UID in sandbox //! - `setsid`/`setpgid` - Session manipulation, unnecessary -//! - `ioctl` - Too powerful without argument filtering (TODO: whitelist specific codes) +//! - `ioctl` - Allowed with argument filtering (TIOCSTI, TIOCSETD, TIOCLINUX blocked) //! //! ## Security Notes //! //! - Filter is permanent - cannot be removed once applied //! - Requires `PR_SET_NO_NEW_PRIVS` first //! - Blocked syscall = immediate process termination (SIGSYS) -//! - `kill`/`tgkill` are safe because we use PID namespace (`CLONE_NEWPID`) +//! - `kill`/`tgkill` are safe due to Landlock v5 `SCOPE_SIGNAL` isolation //! 
- `prctl` allowed but `PR_SET_SECCOMP` has no effect (filter already applied) use rustix::io::Errno; @@ -64,6 +64,7 @@ use crate::last_errno; // Seccomp constants const SECCOMP_SET_MODE_FILTER: u32 = 1; const SECCOMP_RET_KILL_PROCESS: u32 = 0x80000000; +const SECCOMP_RET_USER_NOTIF: u32 = 0x7fc00000; const SECCOMP_RET_ALLOW: u32 = 0x7fff0000; // Return ENOSYS (38) to allow graceful fallback const SECCOMP_RET_ERRNO_ENOSYS: u32 = 0x00050000 | 38; @@ -172,7 +173,7 @@ pub struct SockFprog { /// - `setsid`/`setpgid` - Session manipulation unnecessary /// /// ## Notes: -/// - `kill`/`tgkill` safe due to PID namespace isolation +/// - `kill`/`tgkill` safe due to Landlock v5 `SCOPE_SIGNAL` isolation /// - `prctl` kept for runtime needs (`PR_SET_NAME`, etc.) pub const DEFAULT_WHITELIST: &[i64] = &[ // === Basic I/O === @@ -291,7 +292,7 @@ pub const DEFAULT_WHITELIST: &[i64] = &[ libc::SYS_fchdir, libc::SYS_readlink, libc::SYS_readlinkat, - // === Signals (safe due to PID namespace) === + // === Signals (safe due to Landlock SCOPE_SIGNAL) === libc::SYS_rt_sigaction, libc::SYS_rt_sigprocmask, libc::SYS_rt_sigreturn, @@ -299,9 +300,9 @@ pub const DEFAULT_WHITELIST: &[i64] = &[ libc::SYS_rt_sigpending, libc::SYS_rt_sigtimedwait, libc::SYS_sigaltstack, - libc::SYS_kill, // Safe: PID namespace isolates - libc::SYS_tgkill, // Safe: PID namespace isolates - libc::SYS_tkill, // Safe: PID namespace isolates + libc::SYS_kill, // Safe: Landlock SCOPE_SIGNAL isolates + libc::SYS_tgkill, // Safe: Landlock SCOPE_SIGNAL isolates + libc::SYS_tkill, // Safe: Landlock SCOPE_SIGNAL isolates // === Process control === libc::SYS_execve, // execveat REMOVED - with memfd_create enables fileless execution @@ -567,6 +568,81 @@ pub fn seccomp_available() -> bool { unsafe { libc::prctl(libc::PR_GET_SECCOMP, 0, 0, 0, 0) >= 0 } } +/// Builds a BPF filter that returns `SECCOMP_RET_USER_NOTIF` for the listed +/// syscalls and `SECCOMP_RET_ALLOW` for everything else. 
+/// +/// This filter is installed *before* the kill filter. The kernel evaluates all +/// stacked filters and returns the strictest verdict, so: +/// - Syscall in both ALLOW lists → ALLOW +/// - Syscall in NOTIFY + ALLOW → NOTIFY (supervisor decides) +/// - Syscall not in kill filter whitelist → KILL (regardless of notify filter) +/// +/// # Panics +/// +/// Panics if `syscalls.len()` > 200 (BPF jump offsets are u8). +pub fn build_notify_filter(syscalls: &[i64]) -> Vec { + assert!( + syscalls.len() <= MAX_WHITELIST_SIZE, + "notify syscall list too large: {} > {}", + syscalls.len(), + MAX_WHITELIST_SIZE + ); + + let n = syscalls.len(); + let mut filter = Vec::with_capacity(n + 8); + + // Architecture check + filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARCH)); + filter.push(SockFilter::jump( + BPF_JMP | BPF_JEQ | BPF_K, + AUDIT_ARCH_X86_64, + 1, + 0, + )); + filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)); + + // Load syscall number + filter.push(SockFilter::stmt( + BPF_LD | BPF_W | BPF_ABS, + OFFSET_SYSCALL_NR, + )); + + // Check each syscall → jump to NOTIFY + for (i, &nr) in syscalls.iter().enumerate() { + let notify_offset = (n - i) as u8; // jump to NOTIFY instruction + filter.push(SockFilter::jump( + BPF_JMP | BPF_JEQ | BPF_K, + nr as u32, + notify_offset, + 0, + )); + } + + // Default: ALLOW + filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)); + + // NOTIFY + filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF)); + + filter +} + +/// Syscalls that are intercepted by the notify filter for filesystem virtualization. 
+pub const NOTIFY_FS_SYSCALLS: &[i64] = &[ + libc::SYS_openat, + libc::SYS_open, + libc::SYS_creat, + libc::SYS_access, + libc::SYS_faccessat, + libc::SYS_faccessat2, + libc::SYS_stat, + libc::SYS_lstat, + libc::SYS_newfstatat, + libc::SYS_statx, + libc::SYS_readlink, + libc::SYS_readlinkat, +]; + #[cfg(test)] mod tests { use super::*; @@ -659,4 +735,20 @@ mod tests { let huge: Vec = (0..300).map(|i| i as i64).collect(); build_whitelist_filter(&huge); } + + #[test] + fn notify_filter_structure() { + let syscalls = &[libc::SYS_openat, libc::SYS_open, libc::SYS_stat]; + let filter = build_notify_filter(syscalls); + // 3 (arch) + 1 (load) + 3 (checks) + 1 (allow) + 1 (notify) = 9 + assert_eq!(filter.len(), 9); + } + + #[test] + fn notify_fs_syscalls_present() { + assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_openat)); + assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_open)); + assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_stat)); + assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_readlink)); + } } diff --git a/crates/evalbox-sys/src/seccomp_notify.rs b/crates/evalbox-sys/src/seccomp_notify.rs new file mode 100644 index 0000000..9edf775 --- /dev/null +++ b/crates/evalbox-sys/src/seccomp_notify.rs @@ -0,0 +1,247 @@ +//! Seccomp user notification (`SECCOMP_RET_USER_NOTIF`) support. +//! +//! Seccomp user notification allows a supervisor process to intercept +//! syscalls from a sandboxed child and make decisions on its behalf. +//! This enables filesystem virtualization without user namespaces. +//! +//! ## Architecture +//! +//! 1. Child installs a seccomp filter with `SECCOMP_FILTER_FLAG_NEW_LISTENER` +//! 2. This returns a "listener fd" which is passed to the parent via `SCM_RIGHTS` +//! 3. Parent polls the listener fd; when readable, calls `SECCOMP_IOCTL_NOTIF_RECV` +//! 4. Parent inspects the syscall and either: +//! - Returns `SECCOMP_USER_NOTIF_FLAG_CONTINUE` to let it proceed +//! - Returns an error code to deny it +//! 
- Uses `SECCOMP_IOCTL_NOTIF_ADDFD` to inject a file descriptor +//! +//! ## TOCTOU Protection +//! +//! Between receiving a notification and responding, the child's memory may change. +//! Always call `SECCOMP_IOCTL_NOTIF_ID_VALID` after reading child memory to verify +//! the notification is still valid. + +use std::os::fd::{FromRawFd, OwnedFd}; + +use rustix::io::Errno; + +use crate::last_errno; +use crate::seccomp::SockFprog; + +// Seccomp constants for notify +const SECCOMP_SET_MODE_FILTER: u32 = 1; +pub const SECCOMP_FILTER_FLAG_NEW_LISTENER: u32 = 1 << 3; + +/// Let the syscall proceed as-is (supervisor approves). +pub const SECCOMP_USER_NOTIF_FLAG_CONTINUE: u32 = 1; + +/// Atomically inject fd and respond to the notification. +pub const SECCOMP_ADDFD_FLAG_SEND: u32 = 1 << 0; +/// Replace an existing fd in the target process. +pub const SECCOMP_ADDFD_FLAG_SETFD: u32 = 1 << 1; + +// ioctl numbers for seccomp notify (from kernel headers) +// These are architecture-dependent; values below are for x86_64. +// SECCOMP_IOCTL_NOTIF_RECV = SECCOMP_IOWR(0, struct seccomp_notif) +// SECCOMP_IOCTL_NOTIF_SEND = SECCOMP_IOWR(1, struct seccomp_notif_resp) +// SECCOMP_IOCTL_NOTIF_ID_VALID = SECCOMP_IOW(2, __u64) +// SECCOMP_IOCTL_NOTIF_ADDFD = SECCOMP_IOW(3, struct seccomp_notif_addfd) + +/// ioctl to receive a notification from the seccomp listener fd. +pub const SECCOMP_IOCTL_NOTIF_RECV: u64 = 0xc0502100; +/// ioctl to send a response to a seccomp notification. +pub const SECCOMP_IOCTL_NOTIF_SEND: u64 = 0xc0182101; +/// ioctl to check if a notification ID is still valid (TOCTOU protection). +pub const SECCOMP_IOCTL_NOTIF_ID_VALID: u64 = 0x40082102; +/// ioctl to inject a file descriptor into the notifying process. +pub const SECCOMP_IOCTL_NOTIF_ADDFD: u64 = 0x40182103; + +/// Seccomp notification data (mirrors kernel `struct seccomp_data`). +#[repr(C)] +#[derive(Debug, Clone, Copy, Default)] +pub struct SeccompData { + /// Syscall number. 
+ pub nr: i32, + /// Architecture (`AUDIT_ARCH_*`). + pub arch: u32, + /// Instruction pointer at time of syscall. + pub instruction_pointer: u64, + /// Syscall arguments. + pub args: [u64; 6], +} + +/// Seccomp notification received from the child (mirrors kernel `struct seccomp_notif`). +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct SeccompNotif { + /// Unique notification ID. + pub id: u64, + /// PID of the notifying process (in supervisor's PID namespace). + pub pid: u32, + /// Flags (currently unused, must be 0). + pub flags: u32, + /// The syscall data. + pub data: SeccompData, +} + +impl Default for SeccompNotif { + fn default() -> Self { + // SAFETY: SeccompNotif is a plain C struct with no invariants. + unsafe { std::mem::zeroed() } + } +} + +/// Response to a seccomp notification (mirrors kernel `struct seccomp_notif_resp`). +#[repr(C)] +#[derive(Debug, Clone, Copy, Default)] +pub struct SeccompNotifResp { + /// Must match the notification ID. + pub id: u64, + /// Return value for the syscall. + pub val: i64, + /// Errno value (negated in kernel). + pub error: i32, + /// Flags (e.g., `SECCOMP_USER_NOTIF_FLAG_CONTINUE`). + pub flags: u32, +} + +/// Inject a file descriptor into the notifying process +/// (mirrors kernel `struct seccomp_notif_addfd`). +#[repr(C)] +#[derive(Debug, Clone, Copy, Default)] +pub struct SeccompNotifAddfd { + /// Must match the notification ID. + pub id: u64, + /// Flags (e.g., `SECCOMP_ADDFD_FLAG_SEND`). + pub flags: u32, + /// The fd in the supervisor to inject. + pub srcfd: u32, + /// The fd number to use in the target (0 = kernel picks). + pub newfd: u32, + /// Flags for the new fd (e.g., `O_CLOEXEC`). + pub newfd_flags: u32, +} + +/// Install a seccomp filter with `SECCOMP_FILTER_FLAG_NEW_LISTENER`. +/// +/// Returns the listener fd which can be used to receive notifications. +/// The caller must have already called `PR_SET_NO_NEW_PRIVS`. +/// +/// # Safety +/// +/// The filter must be a valid BPF program. 
This permanently restricts +/// syscalls for this thread. +/// +/// # Errors +/// +/// Returns `Errno` if the filter cannot be installed. +pub unsafe fn seccomp_set_mode_filter_listener(fprog: &SockFprog) -> Result { + unsafe { + let ret = libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if ret != 0 { + return Err(last_errno()); + } + + let ret = libc::syscall( + libc::SYS_seccomp, + SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_NEW_LISTENER, + fprog as *const _, + ); + if ret < 0 { + Err(last_errno()) + } else { + // SAFETY: On success, ret is a valid listener file descriptor. + Ok(OwnedFd::from_raw_fd(ret as i32)) + } + } +} + +/// Receive a notification from the seccomp listener fd. +/// +/// Blocks until a notification is available (or use poll/epoll first). +/// +/// # Errors +/// +/// Returns `Errno` on failure (e.g., `ENOENT` if the target died). +pub fn notif_recv(listener_fd: i32, notif: &mut SeccompNotif) -> Result<(), Errno> { + let ret = + unsafe { libc::ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_RECV, notif as *mut SeccompNotif) }; + if ret < 0 { Err(last_errno()) } else { Ok(()) } +} + +/// Send a response to a seccomp notification. +/// +/// # Errors +/// +/// Returns `Errno` on failure. +pub fn notif_send(listener_fd: i32, resp: &SeccompNotifResp) -> Result<(), Errno> { + let ret = unsafe { + libc::ioctl( + listener_fd, + SECCOMP_IOCTL_NOTIF_SEND, + resp as *const SeccompNotifResp, + ) + }; + if ret < 0 { Err(last_errno()) } else { Ok(()) } +} + +/// Check if a notification ID is still valid. +/// +/// Must be called after reading from child's `/proc/pid/mem` to protect +/// against TOCTOU attacks. +/// +/// # Errors +/// +/// Returns `Errno::NOENT` if the notification is no longer valid. 
+pub fn notif_id_valid(listener_fd: i32, id: u64) -> Result<(), Errno> { + let ret = unsafe { + libc::ioctl( + listener_fd, + SECCOMP_IOCTL_NOTIF_ID_VALID, + &id as *const u64, + ) + }; + if ret < 0 { Err(last_errno()) } else { Ok(()) } +} + +/// Inject a file descriptor into the notifying process. +/// +/// With `SECCOMP_ADDFD_FLAG_SEND`, this atomically injects the fd and +/// responds to the notification (the return value becomes the new fd number +/// in the target process). +/// +/// # Errors +/// +/// Returns `Errno` on failure. +pub fn notif_addfd(listener_fd: i32, addfd: &SeccompNotifAddfd) -> Result { + let ret = unsafe { + libc::ioctl( + listener_fd, + SECCOMP_IOCTL_NOTIF_ADDFD, + addfd as *const SeccompNotifAddfd, + ) + }; + if ret < 0 { Err(last_errno()) } else { Ok(ret) } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn struct_sizes() { + // Verify struct sizes match kernel expectations + assert_eq!(size_of::(), 64); + assert_eq!(size_of::(), 80); + assert_eq!(size_of::(), 24); + assert_eq!(size_of::(), 24); + } + + #[test] + fn default_notif_is_zeroed() { + let notif = SeccompNotif::default(); + assert_eq!(notif.id, 0); + assert_eq!(notif.pid, 0); + assert_eq!(notif.data.nr, 0); + } +} From 15abdc4b7e692938565a75b8d3ddb55175df2824 Mon Sep 17 00:00:00 2001 From: fullzer4 Date: Sun, 22 Feb 2026 16:26:45 -0300 Subject: [PATCH 2/6] build: migrate nix flake to flake-parts + import-tree Replace manual forAllSystems boilerplate with flake-parts module system and import-tree for auto-discovery of nix modules. 
- Add nix/toolchain.nix (shared crane/toolchain via _module.args) - Merge checks into nix/packages.nix - Add packages.test-all for full security test suite - Restrict to x86_64-linux (arm not yet supported) - Remove nix/lib.nix, nix/checks.nix, nix/tests/ --- .github/workflows/ci.yml | 70 ++++++++++++++++++++---------------- .gitignore | 5 ++- flake.lock | 78 ++++++++++++++++++++++++++++++++++++---- flake.nix | 53 +++++++-------------------- nix/devshell.nix | 17 +++++++++ nix/packages.nix | 44 +++++++++++++++++++++++ nix/toolchain.nix | 27 ++++++++++++++ 7 files changed, 216 insertions(+), 78 deletions(-) create mode 100644 nix/devshell.nix create mode 100644 nix/packages.nix create mode 100644 nix/toolchain.nix diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c65f903..9f962b3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,45 +6,56 @@ on: pull_request: branches: [main] -env: - CARGO_TERM_COLOR: always - RUSTFLAGS: -Dwarnings +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: - fmt: - name: Format + nix-checks: + name: Nix Checks runs-on: ubuntu-latest + permissions: + id-token: write + contents: read steps: - uses: actions/checkout@v4 - - run: rustup component add rustfmt - - run: cargo fmt --all --check - clippy: - name: Clippy - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - run: rustup component add clippy - - uses: Swatinem/rust-cache@v2 - - run: cargo clippy --all-targets --all-features + - uses: DeterminateSystems/determinate-nix-action@v3 - doc: - name: Documentation - runs-on: ubuntu-latest - env: - RUSTDOCFLAGS: -Dwarnings - steps: - - uses: actions/checkout@v4 - - uses: Swatinem/rust-cache@v2 - - run: cargo doc --no-deps --all-features + - uses: DeterminateSystems/magic-nix-cache-action@main + + - name: Run fast checks (clippy, fmt, test, doc) + run: | + nix build -L \ + .#checks.x86_64-linux.clippy \ + 
.#checks.x86_64-linux.fmt \ + .#checks.x86_64-linux.test \ + .#checks.x86_64-linux.doc - unit-tests: - name: Unit Tests + e2e-docker: + name: E2E Docker (${{ matrix.distro }}) runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + distro: [ubuntu:24.04, fedora:41, alpine:3.21] steps: - uses: actions/checkout@v4 - - uses: Swatinem/rust-cache@v2 - - run: cargo test --lib --all-features + + - uses: DeterminateSystems/determinate-nix-action@v3 + + - uses: DeterminateSystems/magic-nix-cache-action@main + + - name: Build test binary + run: nix build -L .#checks.x86_64-linux.test + + - name: Run security tests in Docker (${{ matrix.distro }}) + run: | + TEST_BIN=$(find result/ -name 'security_tests-*' -type f -executable | head -1) + docker run --rm \ + -v "$TEST_BIN:/security_tests:ro" \ + ${{ matrix.distro }} \ + /security_tests --ignored semver-check: name: SemVer Check @@ -52,7 +63,6 @@ jobs: steps: - uses: actions/checkout@v4 - uses: Swatinem/rust-cache@v2 - - name: Check semver - uses: obi1kenobi/cargo-semver-checks-action@v2 + - uses: obi1kenobi/cargo-semver-checks-action@v2 with: package: evalbox diff --git a/.gitignore b/.gitignore index 73e2f03..a803b6f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ .direnv target/ -bindings/ \ No newline at end of file +bindings/ + +# Local cargo config +.cargo/ \ No newline at end of file diff --git a/flake.lock b/flake.lock index 82db2d7..90964ff 100644 --- a/flake.lock +++ b/flake.lock @@ -1,12 +1,60 @@ { "nodes": { + "crane": { + "locked": { + "lastModified": 1771121070, + "narHash": "sha256-aIlv7FRXF9q70DNJPI237dEDAznSKaXmL5lfK/Id/bI=", + "owner": "ipetkov", + "repo": "crane", + "rev": "a2812c19f1ed2e5ed5ce2ef7109798b575c180e1", + "type": "github" + }, + "original": { + "owner": "ipetkov", + "repo": "crane", + "type": "github" + } + }, + "flake-parts": { + "inputs": { + "nixpkgs-lib": "nixpkgs-lib" + }, + "locked": { + "lastModified": 1769996383, + "narHash": 
"sha256-AnYjnFWgS49RlqX7LrC4uA+sCCDBj0Ry/WOJ5XWAsa0=", + "owner": "hercules-ci", + "repo": "flake-parts", + "rev": "57928607ea566b5db3ad13af0e57e921e6b12381", + "type": "github" + }, + "original": { + "owner": "hercules-ci", + "repo": "flake-parts", + "type": "github" + } + }, + "import-tree": { + "locked": { + "lastModified": 1771045967, + "narHash": "sha256-oYO4poyw0Sb/db2PigqugMlDwsvwLg6CSpFrMUWxA3Q=", + "owner": "vic", + "repo": "import-tree", + "rev": "c968d3b54d12cf5d9c13f16f7c545a06c9d1fde6", + "type": "github" + }, + "original": { + "owner": "vic", + "repo": "import-tree", + "type": "github" + } + }, "nixpkgs": { "locked": { - "lastModified": 1770562336, - "narHash": "sha256-ub1gpAONMFsT/GU2hV6ZWJjur8rJ6kKxdm9IlCT0j84=", + "lastModified": 1771008912, + "narHash": "sha256-gf2AmWVTs8lEq7z/3ZAsgnZDhWIckkb+ZnAo5RzSxJg=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "d6c71932130818840fc8fe9509cf50be8c64634f", + "rev": "a82ccc39b39b621151d6732718e3e250109076fa", "type": "github" }, "original": { @@ -16,8 +64,26 @@ "type": "github" } }, + "nixpkgs-lib": { + "locked": { + "lastModified": 1769909678, + "narHash": "sha256-cBEymOf4/o3FD5AZnzC3J9hLbiZ+QDT/KDuyHXVJOpM=", + "owner": "nix-community", + "repo": "nixpkgs.lib", + "rev": "72716169fe93074c333e8d0173151350670b824c", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "nixpkgs.lib", + "type": "github" + } + }, "root": { "inputs": { + "crane": "crane", + "flake-parts": "flake-parts", + "import-tree": "import-tree", "nixpkgs": "nixpkgs", "rust-overlay": "rust-overlay" } @@ -29,11 +95,11 @@ ] }, "locked": { - "lastModified": 1770865833, - "narHash": "sha256-oiARqnlvaW6pVGheVi4ye6voqCwhg5hCcGish2ZvQzI=", + "lastModified": 1771297684, + "narHash": "sha256-wieWskQxZLPlNXX06JEB0bMoS/ZYQ89xBzF0RL9lyLs=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "c8cfbe26238638e2f3a2c0ae7e8d240f5e4ded85", + "rev": "755d3669699a7c62aef35af187d75dc2728cfd85", "type": "github" }, "original": { diff 
--git a/flake.nix b/flake.nix index e708a37..ccc5bef 100644 --- a/flake.nix +++ b/flake.nix @@ -3,47 +3,18 @@ inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - rust-overlay.url = "github:oxalica/rust-overlay"; - rust-overlay.inputs.nixpkgs.follows = "nixpkgs"; + flake-parts.url = "github:hercules-ci/flake-parts"; + import-tree.url = "github:vic/import-tree"; + rust-overlay = { + url = "github:oxalica/rust-overlay"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + crane.url = "github:ipetkov/crane"; }; - outputs = { self, nixpkgs, rust-overlay }: - let - systems = [ "x86_64-linux" "aarch64-linux" ]; - forAllSystems = nixpkgs.lib.genAttrs systems; - in - { - devShells = forAllSystems (system: - let - pkgs = import nixpkgs { - inherit system; - overlays = [ rust-overlay.overlays.default ]; - }; - - rust = pkgs.rust-bin.stable.latest.default.override { - extensions = [ "rust-src" "rust-analyzer" ]; - }; - in - { - default = pkgs.mkShell { - buildInputs = with pkgs; [ - rust - pkg-config - - mold - clang - - python3 - go - ]; - - RUST_BACKTRACE = "1"; - - shellHook = '' - echo "evalbox dev environment" - echo "Rust: $(rustc --version)" - ''; - }; - }); - }; + outputs = inputs: + inputs.flake-parts.lib.mkFlake { inherit inputs; } + (inputs.import-tree ./nix // { + systems = [ "x86_64-linux" ]; + }); } diff --git a/nix/devshell.nix b/nix/devshell.nix new file mode 100644 index 0000000..001172d --- /dev/null +++ b/nix/devshell.nix @@ -0,0 +1,17 @@ +{ ... }: +{ + perSystem = { pkgs, toolchainWithExtensions, ... }: { + devShells.default = pkgs.mkShell { + name = "evalbox-dev"; + buildInputs = with pkgs; [ + toolchainWithExtensions + pkg-config + gcc + python3 + go + ]; + RUST_SRC_PATH = "${toolchainWithExtensions}/lib/rustlib/src/rust/library"; + RUST_BACKTRACE = "1"; + }; + }; +} diff --git a/nix/packages.nix b/nix/packages.nix new file mode 100644 index 0000000..2bbd50c --- /dev/null +++ b/nix/packages.nix @@ -0,0 +1,44 @@ +{ ... 
}: +{ + perSystem = { pkgs, craneLib, toolchainWithExtensions, src, commonArgs, cargoArtifacts, ... }: { + packages = { + default = craneLib.buildPackage (commonArgs // { + inherit cargoArtifacts; + }); + + test-all = pkgs.writeShellApplication { + name = "evalbox-test-all"; + runtimeInputs = [ toolchainWithExtensions pkgs.pkg-config pkgs.gcc ]; + text = '' + cargo test --lib + + cargo build -p evalbox-sandbox + + cargo test -p evalbox-sandbox --test security_tests --ignored -- --test-threads=1 + ''; + }; + }; + + checks = { + clippy = craneLib.cargoClippy { + inherit src cargoArtifacts; + pname = "evalbox-clippy"; + cargoClippyExtraArgs = "--all-targets -- -D warnings"; + }; + fmt = craneLib.cargoFmt { + inherit src; + pname = "evalbox-fmt"; + }; + test = craneLib.cargoTest { + inherit src cargoArtifacts; + pname = "evalbox-test"; + cargoTestExtraArgs = "--lib"; + }; + doc = craneLib.cargoDoc { + inherit src cargoArtifacts; + pname = "evalbox-doc"; + RUSTDOCFLAGS = "-D warnings"; + }; + }; + }; +} diff --git a/nix/toolchain.nix b/nix/toolchain.nix new file mode 100644 index 0000000..3b90771 --- /dev/null +++ b/nix/toolchain.nix @@ -0,0 +1,27 @@ +{ inputs, ... }: +{ + perSystem = { system, ... 
}: + let + pkgs = import inputs.nixpkgs { + inherit system; + overlays = [ inputs.rust-overlay.overlays.default ]; + }; + toolchain = pkgs.rust-bin.stable.latest.default; + toolchainWithExtensions = toolchain.override { + extensions = [ "rust-src" "rust-analyzer" "clippy" "rustfmt" ]; + }; + craneLib = (inputs.crane.mkLib pkgs).overrideToolchain toolchain; + src = craneLib.cleanCargoSource ./..; + crateInfo = craneLib.crateNameFromCargoToml { cargoToml = ./../Cargo.toml; }; + commonArgs = { + inherit src; + inherit (crateInfo) pname version; + nativeBuildInputs = with pkgs; [ pkg-config ]; + }; + cargoArtifacts = craneLib.buildDepsOnly commonArgs; + in { + _module.args = { + inherit pkgs craneLib toolchainWithExtensions src commonArgs cargoArtifacts; + }; + }; +} From c6e4b33d49aecdd080cf35c6d1f1179683bcfddf Mon Sep 17 00:00:00 2001 From: fullzer4 Date: Sun, 22 Feb 2026 16:27:04 -0300 Subject: [PATCH 3/6] docs: update for Landlock v5 architecture Rewrite all documentation to reflect removal of namespaces and Landlock v5 as primary isolation mechanism. 
- Update README: kernel 6.12+, no namespace references - Rename docs/SECURITY.md to docs/SECURITY_MODEL.md - Add SECURITY.md (GitHub vulnerability reporting policy) - Add CONTRIBUTING.md (dev setup, testing guide) - Rewrite ARCHITECTURE.md (lifecycle, diagrams, crate structure) - Add security hardening roadmap (UDP, /proc, PID namespace) - Add CHANGELOG 0.1.1 --- CHANGELOG.md | 43 ++++- CONTRIBUTING.md | 80 ++++++++++ README.md | 13 +- SECURITY.md | 42 +++++ docs/ARCHITECTURE.md | 240 ++++++++++++---------------- docs/ROADMAP.md | 27 ++++ docs/SECURITY.md | 354 ----------------------------------------- docs/SECURITY_MODEL.md | 286 +++++++++++++++++++++++++++++++++ 8 files changed, 582 insertions(+), 503 deletions(-) create mode 100644 CONTRIBUTING.md create mode 100644 SECURITY.md delete mode 100644 docs/SECURITY.md create mode 100644 docs/SECURITY_MODEL.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 4408a55..fdf27b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,40 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.1.1] - 2026-02-22 + +### Changed + +- **Architecture: remove namespace isolation, use Landlock v5 as primary** + - Removed user, PID, network, mount, UTS, and IPC namespaces + - Removed `pivot_root` and bind mount rootfs setup + - Landlock v5 replaces namespaces for filesystem, network, signal, and IPC control + - Plain `fork()` instead of `clone()` with `CLONE_NEW*` flags + - Minimum kernel raised from 5.13 to 6.12 (Landlock ABI 5) + +- **Resource limits moved to dedicated module** (`isolation/rlimits.rs`) + - `RLIMIT_DATA` (256 MiB) instead of `RLIMIT_AS`, which breaks Go/Java/V8 runtimes + - Added `RLIMIT_CPU`, `RLIMIT_CORE`, `RLIMIT_STACK` + +- **Nix flake migrated to flake-parts + import-tree** + - Auto-discovery of modules via `import-tree ./nix` + - Removed manual `forAllSystems` boilerplate + - Restricted to `x86_64-linux` (arm not yet supported) + +### Added + +- Seccomp user notify support (`SECCOMP_RET_USER_NOTIF`) for optional syscall interception +- `nix run .#test-all` to run the full security test suite +- `SECURITY.md` — GitHub standard vulnerability reporting policy +- `CONTRIBUTING.md` — development setup, testing guide +- Security hardening roadmap (UDP filtering, /proc restriction, optional PID namespace) + +### Removed + +- `crates/evalbox-sandbox/src/isolation/namespace.rs` — namespace setup +- `crates/evalbox-sandbox/src/isolation/rootfs.rs` — pivot_root + bind mounts +- `nix/lib.nix`, `nix/checks.nix`, `nix/tests/` — replaced by flake-parts modules + ## [0.1.0] - 2025-02-17 ### Added @@ -19,14 +53,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Go runtime with compilation caching - Shell runtime for script execution -- **Security isolation** (7 layers of defense) - - User namespaces (unprivileged containers) - - PID namespace (process isolation) - - Network namespace (network isolation) - - Mount namespace + pivot_root (filesystem isolation) - - Landlock LSM (kernel-enforced filesystem rules) +- 
**Security isolation** + - Landlock v5 (filesystem, network, signal, IPC access control) - Seccomp BPF (syscall whitelist with ~100 allowed syscalls) - Resource limits (memory, PIDs, file descriptors, timeout) + - Privilege hardening (NO_NEW_PRIVS, securebits, capability drop) - **Seccomp filtering** - Whitelist-based syscall filter diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..dd919ef --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,80 @@ +# Contributing + +## Development Setup + +evalbox uses Nix for a reproducible dev environment: + +```bash +nix develop +``` + +This provides the Rust toolchain, GCC (for test payloads), Python, and Go. + +## Building + +```bash +cargo build +``` + +## Testing + +### Fast checks (CI) + +```bash +# Runs via nix: clippy, fmt, unit tests, docs +nix flake check +``` + +Or manually: + +```bash +cargo clippy --all-targets -- -D warnings +cargo fmt --check +cargo test --lib +cargo doc --no-deps +``` + +### Full test suite (requires Linux 6.12+ with Landlock ABI 5) + +```bash +nix run .#test-all +``` + +Or manually: + +```bash +cargo build -p evalbox-sandbox +cargo test -p evalbox-sandbox --test security_tests -- --ignored --test-threads=1 +``` + +The security tests require Linux kernel 6.12+ (Landlock ABI 5) with seccomp enabled. They compile C payloads that attempt real exploit techniques (CVEs, syscall abuse, escape vectors) and verify the sandbox blocks them. 
+ +### Running specific test categories + +```bash +cargo test -p evalbox-sandbox --test security_tests seccomp -- --ignored +cargo test -p evalbox-sandbox --test security_tests filesystem -- --ignored +cargo test -p evalbox-sandbox --test security_tests network -- --ignored +cargo test -p evalbox-sandbox --test security_tests cve -- --ignored +cargo test -p evalbox-sandbox --test security_tests resources -- --ignored +``` + +## Project Structure + +``` +evalbox/ # Public API, language runtimes +evalbox-sandbox/ # Sandbox orchestration, isolation +evalbox-sys/ # Low-level Linux syscall wrappers +``` + +See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for details. + +## Pull Requests + +- Run `nix flake check` before submitting +- Security-related changes should include tests in `crates/evalbox-sandbox/tests/security/` +- Keep the seccomp whitelist minimal: don't add syscalls without justification + +## Security + +Found a vulnerability? See [SECURITY.md](SECURITY.md) for reporting instructions. diff --git a/README.md b/README.md index 379b8bf..9dae5a0 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Execute code like `eval()`, but safe. No containers, no VMs, no root. 
- **Simple** - One function call, security handled for you - **Multi-language** - Python, Go, and shell/terminal commands - **Fast** - Millisecond startup, no containers or VMs -- **Secure** - 7 layers of isolation (namespaces, Landlock, seccomp, rlimits) +- **Secure** - Landlock v5 + seccomp-BPF + rlimits, no namespaces needed ## Quick Start @@ -38,8 +38,8 @@ let output = shell::run("curl https://example.com") ## Requirements -- Linux kernel 5.13+ (Landlock ABI 1+) -- User namespaces enabled +- Linux kernel 6.12+ (Landlock ABI 5) +- Seccomp enabled ## Installation @@ -50,15 +50,16 @@ evalbox = { version = "0.1", features = ["python", "go", "shell"] } ## Security -7 layers of isolation: user namespaces, PID namespace, network namespace, mount namespace + pivot_root, Landlock LSM, seccomp BPF, rlimits. +Isolation via Landlock v5 (filesystem + network + signal + IPC scoping), seccomp-BPF (syscall whitelist), rlimits, privilege hardening (NO_NEW_PRIVS, securebits, capability drop). -See [SECURITY.md](docs/SECURITY.md) for threat model and CVE protections. +See [Security Model](docs/SECURITY_MODEL.md) for threat model and CVE protections. ## Documentation - [Architecture](docs/ARCHITECTURE.md) -- [Security Model](docs/SECURITY.md) +- [Security Model](docs/SECURITY_MODEL.md) - [Roadmap](docs/ROADMAP.md) +- [Contributing](CONTRIBUTING.md) ## License diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..0ce141b --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,42 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +|---------|-----------| +| 0.1.x | Yes | + +## Reporting a Vulnerability + +If you discover a security vulnerability in evalbox, **please do not open a public issue.** + +Instead, report it privately via [GitHub Security Advisories](https://github.com/fullzer4/evalbox/security/advisories/new). 
+ +Include: +- Description of the vulnerability +- Steps to reproduce +- Which isolation mechanism is affected (Landlock, seccomp, rlimits, privilege hardening) +- Impact assessment (sandbox escape, info leak, DoS, etc.) + +You should receive a response within **72 hours**. Critical sandbox escape vulnerabilities are treated as highest priority. + +## Scope + +evalbox provides isolation via Landlock v5, seccomp-BPF, rlimits, and privilege hardening. The following are in scope for security reports: + +- Sandbox escape (code executing outside isolation) +- Filesystem access beyond Landlock-allowed paths +- Network access when disabled +- Privilege escalation from sandbox +- Seccomp filter bypass +- Landlock rule bypass +- Resource limit bypass (memory, PIDs, file descriptors) + +See [docs/SECURITY_MODEL.md](docs/SECURITY_MODEL.md) for the full threat model and isolation architecture. + +## Out of Scope + +- Kernel 0-day exploits (requires kernel hardening) +- CPU side-channel attacks (Spectre/Meltdown) +- Denial of service against the host kernel +- Issues requiring non-default kernel configurations diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 9f22c66..adeca15 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -2,7 +2,7 @@ ## Overview -evalbox is a secure sandbox for executing untrusted code on Linux. It provides millisecond-startup isolation using Linux namespaces, Landlock LSM, and seccomp-BPF. +evalbox is a secure sandbox for executing untrusted code on Linux. It provides millisecond-startup isolation using Landlock LSM v5, seccomp-BPF, and rlimits — no namespaces, no containers, no root. ``` ┌─────────────────────────────────────────────────────────────────┐ @@ -29,23 +29,23 @@ evalbox is a secure sandbox for executing untrusted code on Linux. 
It provides m │ └──────────────────────────────────────────────────────────┘ │ │ ┌──────────────────────────────────────────────────────────┐ │ │ │ Isolation │ │ -│ │ • Namespaces (user, pid, net, mount, uts, ipc) │ │ -│ │ • pivot_root + minimal rootfs │ │ -│ │ • Landlock filesystem rules │ │ -│ │ • Seccomp syscall filter │ │ +│ │ • Landlock v5 (filesystem, network, signal, IPC) │ │ +│ │ • Seccomp-BPF (syscall whitelist) │ │ +│ │ • rlimits (memory, CPU, PIDs, fds) │ │ +│ │ • Privilege hardening (securebits, capability drop) │ │ │ └──────────────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────┐ │ evalbox-sys │ -│ Raw Linux syscalls: clone3, pidfd, seccomp, landlock │ +│ Raw Linux syscalls: seccomp, landlock, seccomp_notify │ └─────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────┐ │ Linux Kernel │ -│ namespaces │ seccomp-bpf │ landlock │ cgroups │ rlimits │ +│ seccomp-bpf │ landlock │ rlimits │ └─────────────────────────────────────────────────────────────────┘ ``` @@ -68,17 +68,18 @@ evalbox/ │ ├── workspace.rs # Temporary filesystem setup │ ├── monitor.rs # Process monitoring, output capture │ ├── isolation/ # Isolation mechanisms -│ │ ├── namespace.rs # User/PID/Net namespace setup -│ │ ├── rootfs.rs # Mount namespace, pivot_root -│ │ └── lockdown.rs # Landlock + seccomp application +│ │ ├── lockdown.rs # Landlock v5 + securebits + cap drop +│ │ └── rlimits.rs # Resource limits +│ ├── notify/ # Seccomp user notify (optional) │ ├── validate.rs # Input validation -│ └── sysinfo.rs # System detection (Nix, paths) +│ └── resolve.rs # Binary resolution │ └── evalbox-sys/ # Low-level syscalls └── src/ ├── seccomp.rs # BPF filter generation + ├── seccomp_notify.rs # Seccomp user notify support ├── landlock.rs # Landlock ruleset API - └── check.rs # 
Capability detection + └── check.rs # System capability detection ``` --- @@ -132,13 +133,6 @@ loop { } ``` -### Platform Behavior - -| Platform | Process Monitoring | I/O Multiplexing | -|----------|-------------------|------------------| -| Linux | pidfd + epoll | mio (epoll) | -| macOS | vsock to VM | mio (kqueue) | - --- ## Sandbox Lifecycle @@ -156,85 +150,93 @@ loop { ┌──────────────────────────────────────────────────────────────────┐ │ 2. WORKSPACE PREPARATION │ │ • Create tempdir (/tmp/evalbox-XXXXX) │ -│ • Setup directory structure (/work, /tmp, /etc) │ -│ • Write user files │ -│ • Create pipes (stdin, stdout, stderr) │ +│ • Create writable directories: /work, /tmp, /home │ +│ • Write user files to /work │ +│ • Create pipes (stdin, stdout, stderr) + eventfd sync │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ 3. CLONE WITH NAMESPACES │ -│ clone3(CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET | │ -│ CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC) │ +│ 3. FORK │ +│ fork() — plain fork, no CLONE_NEW* flags │ │ │ │ Parent Child │ │ │ │ │ -│ ├─ Write UID/GID maps ├─ Wait for parent │ -│ ├─ Signal ready ────────────────► │ -│ │ ├─ Setup isolation │ -│ │ │ (see step 4) │ -│ ▼ ▼ │ +│ ├─ Open pidfd ├─ Close parent pipe ends │ +│ ├─ Wait for child ready ├─ Setup stdio (dup2) │ +│ ├─ Signal to proceed ├─ chdir(workspace/work) │ +│ ▼ ├─ Apply lockdown (step 4)│ +│ ▼ │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ 4. CHILD ISOLATION SETUP │ +│ 4. 
CHILD LOCKDOWN (irreversible) │ │ │ │ ┌─────────────────────────────────────────────────────────┐ │ -│ │ a) Mount namespace │ │ -│ │ • Bind mount /usr, /lib, /lib64 (read-only) │ │ -│ │ • Bind mount workspace to /work │ │ -│ │ • Mount minimal /dev (null, zero, urandom) │ │ -│ │ • pivot_root to new root │ │ -│ │ • Unmount old root │ │ +│ │ a) NO_NEW_PRIVS │ │ +│ │ prctl(PR_SET_NO_NEW_PRIVS) — required before │ │ +│ │ Landlock and seccomp │ │ │ └─────────────────────────────────────────────────────────┘ │ │ ┌─────────────────────────────────────────────────────────┐ │ -│ │ b) Landlock (kernel 5.13+) │ │ -│ │ • Create ruleset with FS restrictions │ │ -│ │ • Allow read-only: /usr, /lib, /bin, /etc │ │ -│ │ • Allow read-write: /work, /tmp │ │ -│ │ • Enforce ruleset │ │ -│ │ (See SECURITY.md for details) │ │ +│ │ b) Landlock v5 │ │ +│ │ • Filesystem: read-only /usr, /lib, /etc, /bin │ │ +│ │ read-write workspace/work, /tmp, /home │ │ +│ │ • Network: block TCP bind + connect (ABI 4+) │ │ +│ │ • Signals: block cross-sandbox signals (ABI 5) │ │ +│ │ • IPC: block abstract unix sockets (ABI 5) │ │ │ └─────────────────────────────────────────────────────────┘ │ │ ┌─────────────────────────────────────────────────────────┐ │ -│ │ c) Seccomp BPF │ │ -│ │ • Load syscall whitelist filter │ │ -│ │ • Block dangerous syscalls (ptrace, mount, etc.) │ │ -│ │ • Filter clone() flags, socket() domains │ │ -│ │ • Filter dangerous ioctls (TIOCSTI, etc.) 
│ │ -│ │ (See SECURITY.md for full policy) │ │ +│ │ c) Resource limits (rlimits) │ │ +│ │ • RLIMIT_DATA: 256 MiB memory │ │ +│ │ • RLIMIT_CPU: timeout * 2 + 60s │ │ +│ │ • RLIMIT_NPROC: 64 processes │ │ +│ │ • RLIMIT_NOFILE: 256 file descriptors │ │ +│ │ • RLIMIT_FSIZE: 16 MiB output │ │ +│ │ • RLIMIT_CORE: 0 (disabled) │ │ +│ │ • RLIMIT_STACK: 8 MiB │ │ │ └─────────────────────────────────────────────────────────┘ │ │ ┌─────────────────────────────────────────────────────────┐ │ -│ │ d) Resource limits (rlimits) │ │ -│ │ • RLIMIT_AS: Memory limit │ │ -│ │ • RLIMIT_NPROC: Process limit │ │ -│ │ • RLIMIT_NOFILE: File descriptor limit │ │ -│ │ • RLIMIT_FSIZE: Output file size limit │ │ +│ │ d) Securebits + capability drop │ │ +│ │ • Lock NOROOT, NO_SETUID_FIXUP, KEEP_CAPS, │ │ +│ │ NO_CAP_AMBIENT_RAISE │ │ +│ │ • Drop all 64 capabilities │ │ │ └─────────────────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ 5. EXECVE TARGET PROGRAM │ -│ execve("/usr/bin/python", ["python", "-c", code], env) │ +│ 5. SECCOMP FILTERS │ +│ • [Optional] Install notify filter for FS syscall │ +│ interception, send listener fd to parent via SCM_RIGHTS │ +│ • Install kill filter — whitelist of ~100 safe syscalls │ +│ • Argument filtering: clone flags, socket domains, ioctls │ +│ • Violation = SECCOMP_RET_KILL_PROCESS (SIGSYS) │ +└──────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ 6. 
SIGNAL PARENT + WAIT + EXEC │ +│ • Signal parent readiness (eventfd) │ +│ • Wait for parent go-ahead (eventfd) │ +│ • close_range(3, MAX, 0) — close all fds except 0,1,2 │ +│ • execve(binary, args, env) │ │ │ -│ • All isolation is now permanent │ -│ • Seccomp filter cannot be removed │ -│ • Landlock rules cannot be relaxed │ +│ All isolation is now permanent and cannot be undone. │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ 6. PARENT MONITORS │ +│ 7. PARENT MONITORS │ │ • Poll pidfd for process exit │ │ • Read stdout/stderr via pipes │ │ • Enforce timeout (kill if exceeded) │ -│ • Track output size (truncate if exceeded) │ +│ • Track output size (kill if exceeded) │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ 7. CLEANUP │ +│ 8. CLEANUP │ │ • Collect exit status │ │ • Remove workspace tempdir │ │ • Return Output { stdout, stderr, exit_code, signal } │ @@ -245,7 +247,7 @@ loop { ## Security Architecture -evalbox implements **defense in depth** with 7 independent isolation layers: +evalbox implements **defense in depth** with independent isolation mechanisms: ``` ┌─────────────────────────────────────────────────────────────┐ @@ -254,42 +256,16 @@ evalbox implements **defense in depth** with 7 independent isolation layers: │ ▼ ┌─────────────────────────────────────────────────────────────┐ -│ Layer 1: User Namespace │ -│ • UID 0 inside = real user outside │ -│ • No capabilities in parent namespace │ -└─────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ Layer 2: PID Namespace │ -│ • Isolated process tree (PID 1 inside) │ -│ • Cannot see/signal host processes │ -└─────────────────────────────────────────────────────────────┘ - │ - ▼ 
-┌─────────────────────────────────────────────────────────────┐ -│ Layer 3: Network Namespace │ -│ • Empty by default (no interfaces) │ -│ • Cannot access host network │ +│ Landlock v5 │ +│ • Filesystem: read-only system paths, read-write workspace │ +│ • Network: block TCP bind + connect (ABI 4+) │ +│ • Signals: block cross-sandbox signals (ABI 5) │ +│ • IPC: block abstract unix sockets (ABI 5) │ └─────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────┐ -│ Layer 4: Mount Namespace + pivot_root │ -│ • Minimal rootfs (no /proc, /sys, /home) │ -│ • Host filesystem completely unmounted │ -└─────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ Layer 5: Landlock LSM │ -│ • Kernel-enforced filesystem rules │ -│ • Read-only binaries, read-write workspace only │ -└─────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ Layer 6: Seccomp BPF │ +│ Seccomp BPF │ │ • ~100 allowed syscalls (whitelist) │ │ • Blocks ptrace, mount, clone(NEWUSER), AF_NETLINK │ │ • SIGSYS on violation (immediate termination) │ @@ -297,12 +273,20 @@ evalbox implements **defense in depth** with 7 independent isolation layers: │ ▼ ┌─────────────────────────────────────────────────────────────┐ -│ Layer 7: Resource Limits │ +│ Resource Limits │ │ • Memory, CPU, processes, file descriptors │ │ • Prevents DoS attacks │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Privilege Hardening │ +│ • NO_NEW_PRIVS — cannot gain privileges via exec │ +│ • Securebits locked — cannot regain capabilities │ +│ • All 64 capabilities dropped │ └─────────────────────────────────────────────────────────────┘ -For detailed security policy and threat model, see SECURITY.md +For detailed security 
policy and threat model, see SECURITY_MODEL.md ``` --- @@ -344,7 +328,7 @@ BPF Program Flow: KILL ALLOW KILL ALLOW KILL ALLOW ALLOW KILL ``` -For the complete syscall policy, see [SECURITY.md](SECURITY.md#syscall-policy). +For the complete syscall policy, see [SECURITY_MODEL.md](SECURITY_MODEL.md#syscall-policy). --- @@ -352,31 +336,15 @@ For the complete syscall policy, see [SECURITY.md](SECURITY.md#syscall-policy). ``` /tmp/evalbox-XXXXX/ Workspace root (tmpdir) -├── root/ New root filesystem -│ ├── work/ User workspace (read-write) -│ │ ├── script.py User files -│ │ └── data.json -│ ├── tmp/ Temporary files (read-write) -│ ├── etc/ Minimal config -│ │ ├── passwd nobody user -│ │ ├── group nogroup -│ │ ├── hosts localhost -│ │ └── resolv.conf DNS (if network enabled) -│ ├── dev/ Minimal devices -│ │ ├── null -│ │ ├── zero -│ │ ├── urandom -│ │ └── fd → /proc/self/fd -│ ├── usr/ ──────────────── Bind mount (read-only) -│ ├── lib/ ──────────────── Bind mount (read-only) -│ ├── lib64/ ────────────── Bind mount (read-only) -│ └── bin/ ──────────────── Symlink to /usr/bin -│ -├── stdin Input pipe -├── stdout Output pipe -└── stderr Error pipe +├── work/ User workspace (read-write via Landlock) +│ ├── script.py User files +│ └── data.json +├── tmp/ Temporary files (read-write via Landlock) +└── home/ Home directory (read-write via Landlock) ``` +The workspace is a plain tempdir. No `pivot_root`, no bind mounts, no special rootfs. Landlock rules control which real filesystem paths are accessible. + --- ## Design Principles @@ -384,26 +352,26 @@ For the complete syscall policy, see [SECURITY.md](SECURITY.md#syscall-policy). ### 1. Simple as eval() ```rust // One function call to run code safely -let output = python::run("print('hello')", &config)?; +let output = python::run("print('hello')").exec()?; ``` ### 2. Defense in Depth -Every isolation mechanism works independently. A bypass of one layer doesn't compromise the sandbox. 
See [SECURITY.md](SECURITY.md#defense-in-depth). +Each isolation mechanism works independently. Landlock controls filesystem and network access, seccomp blocks dangerous syscalls, rlimits prevent resource exhaustion. See [SECURITY_MODEL.md](SECURITY_MODEL.md#defense-in-depth). ### 3. Unprivileged - No root required - No daemon/service -- Uses user namespaces +- No namespaces needed — Landlock + seccomp work unprivileged with `NO_NEW_PRIVS` ### 4. Minimal Attack Surface - Small syscall whitelist (~100 syscalls) -- Minimal filesystem -- No /proc, /sys by default +- Landlock restricts filesystem to minimal paths +- All capabilities dropped ### 5. Fast - ~5ms sandbox creation -- No VM boot -- No container image pull +- No VM boot, no container image pull +- Plain `fork()` + lockdown ### 6. Embeddable - Library, not a service @@ -414,12 +382,11 @@ Every isolation mechanism works independently. A bypass of one layer doesn't com ## System Requirements -| Requirement | Minimum | Recommended | -|-------------|---------|-------------| -| Linux Kernel | 5.13 | 6.1+ | -| User Namespaces | Required | - | -| Landlock | Required (ABI 1) | ABI 4 | -| Seccomp | Required | - | +| Requirement | Minimum | +|-------------|---------| +| Linux Kernel | 6.12 | +| Landlock | ABI 5 | +| Seccomp | Required | Check compatibility: ```bash @@ -430,8 +397,7 @@ evalbox check ## References -- [SECURITY.md](SECURITY.md) - Detailed security model and threat analysis -- [ROADMAP.md](ROADMAP.md) - Planned features -- [Linux namespaces](https://man7.org/linux/man-pages/man7/namespaces.7.html) +- [Security Model](SECURITY_MODEL.md) - Detailed security model and threat analysis +- [Roadmap](ROADMAP.md) - Planned features - [Landlock LSM](https://docs.kernel.org/userspace-api/landlock.html) - [Seccomp BPF](https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index fb8fba7..32e49f2 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -1,5 
+1,32 @@ # Roadmap +## Security Hardening + +### Block UDP exfiltration via seccomp + +Landlock only controls TCP (`LANDLOCK_ACCESS_NET_{BIND,CONNECT}_TCP`). A sandboxed process can create a `SOCK_DGRAM` socket and `sendto()` data to any IP without Landlock blocking it. + +**Fix:** Block `SOCK_DGRAM` in the seccomp socket filter when `plan.network_blocked`. DNS inside the sandbox already doesn't work, so this breaks nothing. + +**Tracking:** Landlock ABI v8 RFC patches (Dec 2025) propose `LANDLOCK_ACCESS_NET_{BIND,CONNECT,SENDTO}_UDP`. Once merged, seccomp filtering can be relaxed. + +### Restrict /proc access + +`/proc` is currently Landlock read-only. Landlock's ptrace scoping already blocks access to `environ`, `maps`, `fd/` of processes outside the sandbox domain. However, `/proc/[pid]/cmdline` is world-readable (`0444`, no ptrace check) — any host process's command line is visible. + +**Options:** +- Remove `/proc` from Landlock entirely (breaks programs that read `/proc/self/`) +- Accept `cmdline` leak as residual risk (low impact for single-user) +- Wait for kernel support: `hidepid=` requires mount namespace, Landlock can't target `/proc/self` (magic symlink resolves to fixed inode at `open()` time) + +### PID namespace (optional) + +Without PID namespace the sandbox can enumerate host PIDs via `/proc`. Combined with `cmdline` being world-readable, this is an information leak. Adding `CLONE_NEWPID` back would fully isolate the process tree, but requires re-introducing namespace setup code. + +**Trade-off:** Adds ~0.5ms and complexity. Not needed for single-user code execution, useful for multi-tenant deployments. + +--- + ## Supervised Execution Mode Intercept syscalls before execution for AI CLI tools and interactive approval. 
diff --git a/docs/SECURITY.md b/docs/SECURITY.md deleted file mode 100644 index e3ebdf2..0000000 --- a/docs/SECURITY.md +++ /dev/null @@ -1,354 +0,0 @@ -# evalbox Security Model - -## Defense in Depth - -evalbox uses **7 independent isolation layers**. Each layer provides protection even if another layer is bypassed. - -``` -┌─────────────────────────────────────────────────────────────┐ -│ Layer 1 │ User Namespaces │ Identity │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 2 │ PID Namespace │ Process │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 3 │ Network Namespace │ Network │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 4 │ Mount Namespace + pivot_root │ Filesystem │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 5 │ Landlock LSM │ FS Rules │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 6 │ Seccomp BPF │ Syscalls │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 7 │ rlimits │ Resources │ -└───────────┴──────────────────────────────────┴──────────────┘ -``` - ---- - -## Isolation Layers - -### Layer 1: User Namespaces - -User namespaces provide identity isolation. - -| Inside Sandbox | Outside Sandbox | -|----------------|-----------------| -| UID 0 (root) | Real user UID | -| GID 0 (root) | Real user GID | -| Full capabilities | No capabilities | - -**Security properties:** -- Cannot access host user's files (different UID) -- Capabilities only valid inside namespace -- Cannot escalate to real root - -### Layer 2: PID Namespace - -Process isolation prevents interference with host processes. - -``` -Host PID Namespace Sandbox PID Namespace -┌───────────────────┐ ┌───────────────────┐ -│ PID 1 (init) │ │ PID 1 (sandbox) │ -│ PID 1234 (shell) │ │ PID 2 (python) │ -│ PID 5678 (...) 
│ │ PID 3 (child) │ -└───────────────────┘ └───────────────────┘ - │ │ - │ ✗ Cannot see │ - │◄─────────────────────────┤ - │ ✗ Cannot signal │ -``` - -**Security properties:** -- Sandbox sees only its own processes -- Cannot enumerate host processes -- Cannot send signals to host processes -- kill() safe inside namespace - -### Layer 3: Network Namespace - -Network isolation blocks all network access by default. - -``` -┌─────────────────────────────────────────┐ -│ Host Network │ -│ eth0: 192.168.1.100 │ -│ lo: 127.0.0.1 │ -│ docker0: 172.17.0.1 │ -└─────────────────────────────────────────┘ - ✗ No access -┌─────────────────────────────────────────┐ -│ Sandbox Network │ -│ (empty - no interfaces) │ -│ │ -│ • No loopback │ -│ • No external access │ -│ • socket() works but connect() fails │ -└─────────────────────────────────────────┘ -``` - -**Security properties:** -- Cannot connect to localhost services -- Cannot access local network -- Cannot exfiltrate data via network -- Optional: Enable with `.network(true)` - -### Layer 4: Mount Namespace + pivot_root - -Filesystem isolation provides a minimal, controlled view. - -``` -Host Filesystem Sandbox Filesystem -/ / -├── home/ ├── work/ ← User workspace (rw) -│ └── user/ ✗ ├── tmp/ ← Temp files (rw) -├── etc/ ├── etc/ ← Minimal config -│ └── shadow ✗ │ ├── passwd (nobody) -├── root/ ✗ │ └── hosts (localhost) -├── proc/ ✗ ├── dev/ ← Minimal devices -├── sys/ ✗ │ ├── null -├── usr/ ───────────────────┼── usr/ ← Bind mount (ro) -├── lib/ ───────────────────┼── lib/ ← Bind mount (ro) -└── lib64/ ──────────────────┼── lib64/ ← Bind mount (ro) - └── (host root unmounted) -``` - -**Security properties:** -- Cannot access /home, /root -- Cannot read /etc/shadow, /etc/passwd (host) -- Cannot access /proc (no process info) -- Cannot access /sys (no kernel info) -- Host filesystem completely unmounted - -### Layer 5: Landlock LSM - -Kernel-enforced filesystem access control (requires Linux 5.13+). 
- -```rust -// Landlock ruleset -Ruleset { - read_only: ["/usr", "/lib", "/lib64", "/bin", "/etc"], - read_write: ["/work", "/tmp"], - execute: ["/usr/bin", "/bin"], - no_access: [everything else], -} -``` - -**Landlock ABI versions:** -| ABI | Kernel | Features | -|-----|--------|----------| -| 1 | 5.13 | Basic filesystem | -| 2 | 5.19 | Truncate control | -| 3 | 6.2 | File permissions | -| 4 | 6.7 | Network TCP control | - -**Security properties:** -- Enforced at kernel level (bypass-resistant) -- Works even if mount namespace bypassed -- Cannot be disabled after application - -### Layer 6: Seccomp BPF - -Syscall filtering with immediate termination on violation. - -**Filter approach:** Whitelist (allow known-safe syscalls, kill on others) - -**Blocked syscall categories:** -| Category | Syscalls | Reason | -|----------|----------|--------| -| Namespaces | `clone(CLONE_NEW*)`, `unshare`, `setns` | Prevent new namespaces | -| Mounting | `mount`, `umount`, `pivot_root` | Prevent FS manipulation | -| Debugging | `ptrace`, `process_vm_*` | Prevent process injection | -| Kernel | `reboot`, `kexec_load`, `init_module` | Prevent system damage | -| Privilege | `setuid`, `setgid`, `setgroups` | Prevent escalation | -| Keyring | `keyctl` | Not namespaced | -| eBPF | `bpf` | Kernel attack surface | - -**Argument filtering:** -| Syscall | Blocked Arguments | Reason | -|---------|-------------------|--------| -| `clone` | `CLONE_NEWUSER`, `CLONE_NEWNET`, etc. | Block namespace creation | -| `socket` | `AF_NETLINK`, `SOCK_RAW` | Block kernel interfaces | -| `ioctl` | `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` | Block terminal injection | - -**Violation behavior:** `SECCOMP_RET_KILL_PROCESS` (SIGSYS, signal 31) - -### Layer 7: Resource Limits - -Prevent denial-of-service attacks. 
- -| Resource | Limit | Purpose | -|----------|-------|---------| -| `RLIMIT_AS` | 256 MB | Memory limit | -| `RLIMIT_NPROC` | 64 | Fork bomb prevention | -| `RLIMIT_NOFILE` | 256 | File descriptor limit | -| `RLIMIT_FSIZE` | 10 MB | Output file size | -| Timeout | 30s | CPU time limit | - ---- - -## Syscall Policy - -### Allowed Syscalls (~100) - -``` -Basic I/O: read, write, close, lseek, pread64, pwrite64 -File ops: openat, stat, fstat, access, readlink -Memory: mmap, mprotect, munmap, brk, mremap -Process: fork, vfork, execve, exit, exit_group, wait4 -Signals: rt_sigaction, rt_sigprocmask, rt_sigreturn -Time: clock_gettime, nanosleep, gettimeofday -Sockets: socket*, connect, bind, listen, accept, send*, recv* -Events: epoll_*, poll, select -``` - -### Blocked Syscalls (examples) - -``` -Dangerous: ptrace, mount, reboot, kexec_load, init_module -Namespaces: clone3, unshare, setns (blocked or filtered) -Privilege: setuid, setgid, setresuid, setresgid -Kernel: bpf, perf_event_open, keyctl -Fileless: memfd_create, execveat (together enable fileless exec) -``` - -### Special Handling - -| Syscall | Handling | -|---------|----------| -| `clone` | Allowed, but `CLONE_NEW*` flags blocked | -| `clone3` | Returns `ENOSYS` (glibc falls back to `clone`) | -| `socket` | Allowed, but `AF_NETLINK` and `SOCK_RAW` blocked | -| `ioctl` | Allowed, but `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` blocked | - ---- - -## Threat Model - -### In Scope (Protected Against) - -| Threat | Mitigation | -|--------|------------| -| **Arbitrary code execution** | Sandboxed environment | -| **Filesystem escape** | Namespaces + Landlock + pivot_root | -| **Network access** | Network namespace (empty) | -| **Process injection** | PID namespace + ptrace blocked | -| **Privilege escalation** | User namespace + seccomp | -| **Resource exhaustion** | rlimits + timeouts | -| **Fork bombs** | RLIMIT_NPROC | -| **Terminal injection** | TIOCSTI/TIOCLINUX blocked | -| **Fileless malware** | memfd_create + execveat 
blocked | - -### Out of Scope - -| Threat | Reason | -|--------|--------| -| **Kernel exploits** | Requires kernel hardening (grsecurity, etc.) | -| **Side-channel attacks** | Spectre/Meltdown require CPU mitigations | -| **Container breakout via 0-day** | Defense in depth limits impact | -| **Covert channels** | Timing-based data exfiltration possible | - -### CVE Protection - -evalbox's seccomp policy blocks attack vectors for many kernel CVEs: - -| CVE | Attack Vector | Blocked By | -|-----|---------------|------------| -| CVE-2024-1086 | AF_NETLINK + nf_tables | Socket filtering | -| CVE-2022-0185 | fsconfig + user namespace | CLONE_NEWUSER blocked | -| CVE-2022-0492 | cgroups + user namespace | CLONE_NEWUSER blocked | -| CVE-2017-5226 | TIOCSTI terminal injection | ioctl filtering | -| CVE-2019-13272 | ptrace PTRACE_TRACEME | ptrace blocked | -| CVE-2021-3490 | eBPF verifier bypass | bpf blocked | - ---- - -## Filesystem Access - -### Default Mounts - -| Path | Access | Source | Purpose | -|------|--------|--------|---------| -| `/work` | Read-Write | Workspace | User files | -| `/tmp` | Read-Write | tmpfs | Temporary files | -| `/usr` | Read-Only | Host | Binaries, libraries | -| `/lib` | Read-Only | Host | Shared libraries | -| `/lib64` | Read-Only | Host | 64-bit libraries | -| `/etc` | Read-Only | Generated | Minimal config | -| `/dev` | Read-Only | Generated | null, zero, urandom | - -### Not Mounted (Blocked) - -| Path | Contains | Risk if Accessible | -|------|----------|-------------------| -| `/home` | User data | Data theft | -| `/root` | Root home | Credential theft | -| `/proc` | Process info | Info leak, escape vectors | -| `/sys` | Kernel interfaces | Kernel manipulation | -| `/var` | System state | Log manipulation | -| `/run` | Runtime data | Socket access | - ---- - -## Verification - -### Security Tests - -Run the security test suite to verify isolation: - -```bash -# Run all security tests -cargo test -p evalbox-sandbox --test security_tests 
-- --ignored - -# Run specific category -cargo test -p evalbox-sandbox --test security_tests seccomp -- --ignored -cargo test -p evalbox-sandbox --test security_tests filesystem -- --ignored -cargo test -p evalbox-sandbox --test security_tests network -- --ignored -cargo test -p evalbox-sandbox --test security_tests cve -- --ignored -``` - -### Test Coverage - -| Category | Tests | Coverage | -|----------|-------|----------| -| Seccomp | 9 | ptrace, mount, reboot, clone, socket, keyctl, bpf | -| Filesystem | 8 | /etc/shadow, /root, path traversal, symlinks | -| Network | 5 | External, localhost, loopback, DNS | -| Resources | 7 | Timeout, memory, PIDs, output limit | -| CVE | 10 | Real-world exploits blocked | - -### Manual Verification - -```bash -# Try to read /etc/shadow (should fail) -evalbox shell "cat /etc/shadow" - -# Try to access network (should fail) -evalbox shell "curl https://example.com" - -# Try ptrace (should be killed with SIGSYS) -evalbox shell "strace ls" -``` - ---- - -## Production Requirements - -To deploy evalbox securely, ensure your system meets these requirements: - -| Requirement | How to Verify | -|-------------|---------------| -| Kernel 5.13+ with Landlock | `cat /sys/kernel/security/lsm` should include `landlock` | -| User namespaces enabled | `cat /proc/sys/kernel/unprivileged_userns_clone` should be `1` | -| Seccomp enabled | `grep SECCOMP /boot/config-$(uname -r)` | -| Unprivileged BPF disabled | `sysctl kernel.unprivileged_bpf_disabled=1` (recommended) | - -Run `evalbox check` to verify all requirements automatically. 
- ---- - -## References - -- [Architecture Overview](ARCHITECTURE.md) -- [Linux Namespaces](https://man7.org/linux/man-pages/man7/namespaces.7.html) -- [Landlock Documentation](https://docs.kernel.org/userspace-api/landlock.html) -- [Seccomp BPF](https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html) diff --git a/docs/SECURITY_MODEL.md b/docs/SECURITY_MODEL.md new file mode 100644 index 0000000..21c12a2 --- /dev/null +++ b/docs/SECURITY_MODEL.md @@ -0,0 +1,286 @@ +# evalbox Security Model + +## Defense in Depth + +evalbox uses **independent isolation mechanisms**. Each provides protection even if another is bypassed. + +``` +┌─────────────────────────────────────────────────────────────┐ +│ │ Landlock v5 │ Filesystem, Network│ +│ │ │ Signal, IPC │ +├───────────┼───────────────────────────┼─────────────────────┤ +│ │ Seccomp BPF │ Syscalls │ +├───────────┼───────────────────────────┼─────────────────────┤ +│ │ rlimits │ Resources │ +├───────────┼───────────────────────────┼─────────────────────┤ +│ │ Privilege Hardening │ NO_NEW_PRIVS, │ +│ │ │ securebits, caps │ +└───────────┴───────────────────────────┴─────────────────────┘ +``` + +--- + +## Isolation Mechanisms + +### Landlock v5 + +Kernel-enforced access control (requires Linux 6.12+, Landlock ABI 5). + +No namespaces or `pivot_root` needed — Landlock operates on real filesystem paths. 
+ +**Filesystem rules:** +``` +read-only: /usr, /lib, /lib64, /bin, /etc, /proc, /nix/store* +read-write: workspace/work, workspace/tmp, workspace/home +write: /dev (for /dev/null, /dev/zero, /dev/urandom) +no access: everything else +``` + +**Network control (ABI 4+):** +- Blocks `LANDLOCK_ACCESS_NET_BIND_TCP` +- Blocks `LANDLOCK_ACCESS_NET_CONNECT_TCP` +- Optional: enable with `.network(true)` + +**Signal isolation (ABI 6):** +- `LANDLOCK_SCOPE_SIGNAL` — blocks signals to processes outside the sandbox + +**IPC isolation (ABI 6):** +- `LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET` — blocks connections to abstract unix sockets outside the sandbox + +**Landlock ABI versions:** +| ABI | Kernel | Features | +|-----|--------|----------| +| 1 | 5.13 | Basic filesystem | +| 2 | 5.19 | Cross-directory rename/link (`LANDLOCK_ACCESS_FS_REFER`) | +| 3 | 6.2 | Truncate control | +| 4 | 6.7 | Network TCP control | +| 5 | 6.10 | Device ioctl control (`LANDLOCK_ACCESS_FS_IOCTL_DEV`) | +| 6 | 6.12 | Signal + abstract unix socket scoping | + +**Security properties:** +- Enforced at kernel level (bypass-resistant) +- Cannot be disabled after application +- Works unprivileged with `NO_NEW_PRIVS` + +### Seccomp BPF + +Syscall filtering with immediate termination on violation. 
+ +**Filter approach:** Whitelist (allow known-safe syscalls, kill on others) + +**Blocked syscall categories:** +| Category | Syscalls | Reason | +|----------|----------|--------| +| Namespaces | `clone(CLONE_NEW*)`, `unshare`, `setns` | Prevent new namespaces | +| Mounting | `mount`, `umount`, `pivot_root` | Prevent FS manipulation | +| Debugging | `ptrace`, `process_vm_*` | Prevent process injection | +| Kernel | `reboot`, `kexec_load`, `init_module` | Prevent system damage | +| Privilege | `setuid`, `setgid`, `setgroups` | Prevent escalation | +| Keyring | `keyctl` | Not namespaced | +| eBPF | `bpf` | Kernel attack surface | +| Fileless | `memfd_create`, `execveat` | Bypass Landlock | + +**Argument filtering:** +| Syscall | Blocked Arguments | Reason | +|---------|-------------------|--------| +| `clone` | `CLONE_NEWUSER`, `CLONE_NEWNET`, `CLONE_NEWNS`, `CLONE_NEWPID`, `CLONE_NEWIPC`, `CLONE_NEWUTS`, `CLONE_NEWCGROUP` | Block namespace creation | +| `clone3` | Entirely blocked (returns `ENOSYS`) | Cannot inspect flags in userspace struct | +| `socket` | `AF_NETLINK`, `SOCK_RAW` | Block kernel interfaces | +| `ioctl` | `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` | Block terminal injection | + +**Violation behavior:** `SECCOMP_RET_KILL_PROCESS` (SIGSYS, signal 31) + +### Resource Limits + +Prevent denial-of-service attacks via kernel-enforced rlimits. + +| Resource | Limit | Purpose | +|----------|-------|---------| +| `RLIMIT_DATA` | 256 MiB | Memory usage | +| `RLIMIT_CPU` | timeout * 2 + 60s | CPU time limit | +| `RLIMIT_FSIZE` | 16 MiB | Output file size | +| `RLIMIT_NOFILE` | 256 | File descriptor limit | +| `RLIMIT_NPROC` | 64 | Fork bomb prevention | +| `RLIMIT_CORE` | 0 | Core dumps disabled | +| `RLIMIT_STACK` | 8 MiB | Stack size | + +Note: `RLIMIT_AS` (virtual address space) is intentionally not set. Modern runtimes like Go, Java, and V8 pre-allocate large virtual ranges but only commit small portions. 
+ +### Privilege Hardening + +Permanent privilege reduction applied before seccomp: + +| Mechanism | Effect | +|-----------|--------| +| `PR_SET_NO_NEW_PRIVS` | Cannot gain privileges via exec (setuid, file caps) | +| `SECBIT_NOROOT` (locked) | Root has no special privilege | +| `SECBIT_NO_SETUID_FIXUP` (locked) | Capabilities not adjusted on UID change | +| `SECBIT_KEEP_CAPS` (locked) | Cannot keep caps through exec | +| `SECBIT_NO_CAP_AMBIENT_RAISE` (locked) | Cannot set ambient capabilities | +| Drop all 64 capabilities | No capability-based operations possible | + +--- + +## Syscall Policy + +### Allowed Syscalls (~100) + +``` +Basic I/O: read, write, close, lseek, pread64, pwrite64 +File ops: openat, stat, fstat, access, readlink +Memory: mmap, mprotect, munmap, brk, mremap +Process: fork, vfork, execve, exit, exit_group, wait4 +Signals: rt_sigaction, rt_sigprocmask, rt_sigreturn, kill, tgkill +Time: clock_gettime, nanosleep, gettimeofday +Sockets: socket*, connect, bind, listen, accept, send*, recv* +Events: epoll_*, poll, select +``` + +Note: `kill` and `tgkill` are allowed because Landlock ABI 5 provides signal scoping — signals can only reach processes within the sandbox. 
+ +### Blocked Syscalls (examples) + +``` +Dangerous: ptrace, mount, reboot, kexec_load, init_module +Namespaces: clone3, unshare, setns (blocked or filtered) +Privilege: setuid, setgid, setresuid, setresgid +Kernel: bpf, perf_event_open, keyctl +Fileless: memfd_create, execveat (together enable fileless exec) +``` + +### Special Handling + +| Syscall | Handling | +|---------|----------| +| `clone` | Allowed, but `CLONE_NEW*` flags blocked | +| `clone3` | Returns `ENOSYS` (glibc falls back to `clone`) | +| `socket` | Allowed, but `AF_NETLINK` and `SOCK_RAW` blocked | +| `ioctl` | Allowed, but `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` blocked | + +--- + +## Threat Model + +### In Scope (Protected Against) + +| Threat | Mitigation | +|--------|------------| +| **Arbitrary code execution** | Sandboxed environment | +| **Filesystem escape** | Landlock v5 path rules | +| **Network access** | Landlock network control (ABI 4+) + seccomp socket filtering | +| **Process injection** | ptrace blocked by seccomp | +| **Privilege escalation** | NO_NEW_PRIVS + seccomp + capability drop | +| **Resource exhaustion** | rlimits + timeouts | +| **Fork bombs** | RLIMIT_NPROC | +| **Terminal injection** | TIOCSTI/TIOCLINUX blocked by seccomp | +| **Fileless malware** | memfd_create + execveat blocked by seccomp | +| **Cross-sandbox signals** | Landlock signal scoping (ABI 5) | +| **Abstract unix socket abuse** | Landlock IPC scoping (ABI 5) | + +### Out of Scope + +| Threat | Reason | +|--------|--------| +| **Kernel exploits** | Requires kernel hardening (grsecurity, etc.) 
| +| **Side-channel attacks** | Spectre/Meltdown require CPU mitigations | +| **Container breakout via 0-day** | Defense in depth limits impact | +| **Covert channels** | Timing-based data exfiltration possible | + +### CVE Protection + +evalbox's seccomp policy blocks attack vectors for many kernel CVEs: + +| CVE | Attack Vector | Blocked By | +|-----|---------------|------------| +| CVE-2024-1086 | AF_NETLINK + nf_tables | Socket filtering | +| CVE-2022-0185 | fsconfig + user namespace | CLONE_NEWUSER blocked | +| CVE-2022-0492 | cgroups + user namespace | CLONE_NEWUSER blocked | +| CVE-2017-5226 | TIOCSTI terminal injection | ioctl filtering | +| CVE-2019-13272 | ptrace PTRACE_TRACEME | ptrace blocked | +| CVE-2021-3490 | eBPF verifier bypass | bpf blocked | + +--- + +## Filesystem Access + +### Accessible Paths (via Landlock) + +| Path | Access | Purpose | +|------|--------|---------| +| `workspace/work` | Read-Write | User files | +| `workspace/tmp` | Read-Write | Temporary files | +| `workspace/home` | Read-Write | Home directory | +| `/usr` | Read-Only + Execute | Binaries, libraries | +| `/lib` | Read-Only + Execute | Shared libraries | +| `/lib64` | Read-Only + Execute | 64-bit libraries | +| `/bin` | Read-Only + Execute | Binaries | +| `/etc` | Read-Only | System config | +| `/proc` | Read-Only | Process info (no execute) | +| `/dev` | Read + Write | null, zero, urandom | +| `/nix/store` | Read-Only + Execute | NixOS paths (if present) | + +### Not Accessible + +| Path | Contains | Risk if Accessible | +|------|----------|-------------------| +| `/home` (host) | User data | Data theft | +| `/root` | Root home | Credential theft | +| `/sys` | Kernel interfaces | Kernel manipulation | +| `/var` | System state | Log manipulation | +| `/run` | Runtime data | Socket access | + +--- + +## Verification + +### Security Tests + +Run the security test suite to verify isolation: + +```bash +# Run all security tests +cargo test -p evalbox-sandbox --test 
security_tests -- --ignored + +# Run specific category +cargo test -p evalbox-sandbox --test security_tests seccomp -- --ignored +cargo test -p evalbox-sandbox --test security_tests filesystem -- --ignored +cargo test -p evalbox-sandbox --test security_tests network -- --ignored +cargo test -p evalbox-sandbox --test security_tests cve -- --ignored +``` + +Or via Nix: + +```bash +nix run .#test-all +``` + +### Test Coverage + +| Category | Tests | Coverage | +|----------|-------|----------| +| Seccomp | 9 | ptrace, mount, reboot, clone, socket, keyctl, bpf | +| Filesystem | 8 | /etc/shadow, /root, path traversal, symlinks | +| Network | 5 | External, localhost, loopback, DNS | +| Resources | 7 | Timeout, memory, PIDs, output limit | +| CVE | 10 | Real-world exploits blocked | + +--- + +## Production Requirements + +| Requirement | How to Verify | +|-------------|---------------| +| Kernel 6.12+ | `uname -r` | +| Landlock ABI 5 | `cat /sys/kernel/security/lsm` should include `landlock` | +| Seccomp enabled | `grep SECCOMP /boot/config-$(uname -r)` | +| Unprivileged BPF disabled | `sysctl kernel.unprivileged_bpf_disabled=1` (recommended) | + +Run `evalbox check` to verify all requirements automatically. 
+ +--- + +## References + +- [Architecture](ARCHITECTURE.md) +- [Security Policy](../SECURITY.md) +- [Landlock Documentation](https://docs.kernel.org/userspace-api/landlock.html) +- [Seccomp BPF](https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html) From ad947e71fdb49d4e80533d0364d4c5593b4a99cd Mon Sep 17 00:00:00 2001 From: fullzer4 Date: Sun, 22 Feb 2026 17:42:10 -0300 Subject: [PATCH 4/6] fix(ci): proper security test pipeline with payload support --- .github/workflows/ci.yml | 56 +++++++++------- crates/evalbox-sandbox/src/executor.rs | 4 +- .../evalbox-sandbox/src/isolation/lockdown.rs | 6 +- .../evalbox-sandbox/src/notify/scm_rights.rs | 12 +++- .../evalbox-sandbox/src/notify/supervisor.rs | 8 ++- crates/evalbox-sandbox/tests/common/mod.rs | 49 +++++++------- .../tests/security/filesystem.rs | 20 +++--- crates/evalbox-sys/src/seccomp_notify.rs | 17 +++-- nix/packages.nix | 67 +++++++++++-------- 9 files changed, 135 insertions(+), 104 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f962b3..9b42d56 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,43 +19,51 @@ jobs: contents: read steps: - uses: actions/checkout@v4 - - uses: DeterminateSystems/determinate-nix-action@v3 - - uses: DeterminateSystems/magic-nix-cache-action@main + - name: Run checks (clippy, fmt, test, doc) + run: nix flake check -L - - name: Run fast checks (clippy, fmt, test, doc) - run: | - nix build -L \ - .#checks.x86_64-linux.clippy \ - .#checks.x86_64-linux.fmt \ - .#checks.x86_64-linux.test \ - .#checks.x86_64-linux.doc + build-test-bin: + name: Build Security Tests + runs-on: ubuntu-latest + needs: nix-checks + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v4 + - uses: DeterminateSystems/determinate-nix-action@v3 + - uses: DeterminateSystems/magic-nix-cache-action@main + - name: Build security test binary + run: nix build -L .#security-test-bin + - name: Upload test 
binary + uses: actions/upload-artifact@v4 + with: + name: security-test-bin + path: result/bin/ - e2e-docker: - name: E2E Docker (${{ matrix.distro }}) + e2e: + name: E2E (${{ matrix.distro }}) runs-on: ubuntu-latest + needs: build-test-bin strategy: fail-fast: false matrix: distro: [ubuntu:24.04, fedora:41, alpine:3.21] steps: - - uses: actions/checkout@v4 - - - uses: DeterminateSystems/determinate-nix-action@v3 - - - uses: DeterminateSystems/magic-nix-cache-action@main - - - name: Build test binary - run: nix build -L .#checks.x86_64-linux.test - - - name: Run security tests in Docker (${{ matrix.distro }}) + - uses: actions/download-artifact@v4 + with: + name: security-test-bin + path: ./test-bin/ + - name: Run security tests in ${{ matrix.distro }} run: | - TEST_BIN=$(find result/ -name 'security_tests-*' -type f -executable | head -1) - docker run --rm \ + chmod +x ./test-bin/* + TEST_BIN=$(find ./test-bin/ -type f -executable | head -1) + docker run --rm --privileged \ -v "$TEST_BIN:/security_tests:ro" \ ${{ matrix.distro }} \ - /security_tests --ignored + /security_tests --ignored --test-threads=1 semver-check: name: SemVer Check diff --git a/crates/evalbox-sandbox/src/executor.rs b/crates/evalbox-sandbox/src/executor.rs index fc96168..57512fb 100644 --- a/crates/evalbox-sandbox/src/executor.rs +++ b/crates/evalbox-sandbox/src/executor.rs @@ -770,8 +770,8 @@ fn child_process( // 3. 
chdir to workspace/work let work_dir = workspace.root().join("work"); - let work_cstr = - CString::new(work_dir.to_string_lossy().as_bytes()).map_err(|_| ExecutorError::Exec(Errno::INVAL))?; + let work_cstr = CString::new(work_dir.to_string_lossy().as_bytes()) + .map_err(|_| ExecutorError::Exec(Errno::INVAL))?; if unsafe { libc::chdir(work_cstr.as_ptr()) } != 0 { return Err(ExecutorError::Exec(last_errno())); } diff --git a/crates/evalbox-sandbox/src/isolation/lockdown.rs b/crates/evalbox-sandbox/src/isolation/lockdown.rs index d9c96ff..3c0006e 100644 --- a/crates/evalbox-sandbox/src/isolation/lockdown.rs +++ b/crates/evalbox-sandbox/src/isolation/lockdown.rs @@ -157,7 +157,11 @@ fn apply_landlock_v5( } // Proc (read-only) - add_path_rule(&ruleset_fd, "/proc", read_access & !LANDLOCK_ACCESS_FS_EXECUTE); + add_path_rule( + &ruleset_fd, + "/proc", + read_access & !LANDLOCK_ACCESS_FS_EXECUTE, + ); // Dev (read + write for /dev/null etc.) add_path_rule( diff --git a/crates/evalbox-sandbox/src/notify/scm_rights.rs b/crates/evalbox-sandbox/src/notify/scm_rights.rs index 7637613..3acf6c4 100644 --- a/crates/evalbox-sandbox/src/notify/scm_rights.rs +++ b/crates/evalbox-sandbox/src/notify/scm_rights.rs @@ -56,7 +56,11 @@ pub fn send_fd(socket: RawFd, fd: RawFd) -> io::Result<()> { (*cmsg).cmsg_type = libc::SCM_RIGHTS; (*cmsg).cmsg_len = libc::CMSG_LEN(size_of::() as u32) as usize; let data_ptr = libc::CMSG_DATA(cmsg); - std::ptr::copy_nonoverlapping((&fd as *const RawFd).cast::(), data_ptr, size_of::()); + std::ptr::copy_nonoverlapping( + (&fd as *const RawFd).cast::(), + data_ptr, + size_of::(), + ); } let ret = unsafe { libc::sendmsg(socket, &msg, 0) }; @@ -106,7 +110,11 @@ pub fn recv_fd(socket: RawFd) -> io::Result { } let mut fd: RawFd = 0; let data_ptr = libc::CMSG_DATA(cmsg); - std::ptr::copy_nonoverlapping(data_ptr, (&mut fd as *mut RawFd).cast::(), size_of::()); + std::ptr::copy_nonoverlapping( + data_ptr, + (&mut fd as *mut RawFd).cast::(), + size_of::(), + ); 
Ok(OwnedFd::from_raw_fd(fd)) } } diff --git a/crates/evalbox-sandbox/src/notify/supervisor.rs b/crates/evalbox-sandbox/src/notify/supervisor.rs index 13a77fb..cf458d6 100644 --- a/crates/evalbox-sandbox/src/notify/supervisor.rs +++ b/crates/evalbox-sandbox/src/notify/supervisor.rs @@ -74,7 +74,10 @@ impl Supervisor { match self.mode { NotifyMode::Disabled => { - debug_assert!(false, "supervisor received notification with NotifyMode::Disabled"); + debug_assert!( + false, + "supervisor received notification with NotifyMode::Disabled" + ); self.respond_continue(&notif)?; Ok(None) } @@ -234,7 +237,8 @@ impl Supervisor { buf.truncate(nul_pos); } - String::from_utf8(buf).map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid UTF-8 in path")) + String::from_utf8(buf) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid UTF-8 in path")) } } diff --git a/crates/evalbox-sandbox/tests/common/mod.rs b/crates/evalbox-sandbox/tests/common/mod.rs index cad5812..f125ca5 100644 --- a/crates/evalbox-sandbox/tests/common/mod.rs +++ b/crates/evalbox-sandbox/tests/common/mod.rs @@ -16,43 +16,40 @@ pub fn payload(name: &str) -> Vec { /// Find payload in cargo's build directory structure. fn find_payload(name: &str) -> Option<PathBuf> { - // Get the workspace root by looking for Cargo.toml + // 1. Next to the test executable (Nix builds) + if let Ok(exe) = std::env::current_exe() { + if let Some(exe_dir) = exe.parent() { + let path = exe_dir.join("payloads").join(name); + if path.exists() { + return Some(path); + } + } + } + + // 2. 
Cargo build directory (development) let manifest_dir = std::env::var("CARGO_MANIFEST_DIR") .map(PathBuf::from) .unwrap_or_else(|_| PathBuf::from(".")); - // The workspace root is two levels up from crates/evalbox-sandbox let workspace_root = manifest_dir.parent()?.parent()?; - let target_dir = workspace_root.join("target"); - // Look in both debug and release builds - for profile in ["debug", "release"] { - let build_dir = target_dir.join(profile).join("build"); - if let Ok(entries) = std::fs::read_dir(&build_dir) { - for entry in entries.flatten() { - let dir_name = entry.file_name(); - if dir_name.to_string_lossy().starts_with("evalbox-sandbox-") { - let payload_path = entry.path().join("out").join("payloads").join(name); - if payload_path.exists() { - return Some(payload_path); - } - } - } - } - } + let target_dirs: Vec<PathBuf> = std::iter::once(workspace_root.join("target")) + .chain(std::env::var("CARGO_TARGET_DIR").ok().map(PathBuf::from)) + .collect(); - // Also try CARGO_TARGET_DIR if set - if let Ok(target) = std::env::var("CARGO_TARGET_DIR") { - let target_dir = PathBuf::from(target); + for target_dir in target_dirs { for profile in ["debug", "release"] { let build_dir = target_dir.join(profile).join("build"); if let Ok(entries) = std::fs::read_dir(&build_dir) { for entry in entries.flatten() { - let dir_name = entry.file_name(); - if dir_name.to_string_lossy().starts_with("evalbox-sandbox-") { - let payload_path = entry.path().join("out").join("payloads").join(name); - if payload_path.exists() { - return Some(payload_path); + if entry + .file_name() + .to_string_lossy() + .starts_with("evalbox-sandbox-") + { + let path = entry.path().join("out").join("payloads").join(name); + if path.exists() { + return Some(path); } } } diff --git a/crates/evalbox-sandbox/tests/security/filesystem.rs b/crates/evalbox-sandbox/tests/security/filesystem.rs index 0f9be09..fe08618 100644 --- a/crates/evalbox-sandbox/tests/security/filesystem.rs +++ 
b/crates/evalbox-sandbox/tests/security/filesystem.rs @@ -67,7 +67,10 @@ fn test_work_dir_is_writable() { ) .expect("Executor should run"); - assert!(output.success(), "Should be able to write to CWD (work dir)"); + assert!( + output.success(), + "Should be able to write to CWD (work dir)" + ); assert_eq!(output.stdout_str().trim(), "test content"); } @@ -101,10 +104,9 @@ fn test_tmp_is_writable() { #[test] #[ignore] fn test_path_traversal_blocked() { - let output = Executor::run( - Plan::new(["cat", "../../../etc/shadow"]).timeout(Duration::from_secs(5)), - ) - .expect("Executor should run"); + let output = + Executor::run(Plan::new(["cat", "../../../etc/shadow"]).timeout(Duration::from_secs(5))) + .expect("Executor should run"); // Landlock should block access to /etc/shadow (no read on shadow, even via traversal) assert!( @@ -120,12 +122,8 @@ fn test_path_traversal_blocked() { #[ignore] fn test_symlink_escape_blocked() { let output = Executor::run( - Plan::new([ - "sh", - "-c", - "ln -s /etc/shadow ./shadow && cat ./shadow", - ]) - .timeout(Duration::from_secs(5)), + Plan::new(["sh", "-c", "ln -s /etc/shadow ./shadow && cat ./shadow"]) + .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); diff --git a/crates/evalbox-sys/src/seccomp_notify.rs b/crates/evalbox-sys/src/seccomp_notify.rs index 9edf775..91a2357 100644 --- a/crates/evalbox-sys/src/seccomp_notify.rs +++ b/crates/evalbox-sys/src/seccomp_notify.rs @@ -164,8 +164,13 @@ pub unsafe fn seccomp_set_mode_filter_listener(fprog: &SockFprog) -> Result Result<(), Errno> { - let ret = - unsafe { libc::ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_RECV, notif as *mut SeccompNotif) }; + let ret = unsafe { + libc::ioctl( + listener_fd, + SECCOMP_IOCTL_NOTIF_RECV, + notif as *mut SeccompNotif, + ) + }; if ret < 0 { Err(last_errno()) } else { Ok(()) } } @@ -194,13 +199,7 @@ pub fn notif_send(listener_fd: i32, resp: &SeccompNotifResp) -> Result<(), Errno /// /// Returns `Errno::NOENT` if the notification is no 
longer valid. pub fn notif_id_valid(listener_fd: i32, id: u64) -> Result<(), Errno> { - let ret = unsafe { - libc::ioctl( - listener_fd, - SECCOMP_IOCTL_NOTIF_ID_VALID, - &id as *const u64, - ) - }; + let ret = unsafe { libc::ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id as *const u64) }; if ret < 0 { Err(last_errno()) } else { Ok(()) } } diff --git a/nix/packages.nix b/nix/packages.nix index 2bbd50c..54b4f4c 100644 --- a/nix/packages.nix +++ b/nix/packages.nix @@ -1,44 +1,57 @@ { ... }: { - perSystem = { pkgs, craneLib, toolchainWithExtensions, src, commonArgs, cargoArtifacts, ... }: { + perSystem = { pkgs, craneLib, src, commonArgs, cargoArtifacts, ... }: + let + srcWithPayloads = pkgs.lib.cleanSourceWith { + src = ./..; + filter = path: type: + (craneLib.filterCargoSources path type) + || (builtins.match ".*\\.c$" path != null); + }; + in { packages = { default = craneLib.buildPackage (commonArgs // { inherit cargoArtifacts; }); - test-all = pkgs.writeShellApplication { - name = "evalbox-test-all"; - runtimeInputs = [ toolchainWithExtensions pkgs.pkg-config pkgs.gcc ]; - text = '' - cargo test --lib - - cargo build -p evalbox-sandbox - - cargo test -p evalbox-sandbox --test security_tests --ignored -- --test-threads=1 + security-test-bin = craneLib.mkCargoDerivation (commonArgs // { + inherit cargoArtifacts; + src = srcWithPayloads; + pnameSuffix = "-security-tests"; + doCheck = false; + nativeBuildInputs = (commonArgs.nativeBuildInputs or []) ++ [ pkgs.jq pkgs.gcc ]; + buildPhaseCargoCommand = '' + cargo test -p evalbox-sandbox --test security_tests \ + --no-run --release --message-format=json 2>/dev/null \ + | jq -r 'select(.executable != null) | .executable' \ + > /tmp/test-bins.txt ''; - }; + installPhaseCommand = '' + mkdir -p $out/bin/payloads + while IFS= read -r bin; do + [ -f "$bin" ] && cp "$bin" $out/bin/ + done < /tmp/test-bins.txt + for dir in target/release/build/evalbox-sandbox-*/out/payloads; do + [ -d "$dir" ] && cp "$dir"/* 
$out/bin/payloads/ + done + ''; + }); }; checks = { - clippy = craneLib.cargoClippy { - inherit src cargoArtifacts; - pname = "evalbox-clippy"; + clippy = craneLib.cargoClippy (commonArgs // { + inherit cargoArtifacts; cargoClippyExtraArgs = "--all-targets -- -D warnings"; - }; - fmt = craneLib.cargoFmt { - inherit src; - pname = "evalbox-fmt"; - }; - test = craneLib.cargoTest { - inherit src cargoArtifacts; - pname = "evalbox-test"; + }); + fmt = craneLib.cargoFmt { inherit src; }; + test = craneLib.cargoTest (commonArgs // { + inherit cargoArtifacts; cargoTestExtraArgs = "--lib"; - }; - doc = craneLib.cargoDoc { - inherit src cargoArtifacts; - pname = "evalbox-doc"; + }); + doc = craneLib.cargoDoc (commonArgs // { + inherit cargoArtifacts; RUSTDOCFLAGS = "-D warnings"; - }; + }); }; }; } From 89048c9d723865754ce849349b1fc264aa382eb4 Mon Sep 17 00:00:00 2001 From: fullzer4 Date: Sun, 22 Feb 2026 17:58:03 -0300 Subject: [PATCH 5/6] fix(ci): mount nix store in docker for e2e tests --- .github/workflows/ci.yml | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9b42d56..0415181 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,46 +24,30 @@ jobs: - name: Run checks (clippy, fmt, test, doc) run: nix flake check -L - build-test-bin: - name: Build Security Tests + e2e: + name: E2E (${{ matrix.distro }}) runs-on: ubuntu-latest needs: nix-checks permissions: id-token: write contents: read + strategy: + fail-fast: false + matrix: + distro: [ubuntu:24.04, fedora:41, alpine:3.21] steps: - uses: actions/checkout@v4 - uses: DeterminateSystems/determinate-nix-action@v3 - uses: DeterminateSystems/magic-nix-cache-action@main - name: Build security test binary run: nix build -L .#security-test-bin - - name: Upload test binary - uses: actions/upload-artifact@v4 - with: - name: security-test-bin - path: result/bin/ - - e2e: - name: E2E (${{ 
matrix.distro }}) - runs-on: ubuntu-latest - needs: build-test-bin - strategy: - fail-fast: false - matrix: - distro: [ubuntu:24.04, fedora:41, alpine:3.21] - steps: - - uses: actions/download-artifact@v4 - with: - name: security-test-bin - path: ./test-bin/ - name: Run security tests in ${{ matrix.distro }} run: | - chmod +x ./test-bin/* - TEST_BIN=$(find ./test-bin/ -type f -executable | head -1) + TEST_BIN=$(realpath result/bin/security_tests-*) docker run --rm --privileged \ - -v "$TEST_BIN:/security_tests:ro" \ + -v /nix/store:/nix/store:ro \ ${{ matrix.distro }} \ - /security_tests --ignored --test-threads=1 + "$TEST_BIN" --ignored --test-threads=1 semver-check: name: SemVer Check From dcd667176f799eb0821716af74b5fffdf9c8f350 Mon Sep 17 00:00:00 2001 From: fullzer4 Date: Sun, 22 Feb 2026 18:17:25 -0300 Subject: [PATCH 6/6] fix(ci): cargo fmt + e2e commented until GHA gets kernel 6.12+ --- .github/workflows/ci.yml | 52 +++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0415181..e7a297e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,30 +24,34 @@ jobs: - name: Run checks (clippy, fmt, test, doc) run: nix flake check -L - e2e: - name: E2E (${{ matrix.distro }}) - runs-on: ubuntu-latest - needs: nix-checks - permissions: - id-token: write - contents: read - strategy: - fail-fast: false - matrix: - distro: [ubuntu:24.04, fedora:41, alpine:3.21] - steps: - - uses: actions/checkout@v4 - - uses: DeterminateSystems/determinate-nix-action@v3 - - uses: DeterminateSystems/magic-nix-cache-action@main - - name: Build security test binary - run: nix build -L .#security-test-bin - - name: Run security tests in ${{ matrix.distro }} - run: | - TEST_BIN=$(realpath result/bin/security_tests-*) - docker run --rm --privileged \ - -v /nix/store:/nix/store:ro \ - ${{ matrix.distro }} \ - "$TEST_BIN" --ignored --test-threads=1 + # E2E security 
tests require kernel 6.12+ (Landlock ABI v5). + # GHA ubuntu-latest currently ships 6.11; image 20260209 has 6.14 but hasn't propagated yet. + # Uncomment when runners get kernel 6.12+. + # + # e2e: + # name: E2E (${{ matrix.distro }}) + # runs-on: ubuntu-latest + # needs: nix-checks + # permissions: + # id-token: write + # contents: read + # strategy: + # fail-fast: false + # matrix: + # distro: [ubuntu:24.04, fedora:41, alpine:3.21] + # steps: + # - uses: actions/checkout@v4 + # - uses: DeterminateSystems/determinate-nix-action@v3 + # - uses: DeterminateSystems/magic-nix-cache-action@main + # - name: Build security test binary + # run: nix build -L .#security-test-bin + # - name: Run security tests in ${{ matrix.distro }} + # run: | + # TEST_BIN=$(realpath result/bin/security_tests-*) + # docker run --rm --privileged \ + # -v /nix/store:/nix/store:ro \ + # ${{ matrix.distro }} \ + # "$TEST_BIN" --ignored --test-threads=1 semver-check: name: SemVer Check