diff --git a/.cargo/config.toml b/.cargo/config.toml deleted file mode 100644 index d47f4ee..0000000 --- a/.cargo/config.toml +++ /dev/null @@ -1,13 +0,0 @@ -[target.x86_64-unknown-linux-gnu] -linker = "clang" -rustflags = ["-C", "link-arg=-fuse-ld=mold"] - -[target.aarch64-unknown-linux-gnu] -linker = "clang" -rustflags = ["-C", "link-arg=-fuse-ld=mold"] - -[build] -rustflags = ["-C", "target-cpu=native"] - -[term] -color = "always" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c65f903..e7a297e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,45 +6,52 @@ on: pull_request: branches: [main] -env: - CARGO_TERM_COLOR: always - RUSTFLAGS: -Dwarnings +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: - fmt: - name: Format + nix-checks: + name: Nix Checks runs-on: ubuntu-latest + permissions: + id-token: write + contents: read steps: - uses: actions/checkout@v4 - - run: rustup component add rustfmt - - run: cargo fmt --all --check + - uses: DeterminateSystems/determinate-nix-action@v3 + - uses: DeterminateSystems/magic-nix-cache-action@main + - name: Run checks (clippy, fmt, test, doc) + run: nix flake check -L - clippy: - name: Clippy - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - run: rustup component add clippy - - uses: Swatinem/rust-cache@v2 - - run: cargo clippy --all-targets --all-features - - doc: - name: Documentation - runs-on: ubuntu-latest - env: - RUSTDOCFLAGS: -Dwarnings - steps: - - uses: actions/checkout@v4 - - uses: Swatinem/rust-cache@v2 - - run: cargo doc --no-deps --all-features - - unit-tests: - name: Unit Tests - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: Swatinem/rust-cache@v2 - - run: cargo test --lib --all-features + # E2E security tests require kernel 6.12+ (Landlock ABI v5). 
+ # GHA ubuntu-latest currently ships 6.11; image 20260209 has 6.14 but hasn't propagated yet. + # Uncomment when runners get kernel 6.12+. + # + # e2e: + # name: E2E (${{ matrix.distro }}) + # runs-on: ubuntu-latest + # needs: nix-checks + # permissions: + # id-token: write + # contents: read + # strategy: + # fail-fast: false + # matrix: + # distro: [ubuntu:24.04, fedora:41, alpine:3.21] + # steps: + # - uses: actions/checkout@v4 + # - uses: DeterminateSystems/determinate-nix-action@v3 + # - uses: DeterminateSystems/magic-nix-cache-action@main + # - name: Build security test binary + # run: nix build -L .#security-test-bin + # - name: Run security tests in ${{ matrix.distro }} + # run: | + # TEST_BIN=$(realpath result/bin/security_tests-*) + # docker run --rm --privileged \ + # -v /nix/store:/nix/store:ro \ + # ${{ matrix.distro }} \ + # "$TEST_BIN" --ignored --test-threads=1 semver-check: name: SemVer Check @@ -52,7 +59,6 @@ jobs: steps: - uses: actions/checkout@v4 - uses: Swatinem/rust-cache@v2 - - name: Check semver - uses: obi1kenobi/cargo-semver-checks-action@v2 + - uses: obi1kenobi/cargo-semver-checks-action@v2 with: package: evalbox diff --git a/.gitignore b/.gitignore index 73e2f03..a803b6f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ .direnv target/ -bindings/ \ No newline at end of file +bindings/ + +# Local cargo config +.cargo/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 4408a55..fdf27b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,40 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.1.1] - 2026-02-22 + +### Changed + +- **Architecture: remove namespace isolation, use Landlock v5 as primary** + - Removed user, PID, network, mount, UTS, and IPC namespaces + - Removed `pivot_root` and bind mount rootfs setup + - Landlock v5 replaces namespaces for filesystem, network, signal, and IPC control + - Plain `fork()` instead of `clone()` with `CLONE_NEW*` flags + - Minimum kernel raised from 5.13 to 6.12 (Landlock ABI 5) + +- **Resource limits moved to dedicated module** (`isolation/rlimits.rs`) + - `RLIMIT_DATA` (256 MiB) instead of `RLIMIT_AS`, which breaks Go/Java/V8 runtimes + - Added `RLIMIT_CPU`, `RLIMIT_CORE`, `RLIMIT_STACK` + +- **Nix flake migrated to flake-parts + import-tree** + - Auto-discovery of modules via `import-tree ./nix` + - Removed manual `forAllSystems` boilerplate + - Restricted to `x86_64-linux` (ARM not yet supported) + +### Added + +- Seccomp user notify support (`SECCOMP_RET_USER_NOTIF`) for optional syscall interception +- `nix run .#test-all` to run the full security test suite +- `SECURITY.md` — GitHub standard vulnerability reporting policy +- `CONTRIBUTING.md` — development setup, testing guide +- Security hardening roadmap (UDP filtering, /proc restriction, optional PID namespace) + +### Removed + +- `crates/evalbox-sandbox/src/isolation/namespace.rs` — namespace setup +- `crates/evalbox-sandbox/src/isolation/rootfs.rs` — pivot_root + bind mounts +- `nix/lib.nix`, `nix/checks.nix`, `nix/tests/` — replaced by flake-parts modules + ## [0.1.0] - 2025-02-17 ### Added @@ -19,14 +53,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Go runtime with compilation caching - Shell runtime for script execution -- **Security isolation** (7 layers of defense) - - User namespaces (unprivileged containers) - - PID namespace (process isolation) - - Network namespace (network isolation) - - Mount namespace + pivot_root (filesystem isolation) - - Landlock LSM (kernel-enforced filesystem rules) +- 
**Security isolation** + - Landlock v5 (filesystem, network, signal, IPC access control) - Seccomp BPF (syscall whitelist with ~100 allowed syscalls) - Resource limits (memory, PIDs, file descriptors, timeout) + - Privilege hardening (NO_NEW_PRIVS, securebits, capability drop) - **Seccomp filtering** - Whitelist-based syscall filter diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..dd919ef --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,80 @@ +# Contributing + +## Development Setup + +evalbox uses Nix for a reproducible dev environment: + +```bash +nix develop +``` + +This provides the Rust toolchain, GCC (for test payloads), Python, and Go. + +## Building + +```bash +cargo build +``` + +## Testing + +### Fast checks (CI) + +```bash +# Runs via nix: clippy, fmt, unit tests, docs +nix flake check +``` + +Or manually: + +```bash +cargo clippy --all-targets -- -D warnings +cargo fmt --check +cargo test --lib +cargo doc --no-deps +``` + +### Full test suite (requires Linux 6.12+ with Landlock v5) + +```bash +nix run .#test-all +``` + +Or manually: + +```bash +cargo build -p evalbox-sandbox +cargo test -p evalbox-sandbox --test security_tests -- --ignored --test-threads=1 +``` + +The security tests require Linux kernel 6.12+ (Landlock ABI 5) with seccomp enabled. They compile C payloads that attempt real exploit techniques (CVEs, syscall abuse, escape vectors) and verify the sandbox blocks them. 
+ +### Running specific test categories + +```bash +cargo test -p evalbox-sandbox --test security_tests seccomp -- --ignored +cargo test -p evalbox-sandbox --test security_tests filesystem -- --ignored +cargo test -p evalbox-sandbox --test security_tests network -- --ignored +cargo test -p evalbox-sandbox --test security_tests cve -- --ignored +cargo test -p evalbox-sandbox --test security_tests resources -- --ignored +``` + +## Project Structure + +``` +evalbox/ # Public API, language runtimes +evalbox-sandbox/ # Sandbox orchestration, isolation +evalbox-sys/ # Low-level Linux syscall wrappers +``` + +See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for details. + +## Pull Requests + +- Run `nix flake check` before submitting +- Security-related changes should include tests in `crates/evalbox-sandbox/tests/security/` +- Keep the seccomp whitelist minimal: don't add syscalls without justification + +## Security + +Found a vulnerability? See [SECURITY.md](SECURITY.md) for reporting instructions. 
diff --git a/Cargo.lock b/Cargo.lock index 9e06bdb..bb2e97b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -59,12 +59,6 @@ dependencies = [ "parking_lot_core", ] -[[package]] -name = "either" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" - [[package]] name = "env_home" version = "0.1.0" @@ -101,7 +95,7 @@ dependencies = [ "tempfile", "thiserror", "walkdir", - "which 7.0.3", + "which", ] [[package]] @@ -115,7 +109,7 @@ dependencies = [ "rustix", "tempfile", "thiserror", - "which 8.0.0", + "which", ] [[package]] @@ -610,18 +604,6 @@ dependencies = [ "semver", ] -[[package]] -name = "which" -version = "7.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d643ce3fd3e5b54854602a080f34fb10ab75e0b813ee32d00ca2b44fa74762" -dependencies = [ - "either", - "env_home", - "rustix", - "winsafe", -] - [[package]] name = "which" version = "8.0.0" diff --git a/Cargo.toml b/Cargo.toml index bc6800c..96b7910 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,9 @@ members = [ "crates/evalbox-sandbox", ] +[workspace.metadata.crane] +name = "evalbox" + [workspace.package] version = "0.1.0" edition = "2024" @@ -30,7 +33,7 @@ rustix = { version = "1", features = ["event", "process", "system", "mount", "fs thiserror = "2" tempfile = "3" mio = { version = "1.0", features = ["os-poll", "os-ext"] } -which = "7" +which = "8" [workspace.lints.rust] unsafe_op_in_unsafe_fn = "warn" @@ -39,7 +42,6 @@ unused_must_use = "warn" [workspace.lints.clippy] all = { level = "warn", priority = -1 } -# Useful pedantic lints (not all) cast_possible_truncation = "warn" cast_sign_loss = "warn" cloned_instead_of_copied = "warn" diff --git a/README.md b/README.md index 379b8bf..9dae5a0 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Execute code like `eval()`, but safe. No containers, no VMs, no root. 
- **Simple** - One function call, security handled for you - **Multi-language** - Python, Go, and shell/terminal commands - **Fast** - Millisecond startup, no containers or VMs -- **Secure** - 7 layers of isolation (namespaces, Landlock, seccomp, rlimits) +- **Secure** - Landlock v5 + seccomp-BPF + rlimits, no namespaces needed ## Quick Start @@ -38,8 +38,8 @@ let output = shell::run("curl https://example.com") ## Requirements -- Linux kernel 5.13+ (Landlock ABI 1+) -- User namespaces enabled +- Linux kernel 6.12+ (Landlock ABI 5) +- Seccomp enabled ## Installation @@ -50,15 +50,16 @@ evalbox = { version = "0.1", features = ["python", "go", "shell"] } ## Security -7 layers of isolation: user namespaces, PID namespace, network namespace, mount namespace + pivot_root, Landlock LSM, seccomp BPF, rlimits. +Isolation via Landlock v5 (filesystem + network + signal + IPC scoping), seccomp-BPF (syscall whitelist), rlimits, privilege hardening (NO_NEW_PRIVS, securebits, capability drop). -See [SECURITY.md](docs/SECURITY.md) for threat model and CVE protections. +See [Security Model](docs/SECURITY_MODEL.md) for threat model and CVE protections. ## Documentation - [Architecture](docs/ARCHITECTURE.md) -- [Security Model](docs/SECURITY.md) +- [Security Model](docs/SECURITY_MODEL.md) - [Roadmap](docs/ROADMAP.md) +- [Contributing](CONTRIBUTING.md) ## License diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..0ce141b --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,42 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +|---------|-----------| +| 0.1.x | Yes | + +## Reporting a Vulnerability + +If you discover a security vulnerability in evalbox, **please do not open a public issue.** + +Instead, report it privately via [GitHub Security Advisories](https://github.com/fullzer4/evalbox/security/advisories/new). 
+ +Include: +- Description of the vulnerability +- Steps to reproduce +- Which isolation mechanism is affected (Landlock, seccomp, rlimits, privilege hardening) +- Impact assessment (sandbox escape, info leak, DoS, etc.) + +You should receive a response within **72 hours**. Critical sandbox escape vulnerabilities are treated as highest priority. + +## Scope + +evalbox provides isolation via Landlock v5, seccomp-BPF, rlimits, and privilege hardening. The following are in scope for security reports: + +- Sandbox escape (code executing outside isolation) +- Filesystem access beyond Landlock-allowed paths +- Network access when disabled +- Privilege escalation from sandbox +- Seccomp filter bypass +- Landlock rule bypass +- Resource limit bypass (memory, PIDs, file descriptors) + +See [docs/SECURITY_MODEL.md](docs/SECURITY_MODEL.md) for the full threat model and isolation architecture. + +## Out of Scope + +- Kernel 0-day exploits (requires kernel hardening) +- CPU side-channel attacks (Spectre/Meltdown) +- Denial of service against the host kernel +- Issues requiring non-default kernel configurations diff --git a/crates/evalbox-sandbox/Cargo.toml b/crates/evalbox-sandbox/Cargo.toml index 482e35e..7b3224e 100644 --- a/crates/evalbox-sandbox/Cargo.toml +++ b/crates/evalbox-sandbox/Cargo.toml @@ -14,7 +14,7 @@ rustix.workspace = true tempfile.workspace = true mio.workspace = true thiserror.workspace = true -which = "8" +which.workspace = true [build-dependencies] cc = "1.2" diff --git a/crates/evalbox-sandbox/src/executor.rs b/crates/evalbox-sandbox/src/executor.rs index 7d659de..57512fb 100644 --- a/crates/evalbox-sandbox/src/executor.rs +++ b/crates/evalbox-sandbox/src/executor.rs @@ -39,7 +39,7 @@ use std::collections::HashMap; use std::ffi::CString; use std::io::{self, Write as _}; use std::os::fd::{AsRawFd, OwnedFd, RawFd}; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::time::{Duration, Instant}; use mio::unix::SourceFd; @@ -48,14 +48,16 @@ use 
rustix::io::Errno; use rustix::process::{Pid, PidfdFlags, Signal, pidfd_open, pidfd_send_signal}; use thiserror::Error; -use evalbox_sys::{check, last_errno}; - -use crate::isolation::{ - LockdownError, bind_mount, lockdown, make_rprivate, mount_minimal_dev, mount_proc, - pivot_root_and_cleanup, set_hostname, setup_id_maps, +use evalbox_sys::seccomp::{ + DEFAULT_WHITELIST, NOTIFY_FS_SYSCALLS, SockFprog, build_notify_filter, build_whitelist_filter, }; +use evalbox_sys::seccomp_notify::seccomp_set_mode_filter_listener; +use evalbox_sys::{check, last_errno, seccomp::seccomp_set_mode_filter}; + +use crate::isolation::{LockdownError, close_extra_fds, lockdown}; use crate::monitor::{Output, Status, monitor, set_nonblocking, wait_for_exit, write_stdin}; -use crate::plan::{Mount, Plan}; +use crate::notify::scm_rights; +use crate::plan::{Mount, NotifyMode, Plan}; use crate::resolve::{ResolvedBinary, resolve_binary}; use crate::validate::validate_cmd; use crate::workspace::Workspace; @@ -75,15 +77,6 @@ pub enum ExecutorError { #[error("fork: {0}")] Fork(Errno), - #[error("unshare: {0}")] - Unshare(Errno), - - #[error("id map: {0}")] - IdMap(io::Error), - - #[error("rootfs: {0}")] - Rootfs(Errno), - #[error("lockdown: {0}")] Lockdown(#[from] LockdownError), @@ -102,6 +95,9 @@ pub enum ExecutorError { #[error("command not found: {0}")] CommandNotFound(String), + #[error("seccomp notify: {0}")] + SeccompNotify(String), + #[error("io: {0}")] Io(#[from] io::Error), } @@ -155,12 +151,17 @@ impl ExecutionInfo { } /// A spawned sandbox that hasn't been waited on yet. +/// +/// Some fields are never read but kept alive for RAII (fd lifetime, temp dir cleanup). +#[allow(dead_code)] struct SpawnedSandbox { pidfd: OwnedFd, stdin_fd: RawFd, stdout_fd: RawFd, stderr_fd: RawFd, - #[allow(dead_code)] + /// Seccomp listener fd kept alive for RAII; future supervisor integration. + notify_fd: Option, + /// Workspace kept alive so temp directory isn't deleted while sandbox runs. 
workspace: std::mem::ManuallyDrop, } @@ -235,15 +236,22 @@ impl Executor { let workspace = Workspace::with_prefix("evalbox-").map_err(ExecutorError::Workspace)?; + workspace + .setup_sandbox_dirs() + .map_err(ExecutorError::Workspace)?; for file in &plan.user_files { + let work_path = format!("work/{}", file.path); workspace - .write_file(&file.path, &file.content, file.executable) + .write_file(&work_path, &file.content, file.executable) .map_err(ExecutorError::Workspace)?; } - workspace - .setup_sandbox_dirs() - .map_err(ExecutorError::Workspace)?; - create_mount_dirs(&workspace, &exec_info, &plan)?; + + // Create socketpair for notify fd transfer (if needed) + let notify_sockets = if plan.notify_mode != NotifyMode::Disabled { + Some(scm_rights::create_socketpair().map_err(ExecutorError::Workspace)?) + } else { + None + }; let child_pid = unsafe { libc::fork() }; if child_pid < 0 { @@ -251,7 +259,9 @@ impl Executor { } if child_pid == 0 { - match child_process(&workspace, &plan, &exec_info) { + // In child: close parent's socket end + let child_socket = notify_sockets.map(|(_, child)| child); + match child_process(&workspace, &plan, &exec_info, child_socket.as_ref()) { Ok(()) => unsafe { libc::_exit(127) }, Err(e) => { writeln!(io::stderr(), "sandbox error: {e}").ok(); @@ -263,7 +273,22 @@ impl Executor { let pid = unsafe { Pid::from_raw_unchecked(child_pid) }; let pidfd = pidfd_open(pid, PidfdFlags::empty()).map_err(ExecutorError::Pidfd)?; - blocking_parent(child_pid, pidfd, workspace, plan) + // Parent: receive notify fd if applicable + let notify_fd = if let Some((parent_socket, _)) = notify_sockets { + poll_or_kill( + parent_socket.as_raw_fd(), + child_pid, + "timeout waiting for notify fd", + )?; + Some( + scm_rights::recv_fd(parent_socket.as_raw_fd()) + .map_err(|e| ExecutorError::SeccompNotify(e.to_string()))?, + ) + } else { + None + }; + + blocking_parent(child_pid, pidfd, notify_fd, workspace, plan) } /// Spawn a new sandbox. 
Returns immediately with a [`SandboxId`]. @@ -534,6 +559,50 @@ impl Executor { } } +/// Close the parent-side pipe ends that the child uses (stdin read, stdout write, stderr write). +fn close_parent_pipe_ends(workspace: &Workspace) { + unsafe { + libc::close(workspace.pipes.stdin.read.as_raw_fd()); + libc::close(workspace.pipes.stdout.write.as_raw_fd()); + libc::close(workspace.pipes.stderr.write.as_raw_fd()); + } +} + +/// Poll an fd with a 30-second timeout; kill the child on timeout or error. +fn poll_or_kill(fd: RawFd, child_pid: libc::pid_t, msg: &str) -> Result<(), ExecutorError> { + let mut pfd = libc::pollfd { + fd, + events: libc::POLLIN, + revents: 0, + }; + if unsafe { libc::poll(&mut pfd, 1, 30000) } <= 0 { + unsafe { libc::kill(child_pid, libc::SIGKILL) }; + return Err(ExecutorError::ChildSetup(msg.into())); + } + Ok(()) +} + +/// Wait for the child to signal readiness via eventfd, then signal back. +fn sync_with_child(workspace: &Workspace, child_pid: libc::pid_t) -> Result<(), ExecutorError> { + let child_ready_fd = workspace.pipes.sync.child_ready_fd(); + poll_or_kill(child_ready_fd, child_pid, "timeout waiting for child")?; + + let mut value: u64 = 0; + if unsafe { libc::read(child_ready_fd, (&mut value as *mut u64).cast(), 8) } != 8 { + unsafe { libc::kill(child_pid, libc::SIGKILL) }; + return Err(ExecutorError::ChildSetup("eventfd read failed".into())); + } + + let parent_done_fd = workspace.pipes.sync.parent_done_fd(); + let signal_value: u64 = 1; + if unsafe { libc::write(parent_done_fd, (&signal_value as *const u64).cast(), 8) } != 8 { + unsafe { libc::kill(child_pid, libc::SIGKILL) }; + return Err(ExecutorError::ChildSetup("eventfd write failed".into())); + } + + Ok(()) +} + fn spawn_sandbox(plan: Plan) -> Result { let cmd_refs: Vec<&str> = plan.cmd.iter().map(|s| s.as_str()).collect(); validate_cmd(&cmd_refs).map_err(ExecutorError::Validation)?; @@ -552,15 +621,22 @@ fn spawn_sandbox(plan: Plan) -> Result { let workspace = 
Workspace::with_prefix("evalbox-").map_err(ExecutorError::Workspace)?; + workspace + .setup_sandbox_dirs() + .map_err(ExecutorError::Workspace)?; for file in &plan.user_files { + let work_path = format!("work/{}", file.path); workspace - .write_file(&file.path, &file.content, file.executable) + .write_file(&work_path, &file.content, file.executable) .map_err(ExecutorError::Workspace)?; } - workspace - .setup_sandbox_dirs() - .map_err(ExecutorError::Workspace)?; - create_mount_dirs(&workspace, &exec_info, &plan)?; + + // Create socketpair for notify fd transfer (if needed) + let notify_sockets = if plan.notify_mode != NotifyMode::Disabled { + Some(scm_rights::create_socketpair().map_err(ExecutorError::Workspace)?) + } else { + None + }; let child_pid = unsafe { libc::fork() }; if child_pid < 0 { @@ -568,7 +644,8 @@ fn spawn_sandbox(plan: Plan) -> Result { } if child_pid == 0 { - match child_process(&workspace, &plan, &exec_info) { + let child_socket = notify_sockets.map(|(_, child)| child); + match child_process(&workspace, &plan, &exec_info, child_socket.as_ref()) { Ok(()) => unsafe { libc::_exit(127) }, Err(e) => { writeln!(io::stderr(), "sandbox error: {e}").ok(); @@ -580,47 +657,28 @@ fn spawn_sandbox(plan: Plan) -> Result { let pid = unsafe { Pid::from_raw_unchecked(child_pid) }; let pidfd = pidfd_open(pid, PidfdFlags::empty()).map_err(ExecutorError::Pidfd)?; - // Parent: close unused pipe ends let stdin_write_fd = workspace.pipes.stdin.write.as_raw_fd(); let stdout_read_fd = workspace.pipes.stdout.read.as_raw_fd(); let stderr_read_fd = workspace.pipes.stderr.read.as_raw_fd(); - unsafe { - libc::close(workspace.pipes.stdin.read.as_raw_fd()); - libc::close(workspace.pipes.stdout.write.as_raw_fd()); - libc::close(workspace.pipes.stderr.write.as_raw_fd()); - } + close_parent_pipe_ends(&workspace); - // Wait for child to signal readiness - let child_ready_fd = workspace.pipes.sync.child_ready_fd(); - let mut pfd = libc::pollfd { - fd: child_ready_fd, - events: 
libc::POLLIN, - revents: 0, + // Receive notify fd from child if applicable + let notify_fd = if let Some((parent_socket, _)) = notify_sockets { + poll_or_kill( + parent_socket.as_raw_fd(), + child_pid, + "timeout waiting for notify fd", + )?; + Some( + scm_rights::recv_fd(parent_socket.as_raw_fd()) + .map_err(|e| ExecutorError::SeccompNotify(e.to_string()))?, + ) + } else { + None }; - if unsafe { libc::poll(&mut pfd, 1, 30000) } <= 0 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup( - "timeout waiting for child".into(), - )); - } - - let mut value: u64 = 0; - if unsafe { libc::read(child_ready_fd, (&mut value as *mut u64).cast(), 8) } != 8 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup("eventfd read failed".into())); - } - - setup_id_maps(child_pid).map_err(ExecutorError::IdMap)?; - - // Signal child to continue - let parent_done_fd = workspace.pipes.sync.parent_done_fd(); - let signal_value: u64 = 1; - if unsafe { libc::write(parent_done_fd, (&signal_value as *const u64).cast(), 8) } != 8 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup("eventfd write failed".into())); - } + sync_with_child(&workspace, child_pid)?; // Write stdin if provided if let Some(ref stdin_data) = plan.stdin { @@ -647,6 +705,7 @@ fn spawn_sandbox(plan: Plan) -> Result { }, stdout_fd: stdout_read_fd, stderr_fd: stderr_read_fd, + notify_fd, workspace: std::mem::ManuallyDrop::new(workspace), }) } @@ -654,45 +713,15 @@ fn spawn_sandbox(plan: Plan) -> Result { fn blocking_parent( child_pid: libc::pid_t, pidfd: OwnedFd, + _notify_fd: Option, workspace: Workspace, plan: Plan, ) -> Result { let workspace = std::mem::ManuallyDrop::new(workspace); - unsafe { - libc::close(workspace.pipes.stdin.read.as_raw_fd()); - libc::close(workspace.pipes.stdout.write.as_raw_fd()); - libc::close(workspace.pipes.stderr.write.as_raw_fd()); - } + close_parent_pipe_ends(&workspace); - 
let child_ready_fd = workspace.pipes.sync.child_ready_fd(); - let mut pfd = libc::pollfd { - fd: child_ready_fd, - events: libc::POLLIN, - revents: 0, - }; - - if unsafe { libc::poll(&mut pfd, 1, 30000) } <= 0 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup( - "timeout waiting for child".into(), - )); - } - - let mut value: u64 = 0; - if unsafe { libc::read(child_ready_fd, (&mut value as *mut u64).cast(), 8) } != 8 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup("eventfd read failed".into())); - } - - setup_id_maps(child_pid).map_err(ExecutorError::IdMap)?; - - let parent_done_fd = workspace.pipes.sync.parent_done_fd(); - let signal_value: u64 = 1; - if unsafe { libc::write(parent_done_fd, (&signal_value as *const u64).cast(), 8) } != 8 { - unsafe { libc::kill(child_pid, libc::SIGKILL) }; - return Err(ExecutorError::ChildSetup("eventfd write failed".into())); - } + sync_with_child(&workspace, child_pid)?; if let Some(ref stdin_data) = plan.stdin { write_stdin(&workspace, stdin_data).map_err(ExecutorError::Monitor)?; @@ -711,100 +740,93 @@ fn blocking_parent( result } +/// Child process flow (runs after fork in the child). +/// +/// 1. Close parent pipe ends +/// 2. Setup stdio (dup2 stdin/stdout/stderr) +/// 3. chdir(workspace/work) +/// 4. Landlock v5 + rlimits + securebits + drop caps (lockdown) +/// 5. If `notify_mode` != Disabled: install notify filter, send listener fd +/// 6. Install kill seccomp filter (whitelist) +/// 7. Signal parent readiness +/// 8. Wait for parent signal +/// 9. `close_range(3, MAX, 0)` +/// 10. execve fn child_process( workspace: &Workspace, plan: &Plan, exec_info: &ExecutionInfo, + notify_socket: Option<&OwnedFd>, ) -> Result<(), ExecutorError> { + // 1. 
Close parent pipe ends unsafe { libc::close(workspace.pipes.stdin.write.as_raw_fd()); libc::close(workspace.pipes.stdout.read.as_raw_fd()); libc::close(workspace.pipes.stderr.read.as_raw_fd()); } - if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 { - return Err(ExecutorError::Unshare(last_errno())); + // 2. Setup stdio + setup_stdio(workspace)?; + + // 3. chdir to workspace/work + let work_dir = workspace.root().join("work"); + let work_cstr = CString::new(work_dir.to_string_lossy().as_bytes()) + .map_err(|_| ExecutorError::Exec(Errno::INVAL))?; + if unsafe { libc::chdir(work_cstr.as_ptr()) } != 0 { + return Err(ExecutorError::Exec(last_errno())); + } + + // 4. Apply lockdown (Landlock v5 + rlimits + securebits + drop caps) + let extra_paths: Vec<&str> = exec_info + .extra_mounts + .iter() + .filter_map(|m| m.source.to_str()) + .collect(); + lockdown(plan, workspace.root(), &extra_paths).map_err(ExecutorError::Lockdown)?; + + // 5. If notify mode != Disabled: install notify seccomp filter, send listener fd + if plan.notify_mode != NotifyMode::Disabled { + let notify_filter = build_notify_filter(NOTIFY_FS_SYSCALLS); + let fprog = SockFprog { + len: notify_filter.len() as u16, + filter: notify_filter.as_ptr(), + }; + let listener_fd = unsafe { seccomp_set_mode_filter_listener(&fprog) }.map_err(|e| { + ExecutorError::SeccompNotify(format!("failed to install notify filter: {e}")) + })?; + + // Send listener fd to parent via SCM_RIGHTS + if let Some(sock) = notify_socket { + scm_rights::send_fd(sock.as_raw_fd(), listener_fd.as_raw_fd()).map_err(|e| { + ExecutorError::SeccompNotify(format!("failed to send listener fd: {e}")) + })?; + } } + // 6. Install kill seccomp filter (whitelist) + apply_seccomp(plan)?; + + // 7. 
Signal parent readiness let child_ready_fd = workspace.pipes.sync.child_ready_fd(); let signal_value: u64 = 1; if unsafe { libc::write(child_ready_fd, (&signal_value as *const u64).cast(), 8) } != 8 { return Err(ExecutorError::ChildSetup("eventfd write failed".into())); } + // 8. Wait for parent signal let parent_done_fd = workspace.pipes.sync.parent_done_fd(); let mut value: u64 = 0; if unsafe { libc::read(parent_done_fd, (&mut value as *mut u64).cast(), 8) } != 8 { return Err(ExecutorError::ChildSetup("eventfd read failed".into())); } - if unsafe { libc::unshare(libc::CLONE_NEWNS | libc::CLONE_NEWUTS | libc::CLONE_NEWIPC) } != 0 { - return Err(ExecutorError::Unshare(last_errno())); - } - - setup_rootfs(workspace, plan, exec_info)?; - setup_stdio(workspace)?; - - let extra_paths: Vec<&str> = exec_info - .extra_mounts - .iter() - .filter_map(|m| m.target.to_str()) - .collect(); - lockdown(plan, None, &extra_paths).map_err(ExecutorError::Lockdown)?; - - let cwd = CString::new(plan.cwd.as_bytes()).map_err(|_| ExecutorError::Exec(Errno::INVAL))?; - if unsafe { libc::chdir(cwd.as_ptr()) } != 0 { - return Err(ExecutorError::Exec(last_errno())); - } + // 9. Close all fds except 0,1,2 + close_extra_fds(); + // 10. 
execve exec_command(plan, exec_info) } -fn setup_rootfs( - workspace: &Workspace, - plan: &Plan, - exec_info: &ExecutionInfo, -) -> Result<(), ExecutorError> { - let sandbox_root = workspace.root(); - - make_rprivate().map_err(ExecutorError::Rootfs)?; - - for mount in &exec_info.extra_mounts { - let target = sandbox_root.join(mount.target.strip_prefix("/").unwrap_or(&mount.target)); - if mount.source.exists() { - bind_mount(&mount.source, &target, !mount.writable).map_err(ExecutorError::Rootfs)?; - } - } - - for mount in &plan.mounts { - let target = sandbox_root.join(mount.target.strip_prefix("/").unwrap_or(&mount.target)); - if let Some(parent) = target.parent() { - std::fs::create_dir_all(parent).map_err(ExecutorError::Workspace)?; - } - std::fs::create_dir_all(&target).map_err(ExecutorError::Workspace)?; - if mount.source.exists() { - bind_mount(&mount.source, &target, !mount.writable).map_err(ExecutorError::Rootfs)?; - } - } - - mount_proc(&sandbox_root.join("proc")).map_err(ExecutorError::Rootfs)?; - mount_minimal_dev(&sandbox_root.join("dev")).map_err(ExecutorError::Rootfs)?; - - for file in &plan.user_files { - let target_path = if file.path.starts_with('/') { - file.path[1..].to_string() - } else { - format!("work/{}", file.path) - }; - workspace - .write_file(&target_path, &file.content, file.executable) - .map_err(ExecutorError::Workspace)?; - } - - set_hostname("sandbox").map_err(ExecutorError::Rootfs)?; - pivot_root_and_cleanup(sandbox_root).map_err(ExecutorError::Rootfs) -} - fn setup_stdio(workspace: &Workspace) -> Result<(), ExecutorError> { let stdin_fd = workspace.pipes.stdin.read.as_raw_fd(); let stdout_fd = workspace.pipes.stdout.write.as_raw_fd(); @@ -827,6 +849,33 @@ fn setup_stdio(workspace: &Workspace) -> Result<(), ExecutorError> { Ok(()) } +fn apply_seccomp(plan: &Plan) -> Result<(), ExecutorError> { + let whitelist: Vec = if let Some(ref syscalls) = plan.syscalls { + let mut wl: Vec = DEFAULT_WHITELIST + .iter() + .copied() + .filter(|s| 
!syscalls.denied.contains(s)) + .collect(); + for s in &syscalls.allowed { + if !wl.contains(s) { + wl.push(*s); + } + } + wl + } else { + DEFAULT_WHITELIST.to_vec() + }; + + let filter = build_whitelist_filter(&whitelist); + let fprog = SockFprog { + len: filter.len() as u16, + filter: filter.as_ptr(), + }; + unsafe { seccomp_set_mode_filter(&fprog) } + .map_err(|e| ExecutorError::Lockdown(LockdownError::Seccomp(e)))?; + Ok(()) +} + fn exec_command(plan: &Plan, exec_info: &ExecutionInfo) -> Result<(), ExecutorError> { let cmd_path = CString::new(exec_info.binary_path.to_string_lossy().as_bytes()) .map_err(|_| ExecutorError::Exec(Errno::INVAL))?; @@ -861,36 +910,6 @@ fn exec_command(plan: &Plan, exec_info: &ExecutionInfo) -> Result<(), ExecutorEr Err(ExecutorError::Exec(last_errno())) } -fn create_mount_dirs( - workspace: &Workspace, - exec_info: &ExecutionInfo, - plan: &Plan, -) -> Result<(), ExecutorError> { - for mount in &exec_info.extra_mounts { - create_mount_dir(workspace, &mount.target)?; - } - for mount in &plan.mounts { - create_mount_dir(workspace, &mount.target)?; - } - Ok(()) -} - -fn create_mount_dir(workspace: &Workspace, target: &Path) -> Result<(), ExecutorError> { - if let Some(parent) = target.parent() { - if parent != Path::new("/") { - let target_dir = workspace - .root() - .join(parent.strip_prefix("/").unwrap_or(parent)); - std::fs::create_dir_all(&target_dir).map_err(ExecutorError::Workspace)?; - } - } - let mount_point = workspace - .root() - .join(target.strip_prefix("/").unwrap_or(target)); - std::fs::create_dir_all(&mount_point).map_err(ExecutorError::Workspace)?; - Ok(()) -} - #[cfg(test)] mod tests { use super::*; diff --git a/crates/evalbox-sandbox/src/isolation/lockdown.rs b/crates/evalbox-sandbox/src/isolation/lockdown.rs index 227725a..3c0006e 100644 --- a/crates/evalbox-sandbox/src/isolation/lockdown.rs +++ b/crates/evalbox-sandbox/src/isolation/lockdown.rs @@ -1,41 +1,43 @@ //! Security lockdown for sandboxed processes. //! -//! 
Applies all security restrictions to the child process after `pivot_root`. +//! Applies all security restrictions to the child process. //! The order of operations is critical for security: //! -//! 1. **Landlock** - Filesystem and network access control (ABI 4+) -//! 2. **Seccomp** - Syscall whitelist filter (BPF) -//! 3. **Rlimits** - Resource limits (memory, CPU, files, processes) -//! 4. **Capabilities** - Drop all capabilities, set `NO_NEW_PRIVS` -//! 5. **Close FDs** - Close all file descriptors except stdin/stdout/stderr +//! 0. **`NO_NEW_PRIVS`** - Required before Landlock and seccomp +//! 1. **Landlock v5** - Filesystem, network, signal, and IPC access control +//! 2. **Rlimits** - Resource limits (memory, CPU, files, processes) +//! 3. **Securebits** - Lock capability state permanently +//! 4. **Capabilities** - Drop all capabilities +//! +//! Note: Seccomp filters and fd closing are handled separately in `child_process()` +//! because the notify filter must return a listener fd that gets sent to the parent. //! //! After lockdown, the process cannot: //! - Access files outside allowed paths -//! - Make network connections (if landlock ABI >= 4) -//! - Call restricted syscalls (ptrace, mount, reboot, etc.) +//! - Make network connections (if network blocked, requires Landlock ABI 4+) +//! - Send signals to processes outside the sandbox (Landlock ABI 5+) +//! - Connect to abstract unix sockets outside the sandbox (Landlock ABI 5+) //! - Exceed resource limits //! 
- Gain new privileges use std::ffi::CString; -use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd}; +use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; use std::os::unix::ffi::OsStrExt; use std::path::Path; use evalbox_sys::landlock::{ - self, LANDLOCK_ACCESS_FS_EXECUTE, LANDLOCK_ACCESS_FS_MAKE_DIR, LANDLOCK_ACCESS_FS_MAKE_REG, - LANDLOCK_ACCESS_FS_READ_DIR, LANDLOCK_ACCESS_FS_READ_FILE, LANDLOCK_ACCESS_FS_REMOVE_DIR, - LANDLOCK_ACCESS_FS_REMOVE_FILE, LANDLOCK_ACCESS_FS_TRUNCATE, LANDLOCK_ACCESS_FS_WRITE_FILE, - LandlockPathBeneathAttr, LandlockRulesetAttr, fs_access_for_abi, landlock_add_rule_path, - landlock_create_ruleset, landlock_restrict_self, net_access_for_abi, + self, LANDLOCK_ACCESS_FS_EXECUTE, LANDLOCK_ACCESS_FS_MAKE_DIR, LANDLOCK_ACCESS_FS_MAKE_FIFO, + LANDLOCK_ACCESS_FS_MAKE_REG, LANDLOCK_ACCESS_FS_MAKE_SYM, LANDLOCK_ACCESS_FS_READ_DIR, + LANDLOCK_ACCESS_FS_READ_FILE, LANDLOCK_ACCESS_FS_REMOVE_DIR, LANDLOCK_ACCESS_FS_REMOVE_FILE, + LANDLOCK_ACCESS_FS_TRUNCATE, LANDLOCK_ACCESS_FS_WRITE_FILE, LandlockPathBeneathAttr, + LandlockRulesetAttr, fs_access_for_abi, landlock_add_rule_path, landlock_create_ruleset, + landlock_restrict_self, net_access_for_abi, scope_for_abi, }; use evalbox_sys::last_errno; -use evalbox_sys::seccomp::{ - DEFAULT_WHITELIST, SockFprog, build_whitelist_filter, seccomp_set_mode_filter, -}; use rustix::io::Errno; use thiserror::Error; -use super::rootfs::apply_rlimits; +use super::rlimits::apply_rlimits; use crate::plan::Plan; /// Error during security lockdown. @@ -53,26 +55,34 @@ pub enum LockdownError { #[error("capability: {0}")] Capability(Errno), - #[error("close fds: {0}")] - CloseFds(Errno), + #[error("securebits: {0}")] + Securebits(Errno), } +/// Apply security lockdown to the current process. +/// +/// `workspace_root` is the real absolute path to the workspace directory +/// (no `pivot_root`, so we use real paths). 
+/// +/// `extra_readonly_paths` are additional paths that should be readable +/// (e.g., resolved binary mount paths). pub fn lockdown( plan: &Plan, - workspace_path: Option<&Path>, + workspace_root: &Path, extra_readonly_paths: &[&str], ) -> Result<(), LockdownError> { - apply_landlock(plan, workspace_path, extra_readonly_paths)?; - apply_seccomp()?; + // NO_NEW_PRIVS must be set before landlock_restrict_self and seccomp. + set_no_new_privs()?; + apply_landlock_v5(plan, workspace_root, extra_readonly_paths)?; apply_rlimits(plan).map_err(LockdownError::Rlimit)?; + apply_securebits()?; drop_all_caps()?; - close_extra_fds()?; Ok(()) } -fn apply_landlock( +fn apply_landlock_v5( plan: &Plan, - workspace_path: Option<&Path>, + workspace_root: &Path, extra_readonly_paths: &[&str], ) -> Result<(), LockdownError> { let abi = match landlock::landlock_abi_version() { @@ -80,16 +90,22 @@ fn apply_landlock( Err(_) => return Ok(()), // Landlock not available }; + if abi < 5 { + eprintln!("warning: landlock ABI {abi} < 5, signal/IPC scoping unavailable"); + } + let fs_access = fs_access_for_abi(abi); let net_access = if plan.network_blocked && abi >= 4 { net_access_for_abi(abi) } else { 0 }; + let scoped = scope_for_abi(abi); let attr = LandlockRulesetAttr { handled_access_fs: fs_access, handled_access_net: net_access, + scoped, }; let ruleset_fd = landlock_create_ruleset(&attr).map_err(LockdownError::Landlock)?; @@ -99,11 +115,13 @@ fn apply_landlock( | LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_MAKE_REG | LANDLOCK_ACCESS_FS_MAKE_DIR + | LANDLOCK_ACCESS_FS_MAKE_SYM + | LANDLOCK_ACCESS_FS_MAKE_FIFO | LANDLOCK_ACCESS_FS_REMOVE_FILE | LANDLOCK_ACCESS_FS_REMOVE_DIR | LANDLOCK_ACCESS_FS_TRUNCATE; - // Read-only paths from plan.mounts (pre-computed by evalbox, includes system paths) + // Read-only mounts from plan (system paths computed by evalbox or user-specified) for mount in &plan.mounts { if !mount.writable { let access = if mount.executable { @@ -111,32 +129,45 @@ fn 
apply_landlock( } else { read_access & !LANDLOCK_ACCESS_FS_EXECUTE }; - add_path_rule(&ruleset_fd, &mount.target, access); + add_path_rule(&ruleset_fd, &mount.source, access); } } + // Extra readonly paths (resolved binary mounts) for path in extra_readonly_paths { add_path_rule(&ruleset_fd, path, read_access); } - // Pre-pivot_root workspace path - if let Some(ws_path) = workspace_path { - add_path_rule(&ruleset_fd, ws_path, write_access); + // Writable workspace paths (real absolute paths, no pivot_root) + add_path_rule(&ruleset_fd, workspace_root.join("work"), write_access); + add_path_rule(&ruleset_fd, workspace_root.join("tmp"), write_access); + add_path_rule(&ruleset_fd, workspace_root.join("home"), write_access); + + // System paths (read-only with execute) + for path in ["/usr", "/bin", "/lib", "/lib64", "/etc"] { + add_path_rule(&ruleset_fd, path, read_access); } - // Writable paths - for path in ["/work", "/tmp", "/home"] { - add_path_rule(&ruleset_fd, path, write_access); + // NixOS store + if Path::new("/nix/store").exists() { + add_path_rule(&ruleset_fd, "/nix/store", read_access); + } + if Path::new("/run/current-system").exists() { + add_path_rule(&ruleset_fd, "/run/current-system", read_access); } // Proc (read-only) - add_path_rule(&ruleset_fd, "/proc", read_access); + add_path_rule( + &ruleset_fd, + "/proc", + read_access & !LANDLOCK_ACCESS_FS_EXECUTE, + ); // Dev (read + write for /dev/null etc.) add_path_rule( &ruleset_fd, "/dev", - read_access | LANDLOCK_ACCESS_FS_WRITE_FILE, + (read_access & !LANDLOCK_ACCESS_FS_EXECUTE) | LANDLOCK_ACCESS_FS_WRITE_FILE, ); landlock_restrict_self(&ruleset_fd).map_err(LockdownError::Landlock) @@ -144,9 +175,9 @@ fn apply_landlock( /// Add a path rule to the Landlock ruleset. /// -/// Errors are logged to stderr but not propagated - the path simply won't be -/// accessible in the sandbox. This is intentional: missing paths (like /nix/store -/// on non-NixOS) should not prevent sandbox creation. 
+/// Errors are logged but not propagated - the path simply won't be +/// accessible in the sandbox. Missing paths (like /nix/store on non-NixOS) +/// should not prevent sandbox creation. fn add_path_rule(ruleset_fd: &OwnedFd, path: impl AsRef, access: u64) { let path = path.as_ref(); let fd = match open_path(path) { @@ -159,7 +190,6 @@ fn add_path_rule(ruleset_fd: &OwnedFd, path: impl AsRef, access: u64) { parent_fd: fd.as_raw_fd(), }; if let Err(e) = landlock_add_rule_path(ruleset_fd, &rule) { - // Log but don't fail - path won't be accessible in sandbox eprintln!("warning: landlock rule for {path:?} failed: {e}"); } } @@ -175,13 +205,45 @@ fn open_path(path: impl AsRef) -> Result { } } -fn apply_seccomp() -> Result<(), LockdownError> { - let filter = build_whitelist_filter(DEFAULT_WHITELIST); - let fprog = SockFprog { - len: filter.len() as u16, - filter: filter.as_ptr(), - }; - unsafe { seccomp_set_mode_filter(&fprog) }.map_err(LockdownError::Seccomp) +// Securebits constants (from ) +const SECBIT_NOROOT: u64 = 1 << 0; +const SECBIT_NOROOT_LOCKED: u64 = 1 << 1; +const SECBIT_NO_SETUID_FIXUP: u64 = 1 << 2; +const SECBIT_NO_SETUID_FIXUP_LOCKED: u64 = 1 << 3; +const SECBIT_KEEP_CAPS_LOCKED: u64 = 1 << 5; +const SECBIT_NO_CAP_AMBIENT_RAISE: u64 = 1 << 6; +const SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED: u64 = 1 << 7; + +/// Apply securebits to lock capability state permanently. +/// +/// This prevents the process from ever regaining capabilities through +/// any mechanism (exec of setuid, ambient capabilities, etc.). +fn apply_securebits() -> Result<(), LockdownError> { + let bits = SECBIT_NOROOT + | SECBIT_NOROOT_LOCKED + | SECBIT_NO_SETUID_FIXUP + | SECBIT_NO_SETUID_FIXUP_LOCKED + | SECBIT_KEEP_CAPS_LOCKED + | SECBIT_NO_CAP_AMBIENT_RAISE + | SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED; + + let ret = unsafe { libc::prctl(libc::PR_SET_SECUREBITS, bits, 0, 0, 0) }; + if ret != 0 { + // Not fatal — securebits may require capabilities we don't have. 
+ // The important thing is NO_NEW_PRIVS + dropping all caps. + eprintln!("warning: PR_SET_SECUREBITS failed: {}", last_errno()); + } + Ok(()) +} + +/// Set `PR_SET_NO_NEW_PRIVS` — required before `landlock_restrict_self` and seccomp. +fn set_no_new_privs() -> Result<(), LockdownError> { + let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) }; + if ret != 0 { + Err(LockdownError::Capability(last_errno())) + } else { + Ok(()) + } } fn drop_all_caps() -> Result<(), LockdownError> { @@ -197,33 +259,18 @@ fn drop_all_caps() -> Result<(), LockdownError> { libc::prctl(libc::PR_CAPBSET_DROP, cap, 0, 0, 0); } } - - let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) }; - if ret != 0 { - Err(LockdownError::Capability(last_errno())) - } else { - Ok(()) - } + Ok(()) } -fn close_extra_fds() -> Result<(), LockdownError> { - let mut fds_to_close = Vec::new(); - - if let Ok(entries) = std::fs::read_dir("/proc/self/fd") { - for entry in entries.flatten() { - if let Ok(fd) = entry.file_name().to_string_lossy().parse::() { - if fd > 2 { - fds_to_close.push(fd); - } - } - } - } - - for fd in fds_to_close { - unsafe { libc::close(fd) }; +/// Close all file descriptors > 2 using `close_range` syscall. +/// +/// This is called separately from lockdown because it must happen after +/// seccomp filter installation and listener fd transfer. +pub fn close_extra_fds() { + // close_range(3, MAX, 0) — close all fds from 3 to MAX + unsafe { + libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, 0u32); } - - Ok(()) } #[cfg(test)] diff --git a/crates/evalbox-sandbox/src/isolation/mod.rs b/crates/evalbox-sandbox/src/isolation/mod.rs index 4b1c334..4e69f46 100644 --- a/crates/evalbox-sandbox/src/isolation/mod.rs +++ b/crates/evalbox-sandbox/src/isolation/mod.rs @@ -2,16 +2,10 @@ //! //! This module contains all the security isolation layers: //! -//! - **namespace** - User namespace and ID mapping setup -//! 
- **rootfs** - Filesystem setup (bind mounts, `pivot_root`, rlimits) -//! - **lockdown** - Security restrictions (Landlock, seccomp, capabilities) +//! - **lockdown** - Security restrictions (Landlock v5, seccomp, securebits, capabilities) +//! - **rlimits** - Resource limits (memory, CPU, files, processes) mod lockdown; -mod namespace; -mod rootfs; +pub mod rlimits; -pub use lockdown::{LockdownError, lockdown}; -pub use namespace::setup_id_maps; -pub use rootfs::{ - bind_mount, make_rprivate, mount_minimal_dev, mount_proc, pivot_root_and_cleanup, set_hostname, -}; +pub use lockdown::{LockdownError, close_extra_fds, lockdown}; diff --git a/crates/evalbox-sandbox/src/isolation/namespace.rs b/crates/evalbox-sandbox/src/isolation/namespace.rs deleted file mode 100644 index 23ad83b..0000000 --- a/crates/evalbox-sandbox/src/isolation/namespace.rs +++ /dev/null @@ -1,83 +0,0 @@ -//! User namespace and ID mapping setup. -//! -//! Sets up UID/GID mappings so the sandboxed process runs as root (UID 0) -//! inside the namespace, but maps to the real user outside. -//! -//! ## How It Works -//! -//! ```text -//! Outside namespace: uid=1000 (real user) -//! │ -//! ┌─────▼─────┐ -//! │ uid_map │ "0 1000 1" -//! └─────┬─────┘ -//! │ -//! Inside namespace: uid=0 (appears as root) -//! ``` -//! -//! ## Security -//! -//! - `deny_setgroups` must be called BEFORE writing `gid_map` (kernel requirement) -//! - The process appears as root inside but has no real privileges -//! - This enables `pivot_root` and mount operations inside the namespace - -use std::fs; -use std::io; - -/// Write UID mapping for a process in a user namespace. -/// -/// Maps `inside_uid` (seen inside namespace) to `outside_uid` (real UID). -/// The "1" at the end means we map exactly one UID. 
-pub fn write_uid_map(pid: libc::pid_t, inside_uid: u32, outside_uid: u32) -> io::Result<()> { - fs::write( - format!("/proc/{pid}/uid_map"), - format!("{inside_uid} {outside_uid} 1\n"), - ) -} - -/// Write GID mapping for a process in a user namespace. -/// -/// Maps `inside_gid` (seen inside namespace) to `outside_gid` (real GID). -pub fn write_gid_map(pid: libc::pid_t, inside_gid: u32, outside_gid: u32) -> io::Result<()> { - fs::write( - format!("/proc/{pid}/gid_map"), - format!("{inside_gid} {outside_gid} 1\n"), - ) -} - -/// Deny setgroups syscall for a process. -/// -/// # Safety Order -/// -/// MUST be called before `write_gid_map`. The kernel requires this to prevent -/// privilege escalation via group manipulation. -pub fn deny_setgroups(pid: libc::pid_t) -> io::Result<()> { - fs::write(format!("/proc/{pid}/setgroups"), "deny\n") -} - -/// Set up complete ID mappings for a child process. -/// -/// Maps UID 0 and GID 0 inside the namespace to the current user's -/// real UID/GID outside. This allows the sandboxed process to appear -/// as root while having no actual privileges. -pub fn setup_id_maps(child_pid: libc::pid_t) -> io::Result<()> { - // SAFETY: getuid/getgid are always safe to call - let uid = unsafe { libc::getuid() }; - let gid = unsafe { libc::getgid() }; - - // SAFETY: deny_setgroups MUST come before write_gid_map - deny_setgroups(child_pid)?; - write_uid_map(child_pid, 0, uid)?; - write_gid_map(child_pid, 0, gid) -} - -#[cfg(test)] -mod tests { - #[test] - fn current_uid_gid() { - // SAFETY: getuid/getgid are always safe - let uid = unsafe { libc::getuid() }; - let gid = unsafe { libc::getgid() }; - assert!(uid > 0 || gid > 0); - } -} diff --git a/crates/evalbox-sandbox/src/isolation/rlimits.rs b/crates/evalbox-sandbox/src/isolation/rlimits.rs new file mode 100644 index 0000000..67d97b8 --- /dev/null +++ b/crates/evalbox-sandbox/src/isolation/rlimits.rs @@ -0,0 +1,71 @@ +//! Resource limits for sandboxed processes. +//! +//! 
Sets kernel-enforced resource limits to prevent denial-of-service. +//! +//! ## Limits Applied +//! +//! | Limit | Purpose | Default | +//! |-------|---------|---------| +//! | `RLIMIT_DATA` | Memory usage | 256 MiB | +//! | `RLIMIT_CPU` | CPU time | timeout * 2 + 60s | +//! | `RLIMIT_FSIZE` | Output file size | 16 MiB | +//! | `RLIMIT_NOFILE` | Open file descriptors | 256 | +//! | `RLIMIT_NPROC` | Max processes | 64 | +//! | `RLIMIT_CORE` | Core dump size | 0 (disabled) | +//! | `RLIMIT_STACK` | Stack size | 8 MiB | +//! +//! ## Note on `RLIMIT_AS` +//! +//! We intentionally do NOT set `RLIMIT_AS` (virtual address space). +//! Modern runtimes like Go, Java, and V8 pre-allocate large virtual address +//! ranges but only commit small portions. `RLIMIT_AS` would break these +//! runtimes. `RLIMIT_DATA` limits actual memory and is more appropriate. + +use evalbox_sys::last_errno; +use rustix::io::Errno; + +use crate::plan::Plan; + +/// Apply resource limits based on the sandbox plan. +pub fn apply_rlimits(plan: &Plan) -> Result<(), Errno> { + let cpu_secs = plan.timeout.as_secs().saturating_mul(2).saturating_add(60); + + set_rlimit(libc::RLIMIT_DATA, plan.memory_limit)?; + set_rlimit(libc::RLIMIT_CPU, cpu_secs)?; + set_rlimit(libc::RLIMIT_FSIZE, plan.max_output)?; + set_rlimit(libc::RLIMIT_NOFILE, 256)?; + set_rlimit(libc::RLIMIT_NPROC, u64::from(plan.max_pids))?; + set_rlimit(libc::RLIMIT_CORE, 0)?; + set_rlimit(libc::RLIMIT_STACK, 8 * 1024 * 1024)?; + Ok(()) +} + +#[inline] +fn set_rlimit(resource: libc::__rlimit_resource_t, limit: u64) -> Result<(), Errno> { + let rlim = libc::rlimit { + rlim_cur: limit, + rlim_max: limit, + }; + // SAFETY: rlim is valid, resource is a valid constant. 
+ if unsafe { libc::setrlimit(resource, &rlim) } != 0 { + Err(last_errno()) + } else { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + #[test] + fn get_current_nofile() { + let mut rlim = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + assert_eq!( + unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) }, + 0 + ); + assert!(rlim.rlim_cur > 0); + } +} diff --git a/crates/evalbox-sandbox/src/isolation/rootfs.rs b/crates/evalbox-sandbox/src/isolation/rootfs.rs deleted file mode 100644 index 09892ba..0000000 --- a/crates/evalbox-sandbox/src/isolation/rootfs.rs +++ /dev/null @@ -1,267 +0,0 @@ -//! Rootfs setup and resource limits for sandboxed processes. -//! -//! This module handles: -//! - Bind mounts for the sandbox filesystem -//! - Pivot root to isolate the filesystem -//! - Resource limits (rlimits) -//! -//! ## Filesystem Layout (after `pivot_root`) -//! -//! ```text -//! / -//! ├── bin/ → bind mount from /bin (read-only) -//! ├── dev/ → bind mounts: null, zero, urandom, random + symlinks -//! ├── etc/ → bind mount from /etc (read-only) -//! ├── home/ → empty, writable -//! ├── lib/ → bind mount from /lib (read-only) -//! ├── lib64/ → bind mount from /lib64 if exists (read-only) -//! ├── nix/ → bind mount from /nix/store on NixOS (read-only) -//! ├── proc/ → bind mount from /proc (read-only) -//! ├── tmp/ → empty, writable -//! ├── usr/ → bind mount from /usr (read-only) -//! └── work/ → user code directory, writable -//! ``` - -use std::ffi::CString; -use std::os::unix::ffi::OsStrExt; -use std::path::Path; - -use evalbox_sys::last_errno; -use rustix::io::Errno; -use rustix::process::pivot_root; - -use crate::plan::Plan; - -/// Make all mounts private recursively. 
-pub fn make_rprivate() -> Result<(), Errno> { - let ret = unsafe { - libc::mount( - std::ptr::null(), - c"/".as_ptr(), - std::ptr::null(), - libc::MS_REC | libc::MS_PRIVATE, - std::ptr::null(), - ) - }; - if ret != 0 { Err(last_errno()) } else { Ok(()) } -} - -/// Mount proc filesystem (bind-mounted read-only from host). -pub fn mount_proc(target: &Path) -> Result<(), Errno> { - let target_c = path_to_cstring(target)?; - - let ret = unsafe { - libc::mount( - c"/proc".as_ptr(), - target_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND | libc::MS_REC, - std::ptr::null(), - ) - }; - if ret != 0 { - return Err(last_errno()); - } - - let ret = unsafe { - libc::mount( - std::ptr::null(), - target_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND - | libc::MS_REMOUNT - | libc::MS_RDONLY - | libc::MS_NOSUID - | libc::MS_NODEV - | libc::MS_NOEXEC, - std::ptr::null(), - ) - }; - if ret != 0 { Err(last_errno()) } else { Ok(()) } -} - -/// Create minimal /dev with null, zero, urandom (bind-mounted from host). 
-pub fn mount_minimal_dev(target: &Path) -> Result<(), Errno> { - for dev in ["null", "zero", "urandom", "random"] { - bind_mount_dev(target, dev)?; - } - - let fd_path = target.join("fd"); - let fd_c = path_to_cstring(&fd_path)?; - if unsafe { libc::symlink(c"/proc/self/fd".as_ptr(), fd_c.as_ptr()) } != 0 { - return Err(last_errno()); - } - - for (name, num) in [("stdin", 0), ("stdout", 1), ("stderr", 2)] { - let link_path = target.join(name); - let link_c = path_to_cstring(&link_path)?; - let target_str = CString::new(format!("/proc/self/fd/{num}")).map_err(|_| Errno::INVAL)?; - if unsafe { libc::symlink(target_str.as_ptr(), link_c.as_ptr()) } != 0 { - return Err(last_errno()); - } - } - - Ok(()) -} - -fn bind_mount_dev(target_dev: &Path, name: &str) -> Result<(), Errno> { - let source = Path::new("/dev").join(name); - let target = target_dev.join(name); - let target_c = path_to_cstring(&target)?; - let source_c = path_to_cstring(&source)?; - - let fd = unsafe { libc::open(target_c.as_ptr(), libc::O_CREAT | libc::O_WRONLY, 0o644) }; - if fd < 0 { - return Err(last_errno()); - } - unsafe { libc::close(fd) }; - - let ret = unsafe { - libc::mount( - source_c.as_ptr(), - target_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND, - std::ptr::null(), - ) - }; - if ret != 0 { Err(last_errno()) } else { Ok(()) } -} - -/// Bind mount a path. 
-pub fn bind_mount(source: &Path, target: &Path, readonly: bool) -> Result<(), Errno> { - let source_c = path_to_cstring(source)?; - let target_c = path_to_cstring(target)?; - - let ret = unsafe { - libc::mount( - source_c.as_ptr(), - target_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND | libc::MS_REC, - std::ptr::null(), - ) - }; - if ret != 0 { - return Err(last_errno()); - } - - if readonly { - let ret = unsafe { - libc::mount( - std::ptr::null(), - target_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND | libc::MS_REMOUNT | libc::MS_RDONLY, - std::ptr::null(), - ) - }; - if ret != 0 { - return Err(last_errno()); - } - } - - Ok(()) -} - -/// Perform `pivot_root` and clean up the old root. -pub fn pivot_root_and_cleanup(new_root: &Path) -> Result<(), Errno> { - let new_root_c = path_to_cstring(new_root)?; - - let ret = unsafe { - libc::mount( - new_root_c.as_ptr(), - new_root_c.as_ptr(), - std::ptr::null(), - libc::MS_BIND | libc::MS_REC, - std::ptr::null(), - ) - }; - if ret != 0 { - return Err(last_errno()); - } - - let old_root = new_root.join(".old_root"); - let old_root_c = path_to_cstring(&old_root)?; - unsafe { libc::mkdir(old_root_c.as_ptr(), 0o700) }; - - let new_root_cstr = CString::new(new_root_c.as_bytes()).map_err(|_| Errno::INVAL)?; - let old_root_cstr = CString::new(old_root_c.as_bytes()).map_err(|_| Errno::INVAL)?; - pivot_root(new_root_cstr.as_c_str(), old_root_cstr.as_c_str())?; - - unsafe { - libc::chdir(c"/".as_ptr()); - libc::umount2(c"/.old_root".as_ptr(), libc::MNT_DETACH); - libc::rmdir(c"/.old_root".as_ptr()); - } - - Ok(()) -} - -/// Set the hostname. -pub fn set_hostname(name: &str) -> Result<(), Errno> { - let ret = unsafe { libc::sethostname(name.as_ptr().cast::(), name.len()) }; - if ret != 0 { Err(last_errno()) } else { Ok(()) } -} - -#[inline] -fn path_to_cstring(path: &Path) -> Result { - CString::new(path.as_os_str().as_bytes()).map_err(|_| Errno::INVAL) -} - -/// Apply resource limits based on the sandbox plan. 
-pub fn apply_rlimits(plan: &Plan) -> Result<(), Errno> { - let cpu_secs = plan.timeout.as_secs().saturating_mul(2).saturating_add(60); - - // Note: We intentionally do NOT set RLIMIT_AS (address space). - // RLIMIT_AS limits virtual memory, which can be much larger than actual usage. - // Modern runtimes like Go, Java, and V8 pre-allocate large virtual address ranges - // but only commit (use) small portions. RLIMIT_AS would break these runtimes. - // RLIMIT_DATA limits the data segment and is more appropriate for real memory control. - set_rlimit(libc::RLIMIT_DATA, plan.memory_limit)?; - set_rlimit(libc::RLIMIT_CPU, cpu_secs)?; - set_rlimit(libc::RLIMIT_FSIZE, plan.max_output)?; - set_rlimit(libc::RLIMIT_NOFILE, 256)?; - set_rlimit(libc::RLIMIT_NPROC, u64::from(plan.max_pids))?; - set_rlimit(libc::RLIMIT_CORE, 0)?; - set_rlimit(libc::RLIMIT_STACK, 8 * 1024 * 1024)?; - Ok(()) -} - -#[inline] -fn set_rlimit(resource: libc::__rlimit_resource_t, limit: u64) -> Result<(), Errno> { - let rlim = libc::rlimit { - rlim_cur: limit, - rlim_max: limit, - }; - // SAFETY: rlim is valid, resource is a valid constant. - if unsafe { libc::setrlimit(resource, &rlim) } != 0 { - Err(last_errno()) - } else { - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn path_to_cstring_valid() { - let cstr = path_to_cstring(Path::new("/tmp/test")).unwrap(); - assert_eq!(cstr.as_bytes(), b"/tmp/test"); - } - - #[test] - fn get_current_nofile() { - let mut rlim = libc::rlimit { - rlim_cur: 0, - rlim_max: 0, - }; - assert_eq!( - unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) }, - 0 - ); - assert!(rlim.rlim_cur > 0); - } -} diff --git a/crates/evalbox-sandbox/src/lib.rs b/crates/evalbox-sandbox/src/lib.rs index bfd4bfb..006b14a 100644 --- a/crates/evalbox-sandbox/src/lib.rs +++ b/crates/evalbox-sandbox/src/lib.rs @@ -3,12 +3,13 @@ //! This crate provides secure sandboxed execution of untrusted code on Linux. //! 
It combines multiple isolation mechanisms for defense in depth: //! -//! - **User namespaces** - Unprivileged containers, UID 0 inside = real user outside -//! - **Mount namespaces** - Private filesystem view with minimal bind mounts -//! - **Pivot root** - Change root directory, unmount host filesystem -//! - **Landlock** - Filesystem and network access control (kernel 5.13+) +//! - **Landlock v5** - Filesystem, network, signal, and IPC access control //! - **Seccomp-BPF** - Syscall whitelist (~40 allowed syscalls) +//! - **Seccomp User Notify** - Optional syscall interception for FS virtualization //! - **Rlimits** - Resource limits (memory, CPU, files, processes) +//! - **Capabilities** - All capabilities dropped, `NO_NEW_PRIVS` enforced +//! +//! No user namespaces required — works inside Docker with default seccomp profile. //! //! ## Quick Start //! @@ -22,8 +23,7 @@ //! //! ## Requirements //! -//! - Linux kernel 5.13+ (for Landlock ABI 1+) -//! - User namespaces enabled (`/proc/sys/kernel/unprivileged_userns_clone = 1`) +//! - Linux kernel 6.12+ (for Landlock ABI 5) //! - Seccomp enabled in kernel #![allow(clippy::cast_possible_truncation)] @@ -32,6 +32,7 @@ pub mod executor; pub mod isolation; pub mod monitor; +pub mod notify; pub mod plan; pub mod resolve; pub mod sysinfo; @@ -40,10 +41,5 @@ pub mod workspace; pub use executor::{Event, Executor, ExecutorError, SandboxId}; pub use monitor::{Output, Status}; -pub use plan::{Landlock, Mount, Plan, Syscalls, UserFile}; +pub use plan::{Landlock, Mount, NotifyMode, Plan, Syscalls, UserFile}; pub use resolve::{ResolveError, ResolvedBinary, resolve_binary}; - -// Backwards compatibility -#[allow(deprecated)] -#[doc(hidden)] -pub use plan::SandboxPlan; diff --git a/crates/evalbox-sandbox/src/notify/mod.rs b/crates/evalbox-sandbox/src/notify/mod.rs new file mode 100644 index 0000000..d4912f2 --- /dev/null +++ b/crates/evalbox-sandbox/src/notify/mod.rs @@ -0,0 +1,17 @@ +//! Seccomp user notification support. +//! 
+//! This module provides the supervisor side of seccomp user notification, +//! enabling syscall interception without Linux user namespaces. +//! +//! ## Modules +//! +//! - **supervisor** - Main notification loop that handles intercepted syscalls +//! - **`virtual_fs`** - Path translation for filesystem virtualization +//! - **`scm_rights`** - Unix socket fd passing (child → parent listener fd transfer) + +pub mod scm_rights; +pub mod supervisor; +pub mod virtual_fs; + +pub use supervisor::{NotifyEvent, Supervisor}; +pub use virtual_fs::VirtualFs; diff --git a/crates/evalbox-sandbox/src/notify/scm_rights.rs b/crates/evalbox-sandbox/src/notify/scm_rights.rs new file mode 100644 index 0000000..3acf6c4 --- /dev/null +++ b/crates/evalbox-sandbox/src/notify/scm_rights.rs @@ -0,0 +1,158 @@ +//! Unix socket fd passing via `SCM_RIGHTS`. +//! +//! After the child installs its seccomp notify filter, it receives a listener fd. +//! This fd must be passed to the parent process so the parent can handle +//! notifications. We use `SCM_RIGHTS` over an `AF_UNIX` socketpair to transfer +//! the fd across the fork boundary. + +use std::io; +use std::os::fd::{FromRawFd, OwnedFd, RawFd}; + +/// Create an `AF_UNIX SOCK_STREAM` socketpair. +/// +/// Returns `(parent_sock, child_sock)`. After fork, parent closes `child_sock` +/// and child closes `parent_sock`. +pub fn create_socketpair() -> io::Result<(OwnedFd, OwnedFd)> { + let mut fds = [0i32; 2]; + let ret = unsafe { + libc::socketpair( + libc::AF_UNIX, + libc::SOCK_STREAM | libc::SOCK_CLOEXEC, + 0, + fds.as_mut_ptr(), + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + Ok(unsafe { (OwnedFd::from_raw_fd(fds[0]), OwnedFd::from_raw_fd(fds[1])) }) +} + +/// Send a file descriptor over a unix socket using `SCM_RIGHTS`. 
+pub fn send_fd(socket: RawFd, fd: RawFd) -> io::Result<()> { + let data = [0u8; 1]; + let iov = libc::iovec { + iov_base: data.as_ptr() as *mut libc::c_void, + iov_len: 1, + }; + + // cmsg buffer: header + one fd + let cmsg_space = unsafe { libc::CMSG_SPACE(size_of::() as u32) } as usize; + let mut cmsg_buf = vec![0u8; cmsg_space]; + + let mut msg: libc::msghdr = unsafe { std::mem::zeroed() }; + msg.msg_iov = &iov as *const _ as *mut _; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf.as_mut_ptr().cast(); + msg.msg_controllen = cmsg_space; + + // Fill control message + let cmsg = unsafe { libc::CMSG_FIRSTHDR(&msg) }; + if cmsg.is_null() { + return Err(io::Error::other("CMSG_FIRSTHDR null")); + } + unsafe { + (*cmsg).cmsg_level = libc::SOL_SOCKET; + (*cmsg).cmsg_type = libc::SCM_RIGHTS; + (*cmsg).cmsg_len = libc::CMSG_LEN(size_of::() as u32) as usize; + let data_ptr = libc::CMSG_DATA(cmsg); + std::ptr::copy_nonoverlapping( + (&fd as *const RawFd).cast::(), + data_ptr, + size_of::(), + ); + } + + let ret = unsafe { libc::sendmsg(socket, &msg, 0) }; + if ret < 0 { + Err(io::Error::last_os_error()) + } else { + Ok(()) + } +} + +/// Receive a file descriptor from a unix socket using `SCM_RIGHTS`. 
+pub fn recv_fd(socket: RawFd) -> io::Result { + let mut data = [0u8; 1]; + let mut iov = libc::iovec { + iov_base: data.as_mut_ptr().cast(), + iov_len: 1, + }; + + let cmsg_space = unsafe { libc::CMSG_SPACE(size_of::() as u32) } as usize; + let mut cmsg_buf = vec![0u8; cmsg_space]; + + let mut msg: libc::msghdr = unsafe { std::mem::zeroed() }; + msg.msg_iov = &mut iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf.as_mut_ptr().cast(); + msg.msg_controllen = cmsg_space; + + let ret = unsafe { libc::recvmsg(socket, &mut msg, 0) }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + + let cmsg = unsafe { libc::CMSG_FIRSTHDR(&msg) }; + if cmsg.is_null() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "no control message received", + )); + } + + unsafe { + if (*cmsg).cmsg_level != libc::SOL_SOCKET || (*cmsg).cmsg_type != libc::SCM_RIGHTS { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "unexpected control message type", + )); + } + let mut fd: RawFd = 0; + let data_ptr = libc::CMSG_DATA(cmsg); + std::ptr::copy_nonoverlapping( + data_ptr, + (&mut fd as *mut RawFd).cast::(), + size_of::(), + ); + Ok(OwnedFd::from_raw_fd(fd)) + } +} + +#[cfg(test)] +mod tests { + use std::os::fd::AsRawFd; + + use super::*; + + #[test] + fn socketpair_creation() { + let (a, b) = create_socketpair().unwrap(); + assert!(a.as_raw_fd() >= 0); + assert!(b.as_raw_fd() >= 0); + assert_ne!(a.as_raw_fd(), b.as_raw_fd()); + } + + #[test] + fn send_recv_fd() { + let (parent, child) = create_socketpair().unwrap(); + + // Create a pipe and send its read end + let mut pipe_fds = [0i32; 2]; + unsafe { libc::pipe(pipe_fds.as_mut_ptr()) }; + let pipe_read = pipe_fds[0]; + let pipe_write = pipe_fds[1]; + + send_fd(child.as_raw_fd(), pipe_read).unwrap(); + let received = recv_fd(parent.as_raw_fd()).unwrap(); + + // The received fd should be valid and different from the original + assert!(received.as_raw_fd() >= 0); + + // Clean up + unsafe { + 
libc::close(pipe_read); + libc::close(pipe_write); + } + } +} diff --git a/crates/evalbox-sandbox/src/notify/supervisor.rs b/crates/evalbox-sandbox/src/notify/supervisor.rs new file mode 100644 index 0000000..cf458d6 --- /dev/null +++ b/crates/evalbox-sandbox/src/notify/supervisor.rs @@ -0,0 +1,274 @@ +//! Seccomp notification supervisor. +//! +//! Runs in the parent process, handling intercepted syscalls from the sandboxed child. +//! The supervisor receives notifications via the seccomp listener fd and decides +//! how to respond based on the configured [`NotifyMode`]. +//! +//! ## Modes +//! +//! - **Monitor**: Log syscall and return `SECCOMP_USER_NOTIF_FLAG_CONTINUE` +//! - **Virtualize**: Translate filesystem paths via [`VirtualFs`], inject fds via `ADDFD` + +use std::fs::File; +use std::io::{self, Read, Seek, SeekFrom}; +use std::os::fd::{AsRawFd, OwnedFd, RawFd}; + +use evalbox_sys::seccomp_notify::{ + SECCOMP_ADDFD_FLAG_SEND, SECCOMP_USER_NOTIF_FLAG_CONTINUE, SeccompNotif, SeccompNotifAddfd, + SeccompNotifResp, notif_addfd, notif_id_valid, notif_recv, notif_send, +}; + +use super::virtual_fs::VirtualFs; +use crate::plan::NotifyMode; + +/// Events emitted by the supervisor for future user-facing notifications. +#[derive(Debug)] +pub enum NotifyEvent { + /// A syscall was intercepted and handled. + SyscallHandled { + /// PID of the process that made the syscall. + pid: u32, + /// Syscall number. + syscall_nr: i32, + /// Whether the syscall was allowed. + allowed: bool, + }, +} + +/// Seccomp notification supervisor. +pub struct Supervisor { + listener_fd: OwnedFd, + mode: NotifyMode, + vfs: VirtualFs, +} + +impl Supervisor { + /// Create a new supervisor. + pub fn new(listener_fd: OwnedFd, mode: NotifyMode, vfs: VirtualFs) -> Self { + Self { + listener_fd, + mode, + vfs, + } + } + + /// Get the raw fd for registering with poll/mio. + pub fn fd(&self) -> RawFd { + self.listener_fd.as_raw_fd() + } + + /// Handle a notification event. 
Call when the listener fd is readable. + /// + /// Returns `Some(NotifyEvent)` on success, `None` if the notification was + /// stale (child died or already handled). + pub fn handle_event(&self) -> io::Result> { + let mut notif = SeccompNotif::default(); + + if let Err(e) = notif_recv(self.listener_fd.as_raw_fd(), &mut notif) { + // ENOENT means the target process died before we could receive + if e == rustix::io::Errno::NOENT { + return Ok(None); + } + return Err(io::Error::from_raw_os_error(e.raw_os_error())); + } + + match self.mode { + NotifyMode::Disabled => { + debug_assert!( + false, + "supervisor received notification with NotifyMode::Disabled" + ); + self.respond_continue(¬if)?; + Ok(None) + } + NotifyMode::Monitor => self.handle_monitor(¬if), + NotifyMode::Virtualize => self.handle_virtualize(¬if), + } + } + + fn handle_monitor(&self, notif: &SeccompNotif) -> io::Result> { + let syscall_name = syscall_name(notif.data.nr); + eprintln!( + "[notify] pid={} syscall={}({}) args=[{:#x}, {:#x}, {:#x}]", + notif.pid, + syscall_name, + notif.data.nr, + notif.data.args[0], + notif.data.args[1], + notif.data.args[2], + ); + + self.respond_continue(notif)?; + + Ok(Some(NotifyEvent::SyscallHandled { + pid: notif.pid, + syscall_nr: notif.data.nr, + allowed: true, + })) + } + + fn handle_virtualize(&self, notif: &SeccompNotif) -> io::Result> { + let syscall_nr = notif.data.nr; + + // For openat-family syscalls, args[1] is the pathname pointer + // For open/creat, args[0] is the pathname pointer + let path_addr = if syscall_nr == libc::SYS_openat as i32 + || syscall_nr == libc::SYS_newfstatat as i32 + || syscall_nr == libc::SYS_faccessat as i32 + || syscall_nr == libc::SYS_faccessat2 as i32 + || syscall_nr == libc::SYS_readlinkat as i32 + { + notif.data.args[1] + } else { + notif.data.args[0] + }; + + // Read path from child's memory + let path = match self.read_child_string(notif.pid, path_addr) { + Ok(p) => p, + Err(_) => { + // Can't read memory, let syscall proceed 
+ self.respond_continue(notif)?; + return Ok(None); + } + }; + + // TOCTOU check: verify notification is still valid after reading memory + if notif_id_valid(self.listener_fd.as_raw_fd(), notif.id).is_err() { + return Ok(None); // Notification is stale + } + + // Try to translate path + if let Some(real_path) = self.vfs.translate(&path) { + // For openat: open the file ourselves and inject the fd + if syscall_nr == libc::SYS_openat as i32 + || syscall_nr == libc::SYS_open as i32 + || syscall_nr == libc::SYS_creat as i32 + { + let flags = if syscall_nr == libc::SYS_openat as i32 { + notif.data.args[2] as i32 + } else { + notif.data.args[1] as i32 + }; + + match self.open_and_inject(notif, &real_path, flags) { + Ok(()) => { + return Ok(Some(NotifyEvent::SyscallHandled { + pid: notif.pid, + syscall_nr, + allowed: true, + })); + } + Err(_) => { + // Fall through to continue + } + } + } + } + + // No translation or non-open syscall: let it proceed as-is + self.respond_continue(notif)?; + Ok(Some(NotifyEvent::SyscallHandled { + pid: notif.pid, + syscall_nr, + allowed: true, + })) + } + + fn respond_continue(&self, notif: &SeccompNotif) -> io::Result<()> { + let resp = SeccompNotifResp { + id: notif.id, + val: 0, + error: 0, + flags: SECCOMP_USER_NOTIF_FLAG_CONTINUE, + }; + notif_send(self.listener_fd.as_raw_fd(), &resp) + .map_err(|e| io::Error::from_raw_os_error(e.raw_os_error())) + } + + fn open_and_inject( + &self, + notif: &SeccompNotif, + real_path: &std::path::Path, + flags: i32, + ) -> io::Result<()> { + use std::ffi::CString; + use std::os::unix::ffi::OsStrExt; + + let path_c = CString::new(real_path.as_os_str().as_bytes()) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "invalid path"))?; + + // Open the file at the translated path + let fd = unsafe { libc::open(path_c.as_ptr(), flags & !libc::O_CLOEXEC, 0o666) }; + if fd < 0 { + return Err(io::Error::last_os_error()); + } + + // Inject the fd into the child and atomically respond + let addfd = 
SeccompNotifAddfd { + id: notif.id, + flags: SECCOMP_ADDFD_FLAG_SEND, + srcfd: fd as u32, + newfd: 0, + newfd_flags: 0, + }; + + let result = notif_addfd(self.listener_fd.as_raw_fd(), &addfd) + .map_err(|e| io::Error::from_raw_os_error(e.raw_os_error())); + + // Close our copy of the fd + unsafe { libc::close(fd) }; + + result.map(|_| ()) + } + + /// Read a null-terminated string from the child's memory via `/proc/pid/mem`. + fn read_child_string(&self, pid: u32, addr: u64) -> io::Result { + let mem_path = format!("/proc/{pid}/mem"); + let mut file = File::open(&mem_path)?; + file.seek(SeekFrom::Start(addr))?; + + let mut buf = vec![0u8; 4096]; + let n = file.read(&mut buf)?; + buf.truncate(n); + + // Find null terminator + if let Some(nul_pos) = buf.iter().position(|&b| b == 0) { + buf.truncate(nul_pos); + } + + String::from_utf8(buf) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid UTF-8 in path")) + } +} + +/// Map syscall number to name for logging. +fn syscall_name(nr: i32) -> &'static str { + match nr as i64 { + libc::SYS_openat => "openat", + libc::SYS_open => "open", + libc::SYS_creat => "creat", + libc::SYS_access => "access", + libc::SYS_faccessat => "faccessat", + libc::SYS_faccessat2 => "faccessat2", + libc::SYS_stat => "stat", + libc::SYS_lstat => "lstat", + libc::SYS_newfstatat => "newfstatat", + libc::SYS_statx => "statx", + libc::SYS_readlink => "readlink", + libc::SYS_readlinkat => "readlinkat", + _ => "unknown", + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn syscall_names() { + assert_eq!(syscall_name(libc::SYS_openat as i32), "openat"); + assert_eq!(syscall_name(libc::SYS_stat as i32), "stat"); + assert_eq!(syscall_name(9999), "unknown"); + } +} diff --git a/crates/evalbox-sandbox/src/notify/virtual_fs.rs b/crates/evalbox-sandbox/src/notify/virtual_fs.rs new file mode 100644 index 0000000..7c7df5b --- /dev/null +++ b/crates/evalbox-sandbox/src/notify/virtual_fs.rs @@ -0,0 +1,139 @@ +//! 
Virtual filesystem path translation. +//! +//! Maps paths from the child's perspective to real paths on the host. +//! Used by the supervisor in `Virtualize` mode to translate filesystem +//! syscalls to the correct workspace paths. +//! +//! ## Default Mappings +//! +//! | Child sees | Host path | +//! |-----------|-----------| +//! | `/work` | `{workspace}/work` | +//! | `/tmp` | `{workspace}/tmp` | +//! | `/home` | `{workspace}/home` | + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; + +/// Virtual filesystem with path translation. +#[derive(Debug, Clone)] +pub struct VirtualFs { + /// Maps virtual prefix → real prefix. + mappings: HashMap, +} + +impl VirtualFs { + /// Create a new `VirtualFs` with default mappings for the given workspace root. + pub fn new(workspace_root: &Path) -> Self { + let mut mappings = HashMap::new(); + mappings.insert(PathBuf::from("/work"), workspace_root.join("work")); + mappings.insert(PathBuf::from("/tmp"), workspace_root.join("tmp")); + mappings.insert(PathBuf::from("/home"), workspace_root.join("home")); + Self { mappings } + } + + /// Create an empty `VirtualFs` with no mappings. + pub fn empty() -> Self { + Self { + mappings: HashMap::new(), + } + } + + /// Add a path mapping. + pub fn add_mapping(&mut self, virtual_path: impl Into, real_path: impl Into) { + self.mappings.insert(virtual_path.into(), real_path.into()); + } + + /// Translate a path from child's view to host's view. + /// + /// Returns `Some(real_path)` if the path matches a mapping, + /// `None` if the path should be accessed as-is (passthrough). + pub fn translate(&self, path: &str) -> Option { + let path = Path::new(path); + for (virtual_prefix, real_prefix) in &self.mappings { + if let Ok(suffix) = path.strip_prefix(virtual_prefix) { + return Some(real_prefix.join(suffix)); + } + } + None + } + + /// Check if a path is within any allowed scope. + /// + /// In `Virtualize` mode, only paths within mappings or system paths are allowed. 
+ pub fn is_allowed(&self, path: &str) -> bool { + let path = Path::new(path); + + // Check virtual mappings + for virtual_prefix in self.mappings.keys() { + if path.starts_with(virtual_prefix) { + return true; + } + } + + // Allow common system paths (read-only, handled by Landlock) + let system_prefixes = ["/usr", "/bin", "/lib", "/lib64", "/etc", "/proc", "/dev"]; + for prefix in &system_prefixes { + if path.starts_with(prefix) { + return true; + } + } + + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn default_mappings() { + let vfs = VirtualFs::new(Path::new("/tmp/evalbox-abc123")); + + assert_eq!( + vfs.translate("/work/main.py"), + Some(PathBuf::from("/tmp/evalbox-abc123/work/main.py")) + ); + assert_eq!( + vfs.translate("/tmp/output.txt"), + Some(PathBuf::from("/tmp/evalbox-abc123/tmp/output.txt")) + ); + assert_eq!( + vfs.translate("/home/.bashrc"), + Some(PathBuf::from("/tmp/evalbox-abc123/home/.bashrc")) + ); + } + + #[test] + fn no_translation_for_system_paths() { + let vfs = VirtualFs::new(Path::new("/tmp/evalbox-abc123")); + assert_eq!(vfs.translate("/usr/bin/python3"), None); + assert_eq!(vfs.translate("/etc/passwd"), None); + } + + #[test] + fn is_allowed_checks() { + let vfs = VirtualFs::new(Path::new("/tmp/evalbox-abc123")); + + assert!(vfs.is_allowed("/work/test.py")); + assert!(vfs.is_allowed("/tmp/output")); + assert!(vfs.is_allowed("/usr/bin/python3")); + assert!(vfs.is_allowed("/etc/passwd")); + assert!(vfs.is_allowed("/proc/self/status")); + assert!(!vfs.is_allowed("/root/.ssh/id_rsa")); + assert!(!vfs.is_allowed("/var/log/syslog")); + } + + #[test] + fn custom_mapping() { + let mut vfs = VirtualFs::empty(); + vfs.add_mapping("/data", "/mnt/shared/data"); + + assert_eq!( + vfs.translate("/data/file.csv"), + Some(PathBuf::from("/mnt/shared/data/file.csv")) + ); + assert_eq!(vfs.translate("/work/test"), None); + } +} diff --git a/crates/evalbox-sandbox/src/plan.rs b/crates/evalbox-sandbox/src/plan.rs index 
31dc2a6..62a31ae 100644 --- a/crates/evalbox-sandbox/src/plan.rs +++ b/crates/evalbox-sandbox/src/plan.rs @@ -41,6 +41,22 @@ use std::collections::{HashMap, HashSet}; use std::path::PathBuf; use std::time::Duration; +/// Seccomp user notification mode. +/// +/// Controls how the supervisor handles intercepted syscalls from the sandboxed child. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum NotifyMode { + /// No seccomp notify filter installed. Zero overhead. Default. + #[default] + Disabled, + /// Supervisor logs syscalls and returns `SECCOMP_USER_NOTIF_FLAG_CONTINUE`. + /// Minimal overhead. For debugging/auditing. + Monitor, + /// Supervisor intercepts FS syscalls, translates paths via `VirtualFs`, + /// opens files at translated paths, injects fd via `SECCOMP_IOCTL_NOTIF_ADDFD`. + Virtualize, +} + /// Mount point configuration. /// /// This is the canonical Mount type used throughout evalbox. @@ -273,7 +289,7 @@ impl UserFile { #[derive(Debug, Clone)] pub struct Plan { pub cmd: Vec, - /// Pre-resolved binary path. If set, sandbox uses this instead of resolving `cmd\[0\]`. + /// Pre-resolved binary path. If set, sandbox uses this instead of resolving `cmd[0]`. /// This allows evalbox to do binary resolution before calling sandbox. pub binary_path: Option, pub env: HashMap, @@ -291,12 +307,10 @@ pub struct Plan { pub syscalls: Option, /// Custom Landlock configuration. pub landlock: Option, + /// Seccomp user notification mode. + pub notify_mode: NotifyMode, } -/// Type alias for backwards compatibility. -#[deprecated(since = "0.2.0", note = "Use `Plan` instead")] -pub type SandboxPlan = Plan; - impl Default for Plan { fn default() -> Self { Self { @@ -315,6 +329,7 @@ impl Default for Plan { network_blocked: true, syscalls: None, landlock: None, + notify_mode: NotifyMode::Disabled, } } } @@ -425,6 +440,16 @@ impl Plan { self } + /// Set the seccomp user notification mode. + /// + /// - `Disabled` (default): No notify filter, zero overhead. 
+ /// - `Monitor`: Log intercepted syscalls for debugging. + /// - `Virtualize`: Full filesystem virtualization via path translation. + pub fn notify_mode(mut self, mode: NotifyMode) -> Self { + self.notify_mode = mode; + self + } + /// Execute this plan (convenience method). /// /// Equivalent to `Executor::run(self)`. diff --git a/crates/evalbox-sandbox/src/resolve.rs b/crates/evalbox-sandbox/src/resolve.rs index 1a7b757..7fd934d 100644 --- a/crates/evalbox-sandbox/src/resolve.rs +++ b/crates/evalbox-sandbox/src/resolve.rs @@ -101,7 +101,8 @@ mod tests { let sys_paths = &*SYSTEM_PATHS; let mounts = detect_mounts(Path::new("/usr/bin/echo"), sys_paths); - if sys_paths.system_type == SystemType::Fhs { + // Only check for /usr mount if we're on an actual FHS system with /usr + if sys_paths.system_type == SystemType::Fhs && Path::new("/usr").exists() { assert!(mounts.iter().any(|m| m.source == Path::new("/usr"))); } } diff --git a/crates/evalbox-sandbox/src/workspace.rs b/crates/evalbox-sandbox/src/workspace.rs index 68b2001..dcf5d42 100644 --- a/crates/evalbox-sandbox/src/workspace.rs +++ b/crates/evalbox-sandbox/src/workspace.rs @@ -1,14 +1,14 @@ //! Workspace and pipe management for sandboxed execution. //! -//! The workspace is a temporary directory that becomes the sandbox root after `pivot_root`. -//! It contains all the pipes for parent-child communication. +//! The workspace is a temporary directory containing the sandbox's writable areas +//! and all the pipes for parent-child communication. //! //! ## Pipes //! //! - **stdin**: Parent writes → Child reads //! - **stdout**: Child writes → Parent reads //! - **stderr**: Child writes → Parent reads -//! - **sync**: Eventfd pair for parent-child synchronization (UID map setup) +//! - **sync**: Eventfd for parent-child synchronization //! //! ## Important: Pipe Hygiene //! @@ -59,6 +59,9 @@ impl Pipe { } /// Eventfd-based parent-child synchronization. 
+/// +/// Used when `NotifyMode::Disabled` — the child signals readiness via eventfd +/// after completing setup, and the parent writes back to let it proceed to exec. #[derive(Debug)] pub struct SyncPair { pub child_ready: OwnedFd, @@ -150,7 +153,6 @@ impl Workspace { fs::write(&full, content)?; if executable { - // Set executable permission (rwxr-xr-x) fs::set_permissions(&full, fs::Permissions::from_mode(0o755))?; } @@ -163,57 +165,14 @@ impl Workspace { Ok(full) } + /// Create standard sandbox directories. + /// + /// Only creates the writable workspace directories (work, tmp, home). + /// No rootfs directories (proc, dev, etc.) needed since we don't use `pivot_root`. pub fn setup_sandbox_dirs(&self) -> io::Result<()> { - for dir in [ - "proc", "dev", "tmp", "home", "work", "usr", "bin", "lib", "lib64", "etc", - ] { + for dir in ["work", "tmp", "home"] { self.create_dir(dir)?; } - self.setup_minimal_etc()?; - Ok(()) - } - - /// Create minimal /etc files to prevent information leakage. - /// - /// Instead of mounting the host's /etc (which contains sensitive info like - /// /etc/passwd, /etc/shadow), we create a minimal /etc with only essential files. 
- pub fn setup_minimal_etc(&self) -> io::Result<()> { - let etc = self.root.join("etc"); - - // Minimal /etc/passwd - just nobody user - fs::write( - etc.join("passwd"), - "nobody:x:65534:65534:Unprivileged user:/nonexistent:/usr/sbin/nologin\n", - )?; - - // Minimal /etc/group - just nobody group - fs::write(etc.join("group"), "nogroup:x:65534:\n")?; - - // Minimal /etc/hosts - localhost only - fs::write(etc.join("hosts"), "127.0.0.1 localhost\n::1 localhost\n")?; - - // Minimal /etc/nsswitch.conf - required for name resolution - fs::write( - etc.join("nsswitch.conf"), - "passwd: files\ngroup: files\nhosts: files dns\n", - )?; - - // Copy /etc/ld.so.cache from host if it exists (needed for dynamic linking) - let host_ldcache = Path::new("/etc/ld.so.cache"); - if host_ldcache.exists() { - if let Ok(content) = fs::read(host_ldcache) { - fs::write(etc.join("ld.so.cache"), content)?; - } - } - - // Create /etc/ssl directory for certificates - let ssl_dir = etc.join("ssl"); - fs::create_dir_all(&ssl_dir)?; - - // Minimal /etc/resolv.conf - empty (network is blocked by default) - // When network is enabled, Landlock will allow DNS - fs::write(etc.join("resolv.conf"), "# DNS disabled in sandbox\n")?; - Ok(()) } } @@ -252,4 +211,13 @@ mod tests { let perms = std::fs::metadata(&path).unwrap().permissions(); assert_eq!(perms.mode() & 0o777, 0o755); } + + #[test] + fn workspace_sandbox_dirs() { + let ws = Workspace::new().unwrap(); + ws.setup_sandbox_dirs().unwrap(); + assert!(ws.root().join("work").exists()); + assert!(ws.root().join("tmp").exists()); + assert!(ws.root().join("home").exists()); + } } diff --git a/crates/evalbox-sandbox/tests/common/mod.rs b/crates/evalbox-sandbox/tests/common/mod.rs index d863f1b..f125ca5 100644 --- a/crates/evalbox-sandbox/tests/common/mod.rs +++ b/crates/evalbox-sandbox/tests/common/mod.rs @@ -16,43 +16,40 @@ pub fn payload(name: &str) -> Vec { /// Find payload in cargo's build directory structure. 
fn find_payload(name: &str) -> Option { - // Get the workspace root by looking for Cargo.toml + // 1. Next to the test executable (Nix builds) + if let Ok(exe) = std::env::current_exe() { + if let Some(exe_dir) = exe.parent() { + let path = exe_dir.join("payloads").join(name); + if path.exists() { + return Some(path); + } + } + } + + // 2. Cargo build directory (development) let manifest_dir = std::env::var("CARGO_MANIFEST_DIR") .map(PathBuf::from) .unwrap_or_else(|_| PathBuf::from(".")); - // The workspace root is two levels up from crates/evalbox-sandbox let workspace_root = manifest_dir.parent()?.parent()?; - let target_dir = workspace_root.join("target"); - // Look in both debug and release builds - for profile in ["debug", "release"] { - let build_dir = target_dir.join(profile).join("build"); - if let Ok(entries) = std::fs::read_dir(&build_dir) { - for entry in entries.flatten() { - let dir_name = entry.file_name(); - if dir_name.to_string_lossy().starts_with("evalbox-sandbox-") { - let payload_path = entry.path().join("out").join("payloads").join(name); - if payload_path.exists() { - return Some(payload_path); - } - } - } - } - } + let target_dirs: Vec = std::iter::once(workspace_root.join("target")) + .chain(std::env::var("CARGO_TARGET_DIR").ok().map(PathBuf::from)) + .collect(); - // Also try CARGO_TARGET_DIR if set - if let Ok(target) = std::env::var("CARGO_TARGET_DIR") { - let target_dir = PathBuf::from(target); + for target_dir in target_dirs { for profile in ["debug", "release"] { let build_dir = target_dir.join(profile).join("build"); if let Ok(entries) = std::fs::read_dir(&build_dir) { for entry in entries.flatten() { - let dir_name = entry.file_name(); - if dir_name.to_string_lossy().starts_with("evalbox-sandbox-") { - let payload_path = entry.path().join("out").join("payloads").join(name); - if payload_path.exists() { - return Some(payload_path); + if entry + .file_name() + .to_string_lossy() + .starts_with("evalbox-sandbox-") + { + let path = 
entry.path().join("out").join("payloads").join(name); + if path.exists() { + return Some(path); } } } @@ -63,33 +60,6 @@ fn find_payload(name: &str) -> Option { None } -/// Check if we have permission to create user namespaces. -pub fn can_create_namespaces() -> bool { - // Check kernel parameter - if let Ok(content) = std::fs::read_to_string("/proc/sys/kernel/unprivileged_userns_clone") { - if content.trim() == "0" { - return false; - } - } - - // Try to actually create a namespace - let result = std::process::Command::new("unshare") - .args(["--user", "--map-root-user", "true"]) - .output(); - - result.map(|o| o.status.success()).unwrap_or(false) -} - -/// Skip test if namespaces aren't available. Call at start of test. -pub fn skip_if_no_namespaces() -> bool { - if !can_create_namespaces() { - eprintln!("Skipping: Cannot create user namespaces"); - true - } else { - false - } -} - /// SIGSYS signal number (seccomp violation). pub const SIGSYS: i32 = 31; diff --git a/crates/evalbox-sandbox/tests/security/cve.rs b/crates/evalbox-sandbox/tests/security/cve.rs index a46007e..933c0a9 100644 --- a/crates/evalbox-sandbox/tests/security/cve.rs +++ b/crates/evalbox-sandbox/tests/security/cve.rs @@ -7,7 +7,7 @@ use std::time::Duration; use evalbox_sandbox::{Executor, Plan}; -use crate::common::{SIGSYS, payload, skip_if_no_namespaces}; +use crate::common::{SIGSYS, payload}; // ============================================================================= // CVE-2024-1086: nf_tables Use-After-Free @@ -23,13 +23,10 @@ use crate::common::{SIGSYS, payload, skip_if_no_namespaces}; #[test] #[ignore] fn test_cve_2024_1086_nftables_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("cve_2024_1086_nftables")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -57,13 +54,10 @@ fn 
test_cve_2024_1086_nftables_blocked() { #[test] #[ignore] fn test_cve_2022_0185_fsconfig_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("cve_2022_0185_fsconfig")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -91,13 +85,10 @@ fn test_cve_2022_0185_fsconfig_blocked() { #[test] #[ignore] fn test_cve_2017_5226_tiocsti_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("cve_2017_5226_tiocsti")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -125,13 +116,10 @@ fn test_cve_2017_5226_tiocsti_blocked() { #[test] #[ignore] fn test_cve_2022_0492_cgroups_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("cve_2022_0492_cgroups")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -159,13 +147,10 @@ fn test_cve_2022_0492_cgroups_blocked() { #[test] #[ignore] fn test_fileless_memfd_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("fileless_memfd")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -190,13 +175,10 @@ fn test_fileless_memfd_blocked() { #[test] #[ignore] fn test_ioctl_tioclinux_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("ioctl_tioclinux")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ 
-218,13 +200,10 @@ fn test_ioctl_tioclinux_blocked() { #[test] #[ignore] fn test_ioctl_tiocsetd_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("ioctl_tiocsetd")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -250,13 +229,10 @@ fn test_ioctl_tiocsetd_blocked() { #[test] #[ignore] fn test_userns_creation_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("userns_escape")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -284,13 +260,10 @@ fn test_userns_creation_blocked() { #[test] #[ignore] fn test_ptrace_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("ptrace_escape")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -317,13 +290,10 @@ fn test_ptrace_blocked() { #[test] #[ignore] fn test_cve_2019_10063_ioctl_bypass_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("cve_2019_10063_ioctl_bypass")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); diff --git a/crates/evalbox-sandbox/tests/security/filesystem.rs b/crates/evalbox-sandbox/tests/security/filesystem.rs index 93ed59d..fe08618 100644 --- a/crates/evalbox-sandbox/tests/security/filesystem.rs +++ b/crates/evalbox-sandbox/tests/security/filesystem.rs @@ -1,23 +1,20 @@ //! Filesystem isolation tests. //! //! These tests verify that sandboxed processes cannot access -//! files outside their allowed mounts. +//! 
files outside their Landlock-allowed paths. +//! +//! Without `pivot_root`, the child process chdir's to `{workspace}/work`. +//! Landlock restricts filesystem access to only allowed paths. use std::time::Duration; use evalbox_sandbox::{Executor, Plan}; -use crate::common::skip_if_no_namespaces; - /// Test that /etc/shadow is not accessible. -/// This file contains password hashes and should never be readable. +/// Landlock only grants read access to /etc, and /etc/shadow requires root. #[test] #[ignore] fn test_cannot_read_etc_shadow() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run(Plan::new(["cat", "/etc/shadow"]).timeout(Duration::from_secs(5))) .expect("Executor should run"); @@ -31,13 +28,10 @@ fn test_cannot_read_etc_shadow() { } /// Test that /etc/passwd cannot be written to. +/// Landlock grants read-only access to /etc, so writes should be blocked. #[test] #[ignore] fn test_cannot_write_etc_passwd() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new(["sh", "-c", "echo 'hacked:x:0:0::/:/bin/sh' >> /etc/passwd"]) .timeout(Duration::from_secs(5)), @@ -48,143 +42,109 @@ fn test_cannot_write_etc_passwd() { } /// Test that /root is not accessible. +/// Landlock has no rule for /root, so access should be denied. #[test] #[ignore] fn test_cannot_access_root_home() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run(Plan::new(["ls", "/root"]).timeout(Duration::from_secs(5))) .expect("Executor should run"); assert!(!output.success(), "/root should not be accessible"); } -/// Test that the work directory is writable. +/// Test that the work directory (CWD) is writable. +/// The child chdir's to {workspace}/work, which Landlock grants read/write access to. 
#[test] #[ignore] fn test_work_dir_is_writable() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new([ "sh", "-c", - "echo 'test content' > /work/test.txt && cat /work/test.txt", + "echo 'test content' > ./test.txt && cat ./test.txt", ]) .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); - assert!(output.success(), "Should be able to write to /work"); + assert!( + output.success(), + "Should be able to write to CWD (work dir)" + ); assert_eq!(output.stdout_str().trim(), "test content"); } -/// Test that /tmp is writable. +/// Test that the workspace tmp directory is writable. +/// The workspace tmp dir is at ../tmp relative to CWD ({workspace}/work). +/// Landlock grants read/write access to the workspace root. #[test] #[ignore] fn test_tmp_is_writable() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new([ "sh", "-c", - "echo 'temp data' > /tmp/test.txt && cat /tmp/test.txt", + "echo 'temp data' > ../tmp/test.txt && cat ../tmp/test.txt", ]) .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); - assert!(output.success(), "Should be able to write to /tmp"); + assert!( + output.success(), + "Should be able to write to workspace tmp (../tmp)" + ); assert_eq!(output.stdout_str().trim(), "temp data"); } -/// Test that path traversal attempts are blocked. +/// Test that path traversal attempts are blocked by Landlock. /// -/// The sandbox creates a minimal /etc with only essential files (passwd, group, hosts). -/// Path traversal should only see the sandbox's minimal /etc, not the host's. +/// Without `pivot_root`, path traversal from CWD goes up the real filesystem. +/// Landlock should block access to paths outside the allowed set. 
#[test] #[ignore] fn test_path_traversal_blocked() { - if skip_if_no_namespaces() { - return; - } - - let output = Executor::run( - Plan::new(["cat", "/work/../../../etc/passwd"]).timeout(Duration::from_secs(5)), - ) - .expect("Executor should run"); + let output = + Executor::run(Plan::new(["cat", "../../../etc/shadow"]).timeout(Duration::from_secs(5))) + .expect("Executor should run"); - // The path resolves to /etc/passwd which is the sandbox's minimal passwd - if output.success() { - let content = output.stdout_str(); - - // Verify this is NOT the real host passwd - // Real passwd would have many entries (root, daemon, bin, etc.) - let line_count = content.lines().count(); - let has_nixbld = content.contains("nixbld"); // NixOS specific - let has_root = content.contains("root:"); - let has_real_users = content.contains("daemon:") || content.contains("bin:"); - - assert!( - !has_nixbld && !has_real_users && line_count <= 5, - "Path traversal should not leak real /etc/passwd.\n\ - Expected minimal sandbox passwd, got {line_count} lines:\n{content}" - ); - - // If there's root: it should be the sandbox's nobody-only passwd - if has_root { - panic!("Path traversal leaked real /etc/passwd with root entry:\n{content}"); - } - } + // Landlock should block access to /etc/shadow (no read on shadow, even via traversal) + assert!( + !output.success(), + "Path traversal to /etc/shadow should be blocked by Landlock" + ); } /// Test that symlink attacks are prevented. +/// Landlock controls access at the kernel level, so symlinks to restricted +/// paths should still be blocked. 
#[test] #[ignore] fn test_symlink_escape_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new([ - "sh", - "-c", - "ln -s /etc/shadow /work/shadow && cat /work/shadow", - ]) - .timeout(Duration::from_secs(5)), + Plan::new(["sh", "-c", "ln -s /etc/shadow ./shadow && cat ./shadow"]) + .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); - // Either symlink creation fails or reading it fails + // Either symlink creation fails or reading the target fails due to Landlock assert!(!output.success(), "Symlink escape should be blocked"); } -/// Test that /proc/self/exe cannot be used to escape. +/// Test that /proc/self/exe is safe. +/// Without `pivot_root`, /proc/self/exe reveals the real binary path on the host. +/// This is expected behavior -- we just verify the sandbox doesn't crash. #[test] #[ignore] fn test_proc_self_exe_safe() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run(Plan::new(["readlink", "/proc/self/exe"]).timeout(Duration::from_secs(5))) .expect("Executor should run"); - // Should not reveal host paths - if output.success() { - let exe_path = output.stdout_str(); - assert!( - !exe_path.contains("/home/") && !exe_path.contains("/usr/"), - "/proc/self/exe should not reveal host paths: {exe_path}" - ); - } + // Without pivot_root, /proc/self/exe will show the real host path. + // This is expected -- just verify the command runs without crashing. + assert!( + output.exit_code.is_some(), + "/proc/self/exe readlink should complete without crashing" + ); } diff --git a/crates/evalbox-sandbox/tests/security/network.rs b/crates/evalbox-sandbox/tests/security/network.rs index ef884cf..2603c6a 100644 --- a/crates/evalbox-sandbox/tests/security/network.rs +++ b/crates/evalbox-sandbox/tests/security/network.rs @@ -7,17 +7,11 @@ use std::time::Duration; use evalbox_sandbox::{Executor, Plan}; -use crate::common::skip_if_no_namespaces; - /// Test that network is blocked by default. 
/// curl should fail to connect to any external host. #[test] #[ignore] fn test_network_blocked_by_default() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new(["sh", "-c", "curl -s --connect-timeout 2 http://example.com || wget -q -O- --timeout=2 http://example.com"]) .timeout(Duration::from_secs(5)), @@ -31,29 +25,21 @@ fn test_network_blocked_by_default() { #[test] #[ignore] fn test_localhost_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new(["sh", "-c", "echo test | nc -w1 127.0.0.1 80 2>/dev/null"]) .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); - // Should fail - network namespace isolates us + // Should fail - seccomp blocks socket creation assert!(!output.success(), "Localhost should not be reachable"); } /// Test that external DNS resolution fails when network is blocked. -/// Note: /etc/hosts lookups may still work since the file exists in sandbox. +/// Note: /etc/hosts lookups may still work since the file exists on the host. 
#[test] #[ignore] fn test_external_dns_blocked() { - if skip_if_no_namespaces() { - return; - } - // Use a domain that definitely isn't in /etc/hosts let output = Executor::run( Plan::new([ @@ -83,10 +69,6 @@ fn test_external_dns_blocked() { #[test] #[ignore] fn test_network_flag_enabled() { - if skip_if_no_namespaces() { - return; - } - // Just verify that enabling network doesn't break sandbox execution let output = Executor::run( Plan::new(["sh", "-c", "echo 'network flag test'"]) @@ -109,10 +91,6 @@ fn test_network_flag_enabled() { #[test] #[ignore] fn test_loopback_isolated() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new([ "sh", @@ -123,14 +101,14 @@ fn test_loopback_isolated() { ) .expect("Executor should run"); - // The loopback might exist in the network namespace but be isolated - // This is more of a sanity check + // Without network namespaces, the host loopback is visible but + // seccomp blocks socket creation so it can't be used to connect. + // This is more of a sanity check that the command runs. if output.success() { - // If lo exists, verify it's the sandbox's own interface let stdout = output.stdout_str(); assert!( stdout.contains("lo") || stdout.contains("127.0.0.1"), - "Loopback should be visible if network namespace is active" + "Loopback should be visible" ); } } diff --git a/crates/evalbox-sandbox/tests/security/resources.rs b/crates/evalbox-sandbox/tests/security/resources.rs index 2075527..fa51dd0 100644 --- a/crates/evalbox-sandbox/tests/security/resources.rs +++ b/crates/evalbox-sandbox/tests/security/resources.rs @@ -7,16 +7,12 @@ use std::time::Duration; use evalbox_sandbox::{Executor, Plan, Status}; -use crate::common::{payload, skip_if_no_namespaces}; +use crate::common::payload; /// Test that timeout is enforced. 
#[test] #[ignore] fn test_timeout_enforced() { - if skip_if_no_namespaces() { - return; - } - let start = std::time::Instant::now(); let output = Executor::run(Plan::new(["sleep", "60"]).timeout(Duration::from_millis(500))) @@ -36,10 +32,6 @@ fn test_timeout_enforced() { #[test] #[ignore] fn test_infinite_loop_timeout() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new(["sh", "-c", "while true; do :; done"]).timeout(Duration::from_millis(500)), ) @@ -53,13 +45,10 @@ fn test_infinite_loop_timeout() { #[test] #[ignore] fn test_max_pids_enforced() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("fork_bomb")) + .binary_path("./payload") .max_pids(10) .timeout(Duration::from_secs(5)), ) @@ -80,10 +69,6 @@ fn test_max_pids_enforced() { #[test] #[ignore] fn test_output_limit_enforced() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( Plan::new(["sh", "-c", "yes | head -c 100000"]) // 100KB of 'y' .max_output(1024) // 1KB limit @@ -113,10 +98,6 @@ fn test_output_limit_enforced() { #[test] #[ignore] fn test_memory_limit_set() { - if skip_if_no_namespaces() { - return; - } - // Check that the memory rlimit is set correctly let output = Executor::run( Plan::new([ @@ -146,10 +127,6 @@ fn test_memory_limit_set() { #[test] #[ignore] fn test_fd_limit_set() { - if skip_if_no_namespaces() { - return; - } - // Check the fd limit using ulimit let output = Executor::run(Plan::new(["sh", "-c", "ulimit -n"]).timeout(Duration::from_secs(5))) @@ -173,10 +150,6 @@ fn test_fd_limit_set() { #[test] #[ignore] fn test_cpu_intensive_timeout() { - if skip_if_no_namespaces() { - return; - } - let start = std::time::Instant::now(); // CPU-intensive work that doesn't sleep diff --git a/crates/evalbox-sandbox/tests/security/seccomp.rs b/crates/evalbox-sandbox/tests/security/seccomp.rs index bbfdc00..937dd68 100644 --- 
a/crates/evalbox-sandbox/tests/security/seccomp.rs +++ b/crates/evalbox-sandbox/tests/security/seccomp.rs @@ -7,20 +7,17 @@ use std::time::Duration; use evalbox_sandbox::{Executor, Plan}; -use crate::common::{SIGSYS, payload, skip_if_no_namespaces}; +use crate::common::{SIGSYS, payload}; /// Test that a simple payload can execute successfully. /// This is a control test to verify the sandbox is working. #[test] #[ignore] fn test_payload_can_execute() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("success")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -39,13 +36,10 @@ fn test_payload_can_execute() { #[test] #[ignore] fn test_ptrace_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_ptrace")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -64,13 +58,10 @@ fn test_ptrace_blocked() { #[test] #[ignore] fn test_mount_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_mount")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -89,13 +80,10 @@ fn test_mount_blocked() { #[test] #[ignore] fn test_reboot_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_reboot")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -114,13 +102,10 @@ fn test_reboot_blocked() { #[test] #[ignore] fn test_clone_newuser_blocked() { - if skip_if_no_namespaces() { - return; - } - let 
output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_clone_ns")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -139,13 +124,10 @@ fn test_clone_newuser_blocked() { #[test] #[ignore] fn test_socket_netlink_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("socket_netlink")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -164,13 +146,10 @@ fn test_socket_netlink_blocked() { #[test] #[ignore] fn test_socket_raw_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("socket_raw")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -190,13 +169,10 @@ fn test_socket_raw_blocked() { #[test] #[ignore] fn test_keyctl_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_keyctl")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); @@ -216,13 +192,10 @@ fn test_keyctl_blocked() { #[test] #[ignore] fn test_bpf_blocked() { - if skip_if_no_namespaces() { - return; - } - let output = Executor::run( - Plan::new(["/work/payload"]) + Plan::new(["./payload"]) .executable("payload", payload("syscall_bpf")) + .binary_path("./payload") .timeout(Duration::from_secs(5)), ) .expect("Executor should run"); diff --git a/crates/evalbox-sys/src/check.rs b/crates/evalbox-sys/src/check.rs index f88e3c2..6076036 100644 --- a/crates/evalbox-sys/src/check.rs +++ b/crates/evalbox-sys/src/check.rs @@ -7,9 +7,8 @@ //! //! | Feature | Minimum | Check Method | //! 
|---------|---------|--------------| -//! | Kernel | 5.13 | `uname` syscall | -//! | Landlock | ABI 1 | `landlock_create_ruleset` with VERSION flag | -//! | User NS | enabled | `/proc/sys/kernel/unprivileged_userns_clone` or fork+unshare test | +//! | Kernel | 6.12 | `uname` syscall | +//! | Landlock | ABI 5 | `landlock_create_ruleset` with VERSION flag | //! | Seccomp | enabled | `prctl(PR_GET_SECCOMP)` | //! //! ## Usage @@ -20,13 +19,6 @@ //! Err(e) => eprintln!("System not supported: {}", e), //! } //! ``` -//! -//! ## User Namespaces -//! -//! User namespace support varies by distribution: -//! - **Debian/Ubuntu**: `/proc/sys/kernel/unprivileged_userns_clone` -//! - **NixOS/Fedora**: `/proc/sys/user/max_user_namespaces` -//! - **Fallback**: Fork + unshare test use std::sync::OnceLock; @@ -41,7 +33,6 @@ use crate::seccomp; pub struct SystemInfo { pub kernel_version: (u32, u32, u32), pub landlock_abi: u32, - pub user_ns_enabled: bool, pub seccomp_enabled: bool, } @@ -57,8 +48,8 @@ pub enum CheckError { #[error("landlock is not available")] LandlockNotAvailable, - #[error("user namespaces are disabled")] - UserNamespacesDisabled, + #[error("landlock ABI {found} is too old, need at least ABI {required}")] + LandlockAbiTooOld { required: u32, found: u32 }, #[error("seccomp is not available")] SeccompNotAvailable, @@ -67,8 +58,9 @@ pub enum CheckError { KernelVersionReadFailed, } -// Minimum kernel version: 5.13 (first with Landlock) -const MIN_KERNEL_VERSION: (u32, u32, u32) = (5, 13, 0); +// Minimum kernel version: 6.12 (Landlock ABI 5 with SCOPE_SIGNAL + SCOPE_ABSTRACT_UNIX_SOCKET) +const MIN_KERNEL_VERSION: (u32, u32, u32) = (6, 12, 0); +const MIN_LANDLOCK_ABI: u32 = 5; static SYSTEM_INFO: OnceLock> = OnceLock::new(); @@ -93,10 +85,11 @@ fn check_impl() -> Result { if landlock_abi == 0 { return Err(CheckError::LandlockNotAvailable); } - - let user_ns_enabled = check_user_namespaces(); - if !user_ns_enabled { - return Err(CheckError::UserNamespacesDisabled); + if 
landlock_abi < MIN_LANDLOCK_ABI { + return Err(CheckError::LandlockAbiTooOld { + required: MIN_LANDLOCK_ABI, + found: landlock_abi, + }); } let seccomp_enabled = seccomp::seccomp_available(); @@ -107,7 +100,6 @@ fn check_impl() -> Result { Ok(SystemInfo { kernel_version, landlock_abi, - user_ns_enabled, seccomp_enabled, }) } @@ -145,36 +137,6 @@ fn parse_kernel_version(release: &str) -> Result<(u32, u32, u32), CheckError> { Ok((major, minor, patch)) } -fn check_user_namespaces() -> bool { - // Check sysctl first (Debian/Ubuntu) - if let Ok(content) = std::fs::read_to_string("/proc/sys/kernel/unprivileged_userns_clone") { - return content.trim() == "1"; - } - - // Check max_user_namespaces (NixOS and others) - if let Ok(content) = std::fs::read_to_string("/proc/sys/user/max_user_namespaces") - && content.trim().parse::().unwrap_or(0) > 0 - { - return true; - } - - // Last resort: fork + unshare test (must fork to avoid polluting parent) - // SAFETY: fork/unshare/waitpid are safe when used correctly. Child exits immediately. 
- unsafe { - let pid = libc::fork(); - if pid < 0 { - return false; - } - if pid == 0 { - let ret = libc::unshare(libc::CLONE_NEWUSER); - libc::_exit(if ret == 0 { 0 } else { 1 }); - } - let mut status: i32 = 0; - libc::waitpid(pid, &mut status, 0); - libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0 - } -} - #[cfg(test)] mod tests { use super::*; @@ -187,6 +149,7 @@ mod tests { parse_kernel_version("5.4.0-150-generic").unwrap(), (5, 4, 0) ); + assert_eq!(parse_kernel_version("6.12.0").unwrap(), (6, 12, 0)); } #[test] @@ -195,7 +158,6 @@ mod tests { Ok(info) => { println!("Kernel version: {:?}", info.kernel_version); println!("Landlock ABI: {}", info.landlock_abi); - println!("User NS enabled: {}", info.user_ns_enabled); println!("Seccomp enabled: {}", info.seccomp_enabled); } Err(e) => { diff --git a/crates/evalbox-sys/src/landlock.rs b/crates/evalbox-sys/src/landlock.rs index 6a7599d..c862cd4 100644 --- a/crates/evalbox-sys/src/landlock.rs +++ b/crates/evalbox-sys/src/landlock.rs @@ -12,6 +12,7 @@ //! | 2 | 5.19 | `REFER` (cross-directory rename/link) | //! | 3 | 6.2 | `TRUNCATE` (file truncation) | //! | 4 | 6.7 | `IOCTL_DEV`, TCP network access | +//! | 5 | 6.12 | `SCOPE_SIGNAL`, `SCOPE_ABSTRACT_UNIX_SOCKET` | //! //! ## Usage //! @@ -75,11 +76,19 @@ pub const LANDLOCK_ACCESS_FS_IOCTL_DEV: u64 = 1 << 15; pub const LANDLOCK_ACCESS_NET_BIND_TCP: u64 = 1 << 0; pub const LANDLOCK_ACCESS_NET_CONNECT_TCP: u64 = 1 << 1; +// ABI v5 - Scoped restrictions +/// Block abstract unix socket connections outside the sandbox. +pub const LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET: u64 = 1 << 0; +/// Block signals to processes outside the sandbox. +pub const LANDLOCK_SCOPE_SIGNAL: u64 = 1 << 1; + #[repr(C)] #[derive(Debug, Default)] pub struct LandlockRulesetAttr { pub handled_access_fs: u64, pub handled_access_net: u64, + /// ABI 5+: Scoped restrictions (signal and abstract unix socket isolation). 
+ pub scoped: u64, } #[repr(C)] @@ -205,6 +214,18 @@ pub fn net_access_for_abi(abi: u32) -> u64 { } } +/// Returns the scoped restriction flags for the given ABI version. +/// +/// ABI 5+ supports signal isolation and abstract unix socket isolation, +/// replacing the need for PID and IPC namespaces. +pub fn scope_for_abi(abi: u32) -> u64 { + if abi >= 5 { + LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET | LANDLOCK_SCOPE_SIGNAL + } else { + 0 + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/evalbox-sys/src/lib.rs b/crates/evalbox-sys/src/lib.rs index 59a11d7..78a4688 100644 --- a/crates/evalbox-sys/src/lib.rs +++ b/crates/evalbox-sys/src/lib.rs @@ -5,8 +5,9 @@ //! //! ## Modules //! -//! - **landlock** - Landlock LSM for filesystem/network access control (kernel 5.13+) +//! - **landlock** - Landlock LSM for filesystem/network/scope access control (kernel 5.13+) //! - **seccomp** - Seccomp-BPF syscall filtering +//! - **`seccomp_notify`** - Seccomp user notification (`SECCOMP_RET_USER_NOTIF`) //! - **check** - Runtime system capability detection //! //! ## Landlock @@ -16,6 +17,7 @@ //! - ABI 2: File truncation (kernel 5.19) //! - ABI 3: File permission changes (kernel 6.2) //! - ABI 4: Network TCP access control (kernel 6.7) +//! - ABI 5: Scoped signals and abstract unix sockets (kernel 6.12) //! //! ## Seccomp-BPF //! @@ -23,6 +25,11 @@ //! a whitelist-based filter that allows ~40 safe syscalls and kills the process //! on any other syscall. //! +//! ## Seccomp User Notify +//! +//! Seccomp user notification allows a supervisor process to intercept syscalls +//! from a sandboxed child, enabling filesystem virtualization without namespaces. +//! //! # Safety //! //! This crate contains raw syscall wrappers. 
Casts between integer types @@ -34,6 +41,7 @@ pub mod check; pub mod landlock; pub mod seccomp; +pub mod seccomp_notify; pub use check::{CheckError, SystemInfo, check}; diff --git a/crates/evalbox-sys/src/seccomp.rs b/crates/evalbox-sys/src/seccomp.rs index b2b57d6..1fd50d4 100644 --- a/crates/evalbox-sys/src/seccomp.rs +++ b/crates/evalbox-sys/src/seccomp.rs @@ -47,14 +47,14 @@ //! - `memfd_create` + `execveat` - Enables fileless execution (bypass Landlock) //! - `setresuid`/`setresgid` - No reason to change UID in sandbox //! - `setsid`/`setpgid` - Session manipulation, unnecessary -//! - `ioctl` - Too powerful without argument filtering (TODO: whitelist specific codes) +//! - `ioctl` - Allowed with argument filtering (TIOCSTI, TIOCSETD, TIOCLINUX blocked) //! //! ## Security Notes //! //! - Filter is permanent - cannot be removed once applied //! - Requires `PR_SET_NO_NEW_PRIVS` first //! - Blocked syscall = immediate process termination (SIGSYS) -//! - `kill`/`tgkill` are safe because we use PID namespace (`CLONE_NEWPID`) +//! - `kill`/`tgkill` are safe due to Landlock v5 `SCOPE_SIGNAL` isolation //! - `prctl` allowed but `PR_SET_SECCOMP` has no effect (filter already applied) use rustix::io::Errno; @@ -64,6 +64,7 @@ use crate::last_errno; // Seccomp constants const SECCOMP_SET_MODE_FILTER: u32 = 1; const SECCOMP_RET_KILL_PROCESS: u32 = 0x80000000; +const SECCOMP_RET_USER_NOTIF: u32 = 0x7fc00000; const SECCOMP_RET_ALLOW: u32 = 0x7fff0000; // Return ENOSYS (38) to allow graceful fallback const SECCOMP_RET_ERRNO_ENOSYS: u32 = 0x00050000 | 38; @@ -172,7 +173,7 @@ pub struct SockFprog { /// - `setsid`/`setpgid` - Session manipulation unnecessary /// /// ## Notes: -/// - `kill`/`tgkill` safe due to PID namespace isolation +/// - `kill`/`tgkill` safe due to Landlock v5 `SCOPE_SIGNAL` isolation /// - `prctl` kept for runtime needs (`PR_SET_NAME`, etc.) 
pub const DEFAULT_WHITELIST: &[i64] = &[ // === Basic I/O === @@ -291,7 +292,7 @@ pub const DEFAULT_WHITELIST: &[i64] = &[ libc::SYS_fchdir, libc::SYS_readlink, libc::SYS_readlinkat, - // === Signals (safe due to PID namespace) === + // === Signals (safe due to Landlock SCOPE_SIGNAL) === libc::SYS_rt_sigaction, libc::SYS_rt_sigprocmask, libc::SYS_rt_sigreturn, @@ -299,9 +300,9 @@ pub const DEFAULT_WHITELIST: &[i64] = &[ libc::SYS_rt_sigpending, libc::SYS_rt_sigtimedwait, libc::SYS_sigaltstack, - libc::SYS_kill, // Safe: PID namespace isolates - libc::SYS_tgkill, // Safe: PID namespace isolates - libc::SYS_tkill, // Safe: PID namespace isolates + libc::SYS_kill, // Safe: Landlock SCOPE_SIGNAL isolates + libc::SYS_tgkill, // Safe: Landlock SCOPE_SIGNAL isolates + libc::SYS_tkill, // Safe: Landlock SCOPE_SIGNAL isolates // === Process control === libc::SYS_execve, // execveat REMOVED - with memfd_create enables fileless execution @@ -567,6 +568,81 @@ pub fn seccomp_available() -> bool { unsafe { libc::prctl(libc::PR_GET_SECCOMP, 0, 0, 0, 0) >= 0 } } +/// Builds a BPF filter that returns `SECCOMP_RET_USER_NOTIF` for the listed +/// syscalls and `SECCOMP_RET_ALLOW` for everything else. +/// +/// This filter is installed *before* the kill filter. The kernel evaluates all +/// stacked filters and returns the strictest verdict, so: +/// - Syscall in both ALLOW lists → ALLOW +/// - Syscall in NOTIFY + ALLOW → NOTIFY (supervisor decides) +/// - Syscall not in kill filter whitelist → KILL (regardless of notify filter) +/// +/// # Panics +/// +/// Panics if `syscalls.len()` > 200 (BPF jump offsets are u8). 
+pub fn build_notify_filter(syscalls: &[i64]) -> Vec { + assert!( + syscalls.len() <= MAX_WHITELIST_SIZE, + "notify syscall list too large: {} > {}", + syscalls.len(), + MAX_WHITELIST_SIZE + ); + + let n = syscalls.len(); + let mut filter = Vec::with_capacity(n + 8); + + // Architecture check + filter.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARCH)); + filter.push(SockFilter::jump( + BPF_JMP | BPF_JEQ | BPF_K, + AUDIT_ARCH_X86_64, + 1, + 0, + )); + filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)); + + // Load syscall number + filter.push(SockFilter::stmt( + BPF_LD | BPF_W | BPF_ABS, + OFFSET_SYSCALL_NR, + )); + + // Check each syscall → jump to NOTIFY + for (i, &nr) in syscalls.iter().enumerate() { + let notify_offset = (n - i) as u8; // jump to NOTIFY instruction + filter.push(SockFilter::jump( + BPF_JMP | BPF_JEQ | BPF_K, + nr as u32, + notify_offset, + 0, + )); + } + + // Default: ALLOW + filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)); + + // NOTIFY + filter.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF)); + + filter +} + +/// Syscalls that are intercepted by the notify filter for filesystem virtualization. 
+pub const NOTIFY_FS_SYSCALLS: &[i64] = &[ + libc::SYS_openat, + libc::SYS_open, + libc::SYS_creat, + libc::SYS_access, + libc::SYS_faccessat, + libc::SYS_faccessat2, + libc::SYS_stat, + libc::SYS_lstat, + libc::SYS_newfstatat, + libc::SYS_statx, + libc::SYS_readlink, + libc::SYS_readlinkat, +]; + #[cfg(test)] mod tests { use super::*; @@ -659,4 +735,20 @@ mod tests { let huge: Vec = (0..300).map(|i| i as i64).collect(); build_whitelist_filter(&huge); } + + #[test] + fn notify_filter_structure() { + let syscalls = &[libc::SYS_openat, libc::SYS_open, libc::SYS_stat]; + let filter = build_notify_filter(syscalls); + // 3 (arch) + 1 (load) + 3 (checks) + 1 (allow) + 1 (notify) = 9 + assert_eq!(filter.len(), 9); + } + + #[test] + fn notify_fs_syscalls_present() { + assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_openat)); + assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_open)); + assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_stat)); + assert!(NOTIFY_FS_SYSCALLS.contains(&libc::SYS_readlink)); + } } diff --git a/crates/evalbox-sys/src/seccomp_notify.rs b/crates/evalbox-sys/src/seccomp_notify.rs new file mode 100644 index 0000000..91a2357 --- /dev/null +++ b/crates/evalbox-sys/src/seccomp_notify.rs @@ -0,0 +1,246 @@ +//! Seccomp user notification (`SECCOMP_RET_USER_NOTIF`) support. +//! +//! Seccomp user notification allows a supervisor process to intercept +//! syscalls from a sandboxed child and make decisions on its behalf. +//! This enables filesystem virtualization without user namespaces. +//! +//! ## Architecture +//! +//! 1. Child installs a seccomp filter with `SECCOMP_FILTER_FLAG_NEW_LISTENER` +//! 2. This returns a "listener fd" which is passed to the parent via `SCM_RIGHTS` +//! 3. Parent polls the listener fd; when readable, calls `SECCOMP_IOCTL_NOTIF_RECV` +//! 4. Parent inspects the syscall and either: +//! - Returns `SECCOMP_USER_NOTIF_FLAG_CONTINUE` to let it proceed +//! - Returns an error code to deny it +//! 
- Uses `SECCOMP_IOCTL_NOTIF_ADDFD` to inject a file descriptor +//! +//! ## TOCTOU Protection +//! +//! Between receiving a notification and responding, the child's memory may change. +//! Always call `SECCOMP_IOCTL_NOTIF_ID_VALID` after reading child memory to verify +//! the notification is still valid. + +use std::os::fd::{FromRawFd, OwnedFd}; + +use rustix::io::Errno; + +use crate::last_errno; +use crate::seccomp::SockFprog; + +// Seccomp constants for notify +const SECCOMP_SET_MODE_FILTER: u32 = 1; +pub const SECCOMP_FILTER_FLAG_NEW_LISTENER: u32 = 1 << 3; + +/// Let the syscall proceed as-is (supervisor approves). +pub const SECCOMP_USER_NOTIF_FLAG_CONTINUE: u32 = 1; + +/// Atomically inject fd and respond to the notification. +pub const SECCOMP_ADDFD_FLAG_SEND: u32 = 1 << 0; +/// Replace an existing fd in the target process. +pub const SECCOMP_ADDFD_FLAG_SETFD: u32 = 1 << 1; + +// ioctl numbers for seccomp notify (from kernel headers) +// These are architecture-dependent; values below are for x86_64. +// SECCOMP_IOCTL_NOTIF_RECV = SECCOMP_IOWR(0, struct seccomp_notif) +// SECCOMP_IOCTL_NOTIF_SEND = SECCOMP_IOWR(1, struct seccomp_notif_resp) +// SECCOMP_IOCTL_NOTIF_ID_VALID = SECCOMP_IOW(2, __u64) +// SECCOMP_IOCTL_NOTIF_ADDFD = SECCOMP_IOW(3, struct seccomp_notif_addfd) + +/// ioctl to receive a notification from the seccomp listener fd. +pub const SECCOMP_IOCTL_NOTIF_RECV: u64 = 0xc0502100; +/// ioctl to send a response to a seccomp notification. +pub const SECCOMP_IOCTL_NOTIF_SEND: u64 = 0xc0182101; +/// ioctl to check if a notification ID is still valid (TOCTOU protection). +pub const SECCOMP_IOCTL_NOTIF_ID_VALID: u64 = 0x40082102; +/// ioctl to inject a file descriptor into the notifying process. +pub const SECCOMP_IOCTL_NOTIF_ADDFD: u64 = 0x40182103; + +/// Seccomp notification data (mirrors kernel `struct seccomp_data`). +#[repr(C)] +#[derive(Debug, Clone, Copy, Default)] +pub struct SeccompData { + /// Syscall number. 
+ pub nr: i32, + /// Architecture (`AUDIT_ARCH_*`). + pub arch: u32, + /// Instruction pointer at time of syscall. + pub instruction_pointer: u64, + /// Syscall arguments. + pub args: [u64; 6], +} + +/// Seccomp notification received from the child (mirrors kernel `struct seccomp_notif`). +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct SeccompNotif { + /// Unique notification ID. + pub id: u64, + /// PID of the notifying process (in supervisor's PID namespace). + pub pid: u32, + /// Flags (currently unused, must be 0). + pub flags: u32, + /// The syscall data. + pub data: SeccompData, +} + +impl Default for SeccompNotif { + fn default() -> Self { + // SAFETY: SeccompNotif is a plain C struct with no invariants. + unsafe { std::mem::zeroed() } + } +} + +/// Response to a seccomp notification (mirrors kernel `struct seccomp_notif_resp`). +#[repr(C)] +#[derive(Debug, Clone, Copy, Default)] +pub struct SeccompNotifResp { + /// Must match the notification ID. + pub id: u64, + /// Return value for the syscall. + pub val: i64, + /// Errno value (negated in kernel). + pub error: i32, + /// Flags (e.g., `SECCOMP_USER_NOTIF_FLAG_CONTINUE`). + pub flags: u32, +} + +/// Inject a file descriptor into the notifying process +/// (mirrors kernel `struct seccomp_notif_addfd`). +#[repr(C)] +#[derive(Debug, Clone, Copy, Default)] +pub struct SeccompNotifAddfd { + /// Must match the notification ID. + pub id: u64, + /// Flags (e.g., `SECCOMP_ADDFD_FLAG_SEND`). + pub flags: u32, + /// The fd in the supervisor to inject. + pub srcfd: u32, + /// The fd number to use in the target (0 = kernel picks). + pub newfd: u32, + /// Flags for the new fd (e.g., `O_CLOEXEC`). + pub newfd_flags: u32, +} + +/// Install a seccomp filter with `SECCOMP_FILTER_FLAG_NEW_LISTENER`. +/// +/// Returns the listener fd which can be used to receive notifications. +/// The caller must have already called `PR_SET_NO_NEW_PRIVS`. +/// +/// # Safety +/// +/// The filter must be a valid BPF program. 
This permanently restricts +/// syscalls for this thread. +/// +/// # Errors +/// +/// Returns `Errno` if the filter cannot be installed. +pub unsafe fn seccomp_set_mode_filter_listener(fprog: &SockFprog) -> Result { + unsafe { + let ret = libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if ret != 0 { + return Err(last_errno()); + } + + let ret = libc::syscall( + libc::SYS_seccomp, + SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_NEW_LISTENER, + fprog as *const _, + ); + if ret < 0 { + Err(last_errno()) + } else { + // SAFETY: On success, ret is a valid listener file descriptor. + Ok(OwnedFd::from_raw_fd(ret as i32)) + } + } +} + +/// Receive a notification from the seccomp listener fd. +/// +/// Blocks until a notification is available (or use poll/epoll first). +/// +/// # Errors +/// +/// Returns `Errno` on failure (e.g., `ENOENT` if the target died). +pub fn notif_recv(listener_fd: i32, notif: &mut SeccompNotif) -> Result<(), Errno> { + let ret = unsafe { + libc::ioctl( + listener_fd, + SECCOMP_IOCTL_NOTIF_RECV, + notif as *mut SeccompNotif, + ) + }; + if ret < 0 { Err(last_errno()) } else { Ok(()) } +} + +/// Send a response to a seccomp notification. +/// +/// # Errors +/// +/// Returns `Errno` on failure. +pub fn notif_send(listener_fd: i32, resp: &SeccompNotifResp) -> Result<(), Errno> { + let ret = unsafe { + libc::ioctl( + listener_fd, + SECCOMP_IOCTL_NOTIF_SEND, + resp as *const SeccompNotifResp, + ) + }; + if ret < 0 { Err(last_errno()) } else { Ok(()) } +} + +/// Check if a notification ID is still valid. +/// +/// Must be called after reading from child's `/proc/pid/mem` to protect +/// against TOCTOU attacks. +/// +/// # Errors +/// +/// Returns `Errno::NOENT` if the notification is no longer valid. 
+pub fn notif_id_valid(listener_fd: i32, id: u64) -> Result<(), Errno> { + let ret = unsafe { libc::ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id as *const u64) }; + if ret < 0 { Err(last_errno()) } else { Ok(()) } +} + +/// Inject a file descriptor into the notifying process. +/// +/// With `SECCOMP_ADDFD_FLAG_SEND`, this atomically injects the fd and +/// responds to the notification (the return value becomes the new fd number +/// in the target process). +/// +/// # Errors +/// +/// Returns `Errno` on failure. +pub fn notif_addfd(listener_fd: i32, addfd: &SeccompNotifAddfd) -> Result { + let ret = unsafe { + libc::ioctl( + listener_fd, + SECCOMP_IOCTL_NOTIF_ADDFD, + addfd as *const SeccompNotifAddfd, + ) + }; + if ret < 0 { Err(last_errno()) } else { Ok(ret) } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn struct_sizes() { + // Verify struct sizes match kernel expectations + assert_eq!(size_of::(), 64); + assert_eq!(size_of::(), 80); + assert_eq!(size_of::(), 24); + assert_eq!(size_of::(), 24); + } + + #[test] + fn default_notif_is_zeroed() { + let notif = SeccompNotif::default(); + assert_eq!(notif.id, 0); + assert_eq!(notif.pid, 0); + assert_eq!(notif.data.nr, 0); + } +} diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 9f22c66..adeca15 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -2,7 +2,7 @@ ## Overview -evalbox is a secure sandbox for executing untrusted code on Linux. It provides millisecond-startup isolation using Linux namespaces, Landlock LSM, and seccomp-BPF. +evalbox is a secure sandbox for executing untrusted code on Linux. It provides millisecond-startup isolation using Landlock LSM v5, seccomp-BPF, and rlimits — no namespaces, no containers, no root. ``` ┌─────────────────────────────────────────────────────────────────┐ @@ -29,23 +29,23 @@ evalbox is a secure sandbox for executing untrusted code on Linux. 
It provides m │ └──────────────────────────────────────────────────────────┘ │ │ ┌──────────────────────────────────────────────────────────┐ │ │ │ Isolation │ │ -│ │ • Namespaces (user, pid, net, mount, uts, ipc) │ │ -│ │ • pivot_root + minimal rootfs │ │ -│ │ • Landlock filesystem rules │ │ -│ │ • Seccomp syscall filter │ │ +│ │ • Landlock v5 (filesystem, network, signal, IPC) │ │ +│ │ • Seccomp-BPF (syscall whitelist) │ │ +│ │ • rlimits (memory, CPU, PIDs, fds) │ │ +│ │ • Privilege hardening (securebits, capability drop) │ │ │ └──────────────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────┐ │ evalbox-sys │ -│ Raw Linux syscalls: clone3, pidfd, seccomp, landlock │ +│ Raw Linux syscalls: seccomp, landlock, seccomp_notify │ └─────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────┐ │ Linux Kernel │ -│ namespaces │ seccomp-bpf │ landlock │ cgroups │ rlimits │ +│ seccomp-bpf │ landlock │ rlimits │ └─────────────────────────────────────────────────────────────────┘ ``` @@ -68,17 +68,18 @@ evalbox/ │ ├── workspace.rs # Temporary filesystem setup │ ├── monitor.rs # Process monitoring, output capture │ ├── isolation/ # Isolation mechanisms -│ │ ├── namespace.rs # User/PID/Net namespace setup -│ │ ├── rootfs.rs # Mount namespace, pivot_root -│ │ └── lockdown.rs # Landlock + seccomp application +│ │ ├── lockdown.rs # Landlock v5 + securebits + cap drop +│ │ └── rlimits.rs # Resource limits +│ ├── notify/ # Seccomp user notify (optional) │ ├── validate.rs # Input validation -│ └── sysinfo.rs # System detection (Nix, paths) +│ └── resolve.rs # Binary resolution │ └── evalbox-sys/ # Low-level syscalls └── src/ ├── seccomp.rs # BPF filter generation + ├── seccomp_notify.rs # Seccomp user notify support ├── landlock.rs # Landlock ruleset API - └── check.rs # 
Capability detection + └── check.rs # System capability detection ``` --- @@ -132,13 +133,6 @@ loop { } ``` -### Platform Behavior - -| Platform | Process Monitoring | I/O Multiplexing | -|----------|-------------------|------------------| -| Linux | pidfd + epoll | mio (epoll) | -| macOS | vsock to VM | mio (kqueue) | - --- ## Sandbox Lifecycle @@ -156,85 +150,93 @@ loop { ┌──────────────────────────────────────────────────────────────────┐ │ 2. WORKSPACE PREPARATION │ │ • Create tempdir (/tmp/evalbox-XXXXX) │ -│ • Setup directory structure (/work, /tmp, /etc) │ -│ • Write user files │ -│ • Create pipes (stdin, stdout, stderr) │ +│ • Create writable directories: /work, /tmp, /home │ +│ • Write user files to /work │ +│ • Create pipes (stdin, stdout, stderr) + eventfd sync │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ 3. CLONE WITH NAMESPACES │ -│ clone3(CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET | │ -│ CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC) │ +│ 3. FORK │ +│ fork() — plain fork, no CLONE_NEW* flags │ │ │ │ Parent Child │ │ │ │ │ -│ ├─ Write UID/GID maps ├─ Wait for parent │ -│ ├─ Signal ready ────────────────► │ -│ │ ├─ Setup isolation │ -│ │ │ (see step 4) │ -│ ▼ ▼ │ +│ ├─ Open pidfd ├─ Close parent pipe ends │ +│ ├─ Wait for child ready ├─ Setup stdio (dup2) │ +│ ├─ Signal to proceed ├─ chdir(workspace/work) │ +│ ▼ ├─ Apply lockdown (step 4)│ +│ ▼ │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ 4. CHILD ISOLATION SETUP │ +│ 4. 
CHILD LOCKDOWN (irreversible) │ │ │ │ ┌─────────────────────────────────────────────────────────┐ │ -│ │ a) Mount namespace │ │ -│ │ • Bind mount /usr, /lib, /lib64 (read-only) │ │ -│ │ • Bind mount workspace to /work │ │ -│ │ • Mount minimal /dev (null, zero, urandom) │ │ -│ │ • pivot_root to new root │ │ -│ │ • Unmount old root │ │ +│ │ a) NO_NEW_PRIVS │ │ +│ │ prctl(PR_SET_NO_NEW_PRIVS) — required before │ │ +│ │ Landlock and seccomp │ │ │ └─────────────────────────────────────────────────────────┘ │ │ ┌─────────────────────────────────────────────────────────┐ │ -│ │ b) Landlock (kernel 5.13+) │ │ -│ │ • Create ruleset with FS restrictions │ │ -│ │ • Allow read-only: /usr, /lib, /bin, /etc │ │ -│ │ • Allow read-write: /work, /tmp │ │ -│ │ • Enforce ruleset │ │ -│ │ (See SECURITY.md for details) │ │ +│ │ b) Landlock v5 │ │ +│ │ • Filesystem: read-only /usr, /lib, /etc, /bin │ │ +│ │ read-write workspace/work, /tmp, /home │ │ +│ │ • Network: block TCP bind + connect (ABI 4+) │ │ +│ │ • Signals: block cross-sandbox signals (ABI 5) │ │ +│ │ • IPC: block abstract unix sockets (ABI 5) │ │ │ └─────────────────────────────────────────────────────────┘ │ │ ┌─────────────────────────────────────────────────────────┐ │ -│ │ c) Seccomp BPF │ │ -│ │ • Load syscall whitelist filter │ │ -│ │ • Block dangerous syscalls (ptrace, mount, etc.) │ │ -│ │ • Filter clone() flags, socket() domains │ │ -│ │ • Filter dangerous ioctls (TIOCSTI, etc.) 
│ │ -│ │ (See SECURITY.md for full policy) │ │ +│ │ c) Resource limits (rlimits) │ │ +│ │ • RLIMIT_DATA: 256 MiB memory │ │ +│ │ • RLIMIT_CPU: timeout * 2 + 60s │ │ +│ │ • RLIMIT_NPROC: 64 processes │ │ +│ │ • RLIMIT_NOFILE: 256 file descriptors │ │ +│ │ • RLIMIT_FSIZE: 16 MiB output │ │ +│ │ • RLIMIT_CORE: 0 (disabled) │ │ +│ │ • RLIMIT_STACK: 8 MiB │ │ │ └─────────────────────────────────────────────────────────┘ │ │ ┌─────────────────────────────────────────────────────────┐ │ -│ │ d) Resource limits (rlimits) │ │ -│ │ • RLIMIT_AS: Memory limit │ │ -│ │ • RLIMIT_NPROC: Process limit │ │ -│ │ • RLIMIT_NOFILE: File descriptor limit │ │ -│ │ • RLIMIT_FSIZE: Output file size limit │ │ +│ │ d) Securebits + capability drop │ │ +│ │ • Lock NOROOT, NO_SETUID_FIXUP, KEEP_CAPS, │ │ +│ │ NO_CAP_AMBIENT_RAISE │ │ +│ │ • Drop all 64 capabilities │ │ │ └─────────────────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ 5. EXECVE TARGET PROGRAM │ -│ execve("/usr/bin/python", ["python", "-c", code], env) │ +│ 5. SECCOMP FILTERS │ +│ • [Optional] Install notify filter for FS syscall │ +│ interception, send listener fd to parent via SCM_RIGHTS │ +│ • Install kill filter — whitelist of ~100 safe syscalls │ +│ • Argument filtering: clone flags, socket domains, ioctls │ +│ • Violation = SECCOMP_RET_KILL_PROCESS (SIGSYS) │ +└──────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ 6. 
SIGNAL PARENT + WAIT + EXEC │ +│ • Signal parent readiness (eventfd) │ +│ • Wait for parent go-ahead (eventfd) │ +│ • close_range(3, MAX, 0) — close all fds except 0,1,2 │ +│ • execve(binary, args, env) │ │ │ -│ • All isolation is now permanent │ -│ • Seccomp filter cannot be removed │ -│ • Landlock rules cannot be relaxed │ +│ All isolation is now permanent and cannot be undone. │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ 6. PARENT MONITORS │ +│ 7. PARENT MONITORS │ │ • Poll pidfd for process exit │ │ • Read stdout/stderr via pipes │ │ • Enforce timeout (kill if exceeded) │ -│ • Track output size (truncate if exceeded) │ +│ • Track output size (kill if exceeded) │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ 7. CLEANUP │ +│ 8. CLEANUP │ │ • Collect exit status │ │ • Remove workspace tempdir │ │ • Return Output { stdout, stderr, exit_code, signal } │ @@ -245,7 +247,7 @@ loop { ## Security Architecture -evalbox implements **defense in depth** with 7 independent isolation layers: +evalbox implements **defense in depth** with independent isolation mechanisms: ``` ┌─────────────────────────────────────────────────────────────┐ @@ -254,42 +256,16 @@ evalbox implements **defense in depth** with 7 independent isolation layers: │ ▼ ┌─────────────────────────────────────────────────────────────┐ -│ Layer 1: User Namespace │ -│ • UID 0 inside = real user outside │ -│ • No capabilities in parent namespace │ -└─────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ Layer 2: PID Namespace │ -│ • Isolated process tree (PID 1 inside) │ -│ • Cannot see/signal host processes │ -└─────────────────────────────────────────────────────────────┘ - │ - ▼ 
-┌─────────────────────────────────────────────────────────────┐ -│ Layer 3: Network Namespace │ -│ • Empty by default (no interfaces) │ -│ • Cannot access host network │ +│ Landlock v5 │ +│ • Filesystem: read-only system paths, read-write workspace │ +│ • Network: block TCP bind + connect (ABI 4+) │ +│ • Signals: block cross-sandbox signals (ABI 5) │ +│ • IPC: block abstract unix sockets (ABI 5) │ └─────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────┐ -│ Layer 4: Mount Namespace + pivot_root │ -│ • Minimal rootfs (no /proc, /sys, /home) │ -│ • Host filesystem completely unmounted │ -└─────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ Layer 5: Landlock LSM │ -│ • Kernel-enforced filesystem rules │ -│ • Read-only binaries, read-write workspace only │ -└─────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ Layer 6: Seccomp BPF │ +│ Seccomp BPF │ │ • ~100 allowed syscalls (whitelist) │ │ • Blocks ptrace, mount, clone(NEWUSER), AF_NETLINK │ │ • SIGSYS on violation (immediate termination) │ @@ -297,12 +273,20 @@ evalbox implements **defense in depth** with 7 independent isolation layers: │ ▼ ┌─────────────────────────────────────────────────────────────┐ -│ Layer 7: Resource Limits │ +│ Resource Limits │ │ • Memory, CPU, processes, file descriptors │ │ • Prevents DoS attacks │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Privilege Hardening │ +│ • NO_NEW_PRIVS — cannot gain privileges via exec │ +│ • Securebits locked — cannot regain capabilities │ +│ • All 64 capabilities dropped │ └─────────────────────────────────────────────────────────────┘ -For detailed security policy and threat model, see SECURITY.md +For detailed security 
policy and threat model, see SECURITY_MODEL.md ``` --- @@ -344,7 +328,7 @@ BPF Program Flow: KILL ALLOW KILL ALLOW KILL ALLOW ALLOW KILL ``` -For the complete syscall policy, see [SECURITY.md](SECURITY.md#syscall-policy). +For the complete syscall policy, see [SECURITY_MODEL.md](SECURITY_MODEL.md#syscall-policy). --- @@ -352,31 +336,15 @@ For the complete syscall policy, see [SECURITY.md](SECURITY.md#syscall-policy). ``` /tmp/evalbox-XXXXX/ Workspace root (tmpdir) -├── root/ New root filesystem -│ ├── work/ User workspace (read-write) -│ │ ├── script.py User files -│ │ └── data.json -│ ├── tmp/ Temporary files (read-write) -│ ├── etc/ Minimal config -│ │ ├── passwd nobody user -│ │ ├── group nogroup -│ │ ├── hosts localhost -│ │ └── resolv.conf DNS (if network enabled) -│ ├── dev/ Minimal devices -│ │ ├── null -│ │ ├── zero -│ │ ├── urandom -│ │ └── fd → /proc/self/fd -│ ├── usr/ ──────────────── Bind mount (read-only) -│ ├── lib/ ──────────────── Bind mount (read-only) -│ ├── lib64/ ────────────── Bind mount (read-only) -│ └── bin/ ──────────────── Symlink to /usr/bin -│ -├── stdin Input pipe -├── stdout Output pipe -└── stderr Error pipe +├── work/ User workspace (read-write via Landlock) +│ ├── script.py User files +│ └── data.json +├── tmp/ Temporary files (read-write via Landlock) +└── home/ Home directory (read-write via Landlock) ``` +The workspace is a plain tempdir. No `pivot_root`, no bind mounts, no special rootfs. Landlock rules control which real filesystem paths are accessible. + --- ## Design Principles @@ -384,26 +352,26 @@ For the complete syscall policy, see [SECURITY.md](SECURITY.md#syscall-policy). ### 1. Simple as eval() ```rust // One function call to run code safely -let output = python::run("print('hello')", &config)?; +let output = python::run("print('hello')").exec()?; ``` ### 2. Defense in Depth -Every isolation mechanism works independently. A bypass of one layer doesn't compromise the sandbox. 
See [SECURITY.md](SECURITY.md#defense-in-depth). +Each isolation mechanism works independently. Landlock controls filesystem and network access, seccomp blocks dangerous syscalls, rlimits prevent resource exhaustion. See [SECURITY_MODEL.md](SECURITY_MODEL.md#defense-in-depth). ### 3. Unprivileged - No root required - No daemon/service -- Uses user namespaces +- No namespaces needed — Landlock + seccomp work unprivileged with `NO_NEW_PRIVS` ### 4. Minimal Attack Surface - Small syscall whitelist (~100 syscalls) -- Minimal filesystem -- No /proc, /sys by default +- Landlock restricts filesystem to minimal paths +- All capabilities dropped ### 5. Fast - ~5ms sandbox creation -- No VM boot -- No container image pull +- No VM boot, no container image pull +- Plain `fork()` + lockdown ### 6. Embeddable - Library, not a service @@ -414,12 +382,11 @@ Every isolation mechanism works independently. A bypass of one layer doesn't com ## System Requirements -| Requirement | Minimum | Recommended | -|-------------|---------|-------------| -| Linux Kernel | 5.13 | 6.1+ | -| User Namespaces | Required | - | -| Landlock | Required (ABI 1) | ABI 4 | -| Seccomp | Required | - | +| Requirement | Minimum | +|-------------|---------| +| Linux Kernel | 6.12 | +| Landlock | ABI 5 | +| Seccomp | Required | Check compatibility: ```bash @@ -430,8 +397,7 @@ evalbox check ## References -- [SECURITY.md](SECURITY.md) - Detailed security model and threat analysis -- [ROADMAP.md](ROADMAP.md) - Planned features -- [Linux namespaces](https://man7.org/linux/man-pages/man7/namespaces.7.html) +- [Security Model](SECURITY_MODEL.md) - Detailed security model and threat analysis +- [Roadmap](ROADMAP.md) - Planned features - [Landlock LSM](https://docs.kernel.org/userspace-api/landlock.html) - [Seccomp BPF](https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index fb8fba7..32e49f2 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -1,5 
+1,32 @@ # Roadmap +## Security Hardening + +### Block UDP exfiltration via seccomp + +Landlock only controls TCP (`LANDLOCK_ACCESS_NET_{BIND,CONNECT}_TCP`). A sandboxed process can create a `SOCK_DGRAM` socket and `sendto()` data to any IP without Landlock blocking it. + +**Fix:** Block `SOCK_DGRAM` in the seccomp socket filter when `plan.network_blocked`. DNS inside the sandbox already doesn't work, so this breaks nothing. + +**Tracking:** Landlock ABI v8 RFC patches (Dec 2025) propose `LANDLOCK_ACCESS_NET_{BIND,CONNECT,SENDTO}_UDP`. Once merged, seccomp filtering can be relaxed. + +### Restrict /proc access + +`/proc` is currently Landlock read-only. Landlock's ptrace scoping already blocks access to `environ`, `maps`, `fd/` of processes outside the sandbox domain. However, `/proc/[pid]/cmdline` is world-readable (`0444`, no ptrace check) — any host process's command line is visible. + +**Options:** +- Remove `/proc` from Landlock entirely (breaks programs that read `/proc/self/`) +- Accept `cmdline` leak as residual risk (low impact for single-user) +- Wait for kernel support: `hidepid=` requires mount namespace, Landlock can't target `/proc/self` (magic symlink resolves to fixed inode at `open()` time) + +### PID namespace (optional) + +Without PID namespace the sandbox can enumerate host PIDs via `/proc`. Combined with `cmdline` being world-readable, this is an information leak. Adding `CLONE_NEWPID` back would fully isolate the process tree, but requires re-introducing namespace setup code. + +**Trade-off:** Adds ~0.5ms and complexity. Not needed for single-user code execution, useful for multi-tenant deployments. + +--- + ## Supervised Execution Mode Intercept syscalls before execution for AI CLI tools and interactive approval. 
diff --git a/docs/SECURITY.md b/docs/SECURITY.md deleted file mode 100644 index e3ebdf2..0000000 --- a/docs/SECURITY.md +++ /dev/null @@ -1,354 +0,0 @@ -# evalbox Security Model - -## Defense in Depth - -evalbox uses **7 independent isolation layers**. Each layer provides protection even if another layer is bypassed. - -``` -┌─────────────────────────────────────────────────────────────┐ -│ Layer 1 │ User Namespaces │ Identity │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 2 │ PID Namespace │ Process │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 3 │ Network Namespace │ Network │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 4 │ Mount Namespace + pivot_root │ Filesystem │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 5 │ Landlock LSM │ FS Rules │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 6 │ Seccomp BPF │ Syscalls │ -├───────────┼──────────────────────────────────┼──────────────┤ -│ Layer 7 │ rlimits │ Resources │ -└───────────┴──────────────────────────────────┴──────────────┘ -``` - ---- - -## Isolation Layers - -### Layer 1: User Namespaces - -User namespaces provide identity isolation. - -| Inside Sandbox | Outside Sandbox | -|----------------|-----------------| -| UID 0 (root) | Real user UID | -| GID 0 (root) | Real user GID | -| Full capabilities | No capabilities | - -**Security properties:** -- Cannot access host user's files (different UID) -- Capabilities only valid inside namespace -- Cannot escalate to real root - -### Layer 2: PID Namespace - -Process isolation prevents interference with host processes. - -``` -Host PID Namespace Sandbox PID Namespace -┌───────────────────┐ ┌───────────────────┐ -│ PID 1 (init) │ │ PID 1 (sandbox) │ -│ PID 1234 (shell) │ │ PID 2 (python) │ -│ PID 5678 (...) 
│ │ PID 3 (child) │ -└───────────────────┘ └───────────────────┘ - │ │ - │ ✗ Cannot see │ - │◄─────────────────────────┤ - │ ✗ Cannot signal │ -``` - -**Security properties:** -- Sandbox sees only its own processes -- Cannot enumerate host processes -- Cannot send signals to host processes -- kill() safe inside namespace - -### Layer 3: Network Namespace - -Network isolation blocks all network access by default. - -``` -┌─────────────────────────────────────────┐ -│ Host Network │ -│ eth0: 192.168.1.100 │ -│ lo: 127.0.0.1 │ -│ docker0: 172.17.0.1 │ -└─────────────────────────────────────────┘ - ✗ No access -┌─────────────────────────────────────────┐ -│ Sandbox Network │ -│ (empty - no interfaces) │ -│ │ -│ • No loopback │ -│ • No external access │ -│ • socket() works but connect() fails │ -└─────────────────────────────────────────┘ -``` - -**Security properties:** -- Cannot connect to localhost services -- Cannot access local network -- Cannot exfiltrate data via network -- Optional: Enable with `.network(true)` - -### Layer 4: Mount Namespace + pivot_root - -Filesystem isolation provides a minimal, controlled view. - -``` -Host Filesystem Sandbox Filesystem -/ / -├── home/ ├── work/ ← User workspace (rw) -│ └── user/ ✗ ├── tmp/ ← Temp files (rw) -├── etc/ ├── etc/ ← Minimal config -│ └── shadow ✗ │ ├── passwd (nobody) -├── root/ ✗ │ └── hosts (localhost) -├── proc/ ✗ ├── dev/ ← Minimal devices -├── sys/ ✗ │ ├── null -├── usr/ ───────────────────┼── usr/ ← Bind mount (ro) -├── lib/ ───────────────────┼── lib/ ← Bind mount (ro) -└── lib64/ ──────────────────┼── lib64/ ← Bind mount (ro) - └── (host root unmounted) -``` - -**Security properties:** -- Cannot access /home, /root -- Cannot read /etc/shadow, /etc/passwd (host) -- Cannot access /proc (no process info) -- Cannot access /sys (no kernel info) -- Host filesystem completely unmounted - -### Layer 5: Landlock LSM - -Kernel-enforced filesystem access control (requires Linux 5.13+). 
- -```rust -// Landlock ruleset -Ruleset { - read_only: ["/usr", "/lib", "/lib64", "/bin", "/etc"], - read_write: ["/work", "/tmp"], - execute: ["/usr/bin", "/bin"], - no_access: [everything else], -} -``` - -**Landlock ABI versions:** -| ABI | Kernel | Features | -|-----|--------|----------| -| 1 | 5.13 | Basic filesystem | -| 2 | 5.19 | Truncate control | -| 3 | 6.2 | File permissions | -| 4 | 6.7 | Network TCP control | - -**Security properties:** -- Enforced at kernel level (bypass-resistant) -- Works even if mount namespace bypassed -- Cannot be disabled after application - -### Layer 6: Seccomp BPF - -Syscall filtering with immediate termination on violation. - -**Filter approach:** Whitelist (allow known-safe syscalls, kill on others) - -**Blocked syscall categories:** -| Category | Syscalls | Reason | -|----------|----------|--------| -| Namespaces | `clone(CLONE_NEW*)`, `unshare`, `setns` | Prevent new namespaces | -| Mounting | `mount`, `umount`, `pivot_root` | Prevent FS manipulation | -| Debugging | `ptrace`, `process_vm_*` | Prevent process injection | -| Kernel | `reboot`, `kexec_load`, `init_module` | Prevent system damage | -| Privilege | `setuid`, `setgid`, `setgroups` | Prevent escalation | -| Keyring | `keyctl` | Not namespaced | -| eBPF | `bpf` | Kernel attack surface | - -**Argument filtering:** -| Syscall | Blocked Arguments | Reason | -|---------|-------------------|--------| -| `clone` | `CLONE_NEWUSER`, `CLONE_NEWNET`, etc. | Block namespace creation | -| `socket` | `AF_NETLINK`, `SOCK_RAW` | Block kernel interfaces | -| `ioctl` | `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` | Block terminal injection | - -**Violation behavior:** `SECCOMP_RET_KILL_PROCESS` (SIGSYS, signal 31) - -### Layer 7: Resource Limits - -Prevent denial-of-service attacks. 
- -| Resource | Limit | Purpose | -|----------|-------|---------| -| `RLIMIT_AS` | 256 MB | Memory limit | -| `RLIMIT_NPROC` | 64 | Fork bomb prevention | -| `RLIMIT_NOFILE` | 256 | File descriptor limit | -| `RLIMIT_FSIZE` | 10 MB | Output file size | -| Timeout | 30s | CPU time limit | - ---- - -## Syscall Policy - -### Allowed Syscalls (~100) - -``` -Basic I/O: read, write, close, lseek, pread64, pwrite64 -File ops: openat, stat, fstat, access, readlink -Memory: mmap, mprotect, munmap, brk, mremap -Process: fork, vfork, execve, exit, exit_group, wait4 -Signals: rt_sigaction, rt_sigprocmask, rt_sigreturn -Time: clock_gettime, nanosleep, gettimeofday -Sockets: socket*, connect, bind, listen, accept, send*, recv* -Events: epoll_*, poll, select -``` - -### Blocked Syscalls (examples) - -``` -Dangerous: ptrace, mount, reboot, kexec_load, init_module -Namespaces: clone3, unshare, setns (blocked or filtered) -Privilege: setuid, setgid, setresuid, setresgid -Kernel: bpf, perf_event_open, keyctl -Fileless: memfd_create, execveat (together enable fileless exec) -``` - -### Special Handling - -| Syscall | Handling | -|---------|----------| -| `clone` | Allowed, but `CLONE_NEW*` flags blocked | -| `clone3` | Returns `ENOSYS` (glibc falls back to `clone`) | -| `socket` | Allowed, but `AF_NETLINK` and `SOCK_RAW` blocked | -| `ioctl` | Allowed, but `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` blocked | - ---- - -## Threat Model - -### In Scope (Protected Against) - -| Threat | Mitigation | -|--------|------------| -| **Arbitrary code execution** | Sandboxed environment | -| **Filesystem escape** | Namespaces + Landlock + pivot_root | -| **Network access** | Network namespace (empty) | -| **Process injection** | PID namespace + ptrace blocked | -| **Privilege escalation** | User namespace + seccomp | -| **Resource exhaustion** | rlimits + timeouts | -| **Fork bombs** | RLIMIT_NPROC | -| **Terminal injection** | TIOCSTI/TIOCLINUX blocked | -| **Fileless malware** | memfd_create + execveat 
blocked | - -### Out of Scope - -| Threat | Reason | -|--------|--------| -| **Kernel exploits** | Requires kernel hardening (grsecurity, etc.) | -| **Side-channel attacks** | Spectre/Meltdown require CPU mitigations | -| **Container breakout via 0-day** | Defense in depth limits impact | -| **Covert channels** | Timing-based data exfiltration possible | - -### CVE Protection - -evalbox's seccomp policy blocks attack vectors for many kernel CVEs: - -| CVE | Attack Vector | Blocked By | -|-----|---------------|------------| -| CVE-2024-1086 | AF_NETLINK + nf_tables | Socket filtering | -| CVE-2022-0185 | fsconfig + user namespace | CLONE_NEWUSER blocked | -| CVE-2022-0492 | cgroups + user namespace | CLONE_NEWUSER blocked | -| CVE-2017-5226 | TIOCSTI terminal injection | ioctl filtering | -| CVE-2019-13272 | ptrace PTRACE_TRACEME | ptrace blocked | -| CVE-2021-3490 | eBPF verifier bypass | bpf blocked | - ---- - -## Filesystem Access - -### Default Mounts - -| Path | Access | Source | Purpose | -|------|--------|--------|---------| -| `/work` | Read-Write | Workspace | User files | -| `/tmp` | Read-Write | tmpfs | Temporary files | -| `/usr` | Read-Only | Host | Binaries, libraries | -| `/lib` | Read-Only | Host | Shared libraries | -| `/lib64` | Read-Only | Host | 64-bit libraries | -| `/etc` | Read-Only | Generated | Minimal config | -| `/dev` | Read-Only | Generated | null, zero, urandom | - -### Not Mounted (Blocked) - -| Path | Contains | Risk if Accessible | -|------|----------|-------------------| -| `/home` | User data | Data theft | -| `/root` | Root home | Credential theft | -| `/proc` | Process info | Info leak, escape vectors | -| `/sys` | Kernel interfaces | Kernel manipulation | -| `/var` | System state | Log manipulation | -| `/run` | Runtime data | Socket access | - ---- - -## Verification - -### Security Tests - -Run the security test suite to verify isolation: - -```bash -# Run all security tests -cargo test -p evalbox-sandbox --test security_tests 
-- --ignored - -# Run specific category -cargo test -p evalbox-sandbox --test security_tests seccomp -- --ignored -cargo test -p evalbox-sandbox --test security_tests filesystem -- --ignored -cargo test -p evalbox-sandbox --test security_tests network -- --ignored -cargo test -p evalbox-sandbox --test security_tests cve -- --ignored -``` - -### Test Coverage - -| Category | Tests | Coverage | -|----------|-------|----------| -| Seccomp | 9 | ptrace, mount, reboot, clone, socket, keyctl, bpf | -| Filesystem | 8 | /etc/shadow, /root, path traversal, symlinks | -| Network | 5 | External, localhost, loopback, DNS | -| Resources | 7 | Timeout, memory, PIDs, output limit | -| CVE | 10 | Real-world exploits blocked | - -### Manual Verification - -```bash -# Try to read /etc/shadow (should fail) -evalbox shell "cat /etc/shadow" - -# Try to access network (should fail) -evalbox shell "curl https://example.com" - -# Try ptrace (should be killed with SIGSYS) -evalbox shell "strace ls" -``` - ---- - -## Production Requirements - -To deploy evalbox securely, ensure your system meets these requirements: - -| Requirement | How to Verify | -|-------------|---------------| -| Kernel 5.13+ with Landlock | `cat /sys/kernel/security/lsm` should include `landlock` | -| User namespaces enabled | `cat /proc/sys/kernel/unprivileged_userns_clone` should be `1` | -| Seccomp enabled | `grep SECCOMP /boot/config-$(uname -r)` | -| Unprivileged BPF disabled | `sysctl kernel.unprivileged_bpf_disabled=1` (recommended) | - -Run `evalbox check` to verify all requirements automatically. 
- ---- - -## References - -- [Architecture Overview](ARCHITECTURE.md) -- [Linux Namespaces](https://man7.org/linux/man-pages/man7/namespaces.7.html) -- [Landlock Documentation](https://docs.kernel.org/userspace-api/landlock.html) -- [Seccomp BPF](https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html) diff --git a/docs/SECURITY_MODEL.md b/docs/SECURITY_MODEL.md new file mode 100644 index 0000000..21c12a2 --- /dev/null +++ b/docs/SECURITY_MODEL.md @@ -0,0 +1,286 @@ +# evalbox Security Model + +## Defense in Depth + +evalbox uses **independent isolation mechanisms**. Each provides protection even if another is bypassed. + +``` +┌─────────────────────────────────────────────────────────────┐ +│ │ Landlock v5 │ Filesystem, Network│ +│ │ │ Signal, IPC │ +├───────────┼───────────────────────────┼─────────────────────┤ +│ │ Seccomp BPF │ Syscalls │ +├───────────┼───────────────────────────┼─────────────────────┤ +│ │ rlimits │ Resources │ +├───────────┼───────────────────────────┼─────────────────────┤ +│ │ Privilege Hardening │ NO_NEW_PRIVS, │ +│ │ │ securebits, caps │ +└───────────┴───────────────────────────┴─────────────────────┘ +``` + +--- + +## Isolation Mechanisms + +### Landlock v5 + +Kernel-enforced access control (requires Linux 6.12+, Landlock ABI 5). + +No namespaces or `pivot_root` needed — Landlock operates on real filesystem paths. 
+ +**Filesystem rules:** +``` +read-only: /usr, /lib, /lib64, /bin, /etc, /proc, /nix/store* +read-write: workspace/work, workspace/tmp, workspace/home +write: /dev (for /dev/null, /dev/zero, /dev/urandom) +no access: everything else +``` + +**Network control (ABI 4+):** +- Blocks `LANDLOCK_ACCESS_NET_BIND_TCP` +- Blocks `LANDLOCK_ACCESS_NET_CONNECT_TCP` +- Optional: enable with `.network(true)` + +**Signal isolation (ABI 5):** +- `LANDLOCK_SCOPE_SIGNAL` — blocks signals to processes outside the sandbox + +**IPC isolation (ABI 5):** +- `LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET` — blocks connections to abstract unix sockets outside the sandbox + +**Landlock ABI versions:** +| ABI | Kernel | Features | +|-----|--------|----------| +| 1 | 5.13 | Basic filesystem | +| 2 | 5.19 | Cross-directory rename/link (`LANDLOCK_ACCESS_FS_REFER`) | +| 3 | 6.2 | Truncate control (`LANDLOCK_ACCESS_FS_TRUNCATE`) | +| 4 | 6.7 | Network TCP control | +| 5 | 6.12 | Signal + abstract unix socket scoping | + +**Security properties:** +- Enforced at kernel level (bypass-resistant) +- Cannot be disabled after application +- Works unprivileged with `NO_NEW_PRIVS` + +### Seccomp BPF + +Syscall filtering with immediate termination on violation. 
+ +**Filter approach:** Whitelist (allow known-safe syscalls, kill on others) + +**Blocked syscall categories:** +| Category | Syscalls | Reason | +|----------|----------|--------| +| Namespaces | `clone(CLONE_NEW*)`, `unshare`, `setns` | Prevent new namespaces | +| Mounting | `mount`, `umount`, `pivot_root` | Prevent FS manipulation | +| Debugging | `ptrace`, `process_vm_*` | Prevent process injection | +| Kernel | `reboot`, `kexec_load`, `init_module` | Prevent system damage | +| Privilege | `setuid`, `setgid`, `setgroups` | Prevent escalation | +| Keyring | `keyctl` | Not namespaced | +| eBPF | `bpf` | Kernel attack surface | +| Fileless | `memfd_create`, `execveat` | Bypass Landlock | + +**Argument filtering:** +| Syscall | Blocked Arguments | Reason | +|---------|-------------------|--------| +| `clone` | `CLONE_NEWUSER`, `CLONE_NEWNET`, `CLONE_NEWNS`, `CLONE_NEWPID`, `CLONE_NEWIPC`, `CLONE_NEWUTS`, `CLONE_NEWCGROUP` | Block namespace creation | +| `clone3` | Entirely blocked (returns `ENOSYS`) | Cannot inspect flags in userspace struct | +| `socket` | `AF_NETLINK`, `SOCK_RAW` | Block kernel interfaces | +| `ioctl` | `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` | Block terminal injection | + +**Violation behavior:** `SECCOMP_RET_KILL_PROCESS` (SIGSYS, signal 31) + +### Resource Limits + +Prevent denial-of-service attacks via kernel-enforced rlimits. + +| Resource | Limit | Purpose | +|----------|-------|---------| +| `RLIMIT_DATA` | 256 MiB | Memory usage | +| `RLIMIT_CPU` | timeout * 2 + 60s | CPU time limit | +| `RLIMIT_FSIZE` | 16 MiB | Output file size | +| `RLIMIT_NOFILE` | 256 | File descriptor limit | +| `RLIMIT_NPROC` | 64 | Fork bomb prevention | +| `RLIMIT_CORE` | 0 | Core dumps disabled | +| `RLIMIT_STACK` | 8 MiB | Stack size | + +Note: `RLIMIT_AS` (virtual address space) is intentionally not set. Modern runtimes like Go, Java, and V8 pre-allocate large virtual ranges but only commit small portions. 
+ +### Privilege Hardening + +Permanent privilege reduction applied before seccomp: + +| Mechanism | Effect | +|-----------|--------| +| `PR_SET_NO_NEW_PRIVS` | Cannot gain privileges via exec (setuid, file caps) | +| `SECBIT_NOROOT` (locked) | Root has no special privilege | +| `SECBIT_NO_SETUID_FIXUP` (locked) | Capabilities not adjusted on UID change | +| `SECBIT_KEEP_CAPS` (locked) | Cannot retain permitted capabilities across a UID change (0 → nonzero) | +| `SECBIT_NO_CAP_AMBIENT_RAISE` (locked) | Cannot set ambient capabilities | +| Drop all 64 capabilities | No capability-based operations possible | + +--- + +## Syscall Policy + +### Allowed Syscalls (~100) + +``` +Basic I/O: read, write, close, lseek, pread64, pwrite64 +File ops: openat, stat, fstat, access, readlink +Memory: mmap, mprotect, munmap, brk, mremap +Process: fork, vfork, execve, exit, exit_group, wait4 +Signals: rt_sigaction, rt_sigprocmask, rt_sigreturn, kill, tgkill +Time: clock_gettime, nanosleep, gettimeofday +Sockets: socket*, connect, bind, listen, accept, send*, recv* +Events: epoll_*, poll, select +``` + +Note: `kill` and `tgkill` are allowed because Landlock ABI 5 provides signal scoping — signals can only reach processes within the sandbox. 
+ +### Blocked Syscalls (examples) + +``` +Dangerous: ptrace, mount, reboot, kexec_load, init_module +Namespaces: clone3, unshare, setns (blocked or filtered) +Privilege: setuid, setgid, setresuid, setresgid +Kernel: bpf, perf_event_open, keyctl +Fileless: memfd_create, execveat (together enable fileless exec) +``` + +### Special Handling + +| Syscall | Handling | +|---------|----------| +| `clone` | Allowed, but `CLONE_NEW*` flags blocked | +| `clone3` | Returns `ENOSYS` (glibc falls back to `clone`) | +| `socket` | Allowed, but `AF_NETLINK` and `SOCK_RAW` blocked | +| `ioctl` | Allowed, but `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` blocked | + +--- + +## Threat Model + +### In Scope (Protected Against) + +| Threat | Mitigation | +|--------|------------| +| **Arbitrary code execution** | Sandboxed environment | +| **Filesystem escape** | Landlock v5 path rules | +| **Network access** | Landlock network control (ABI 4+) + seccomp socket filtering | +| **Process injection** | ptrace blocked by seccomp | +| **Privilege escalation** | NO_NEW_PRIVS + seccomp + capability drop | +| **Resource exhaustion** | rlimits + timeouts | +| **Fork bombs** | RLIMIT_NPROC | +| **Terminal injection** | TIOCSTI/TIOCLINUX blocked by seccomp | +| **Fileless malware** | memfd_create + execveat blocked by seccomp | +| **Cross-sandbox signals** | Landlock signal scoping (ABI 5) | +| **Abstract unix socket abuse** | Landlock IPC scoping (ABI 5) | + +### Out of Scope + +| Threat | Reason | +|--------|--------| +| **Kernel exploits** | Requires kernel hardening (grsecurity, etc.) 
| +| **Side-channel attacks** | Spectre/Meltdown require CPU mitigations | +| **Container breakout via 0-day** | Defense in depth limits impact | +| **Covert channels** | Timing-based data exfiltration possible | + +### CVE Protection + +evalbox's seccomp policy blocks attack vectors for many kernel CVEs: + +| CVE | Attack Vector | Blocked By | +|-----|---------------|------------| +| CVE-2024-1086 | AF_NETLINK + nf_tables | Socket filtering | +| CVE-2022-0185 | fsconfig + user namespace | CLONE_NEWUSER blocked | +| CVE-2022-0492 | cgroups + user namespace | CLONE_NEWUSER blocked | +| CVE-2017-5226 | TIOCSTI terminal injection | ioctl filtering | +| CVE-2019-13272 | ptrace PTRACE_TRACEME | ptrace blocked | +| CVE-2021-3490 | eBPF verifier bypass | bpf blocked | + +--- + +## Filesystem Access + +### Accessible Paths (via Landlock) + +| Path | Access | Purpose | +|------|--------|---------| +| `workspace/work` | Read-Write | User files | +| `workspace/tmp` | Read-Write | Temporary files | +| `workspace/home` | Read-Write | Home directory | +| `/usr` | Read-Only + Execute | Binaries, libraries | +| `/lib` | Read-Only + Execute | Shared libraries | +| `/lib64` | Read-Only + Execute | 64-bit libraries | +| `/bin` | Read-Only + Execute | Binaries | +| `/etc` | Read-Only | System config | +| `/proc` | Read-Only | Process info (no execute) | +| `/dev` | Read + Write | null, zero, urandom | +| `/nix/store` | Read-Only + Execute | NixOS paths (if present) | + +### Not Accessible + +| Path | Contains | Risk if Accessible | +|------|----------|-------------------| +| `/home` (host) | User data | Data theft | +| `/root` | Root home | Credential theft | +| `/sys` | Kernel interfaces | Kernel manipulation | +| `/var` | System state | Log manipulation | +| `/run` | Runtime data | Socket access | + +--- + +## Verification + +### Security Tests + +Run the security test suite to verify isolation: + +```bash +# Run all security tests +cargo test -p evalbox-sandbox --test 
security_tests -- --ignored + +# Run specific category +cargo test -p evalbox-sandbox --test security_tests seccomp -- --ignored +cargo test -p evalbox-sandbox --test security_tests filesystem -- --ignored +cargo test -p evalbox-sandbox --test security_tests network -- --ignored +cargo test -p evalbox-sandbox --test security_tests cve -- --ignored +``` + +Or via Nix: + +```bash +nix run .#test-all +``` + +### Test Coverage + +| Category | Tests | Coverage | +|----------|-------|----------| +| Seccomp | 9 | ptrace, mount, reboot, clone, socket, keyctl, bpf | +| Filesystem | 8 | /etc/shadow, /root, path traversal, symlinks | +| Network | 5 | External, localhost, loopback, DNS | +| Resources | 7 | Timeout, memory, PIDs, output limit | +| CVE | 10 | Real-world exploits blocked | + +--- + +## Production Requirements + +| Requirement | How to Verify | +|-------------|---------------| +| Kernel 6.12+ | `uname -r` | +| Landlock ABI 5 | `cat /sys/kernel/security/lsm` should include `landlock` | +| Seccomp enabled | `grep SECCOMP /boot/config-$(uname -r)` | +| Unprivileged BPF disabled | `sysctl kernel.unprivileged_bpf_disabled` should report `1` or `2` (recommended) | + +Run `evalbox check` to verify all requirements automatically. 
+ +--- + +## References + +- [Architecture](ARCHITECTURE.md) +- [Security Policy](../SECURITY.md) +- [Landlock Documentation](https://docs.kernel.org/userspace-api/landlock.html) +- [Seccomp BPF](https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html) diff --git a/flake.lock b/flake.lock index 82db2d7..90964ff 100644 --- a/flake.lock +++ b/flake.lock @@ -1,12 +1,60 @@ { "nodes": { + "crane": { + "locked": { + "lastModified": 1771121070, + "narHash": "sha256-aIlv7FRXF9q70DNJPI237dEDAznSKaXmL5lfK/Id/bI=", + "owner": "ipetkov", + "repo": "crane", + "rev": "a2812c19f1ed2e5ed5ce2ef7109798b575c180e1", + "type": "github" + }, + "original": { + "owner": "ipetkov", + "repo": "crane", + "type": "github" + } + }, + "flake-parts": { + "inputs": { + "nixpkgs-lib": "nixpkgs-lib" + }, + "locked": { + "lastModified": 1769996383, + "narHash": "sha256-AnYjnFWgS49RlqX7LrC4uA+sCCDBj0Ry/WOJ5XWAsa0=", + "owner": "hercules-ci", + "repo": "flake-parts", + "rev": "57928607ea566b5db3ad13af0e57e921e6b12381", + "type": "github" + }, + "original": { + "owner": "hercules-ci", + "repo": "flake-parts", + "type": "github" + } + }, + "import-tree": { + "locked": { + "lastModified": 1771045967, + "narHash": "sha256-oYO4poyw0Sb/db2PigqugMlDwsvwLg6CSpFrMUWxA3Q=", + "owner": "vic", + "repo": "import-tree", + "rev": "c968d3b54d12cf5d9c13f16f7c545a06c9d1fde6", + "type": "github" + }, + "original": { + "owner": "vic", + "repo": "import-tree", + "type": "github" + } + }, "nixpkgs": { "locked": { - "lastModified": 1770562336, - "narHash": "sha256-ub1gpAONMFsT/GU2hV6ZWJjur8rJ6kKxdm9IlCT0j84=", + "lastModified": 1771008912, + "narHash": "sha256-gf2AmWVTs8lEq7z/3ZAsgnZDhWIckkb+ZnAo5RzSxJg=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "d6c71932130818840fc8fe9509cf50be8c64634f", + "rev": "a82ccc39b39b621151d6732718e3e250109076fa", "type": "github" }, "original": { @@ -16,8 +64,26 @@ "type": "github" } }, + "nixpkgs-lib": { + "locked": { + "lastModified": 1769909678, + "narHash": 
"sha256-cBEymOf4/o3FD5AZnzC3J9hLbiZ+QDT/KDuyHXVJOpM=", + "owner": "nix-community", + "repo": "nixpkgs.lib", + "rev": "72716169fe93074c333e8d0173151350670b824c", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "nixpkgs.lib", + "type": "github" + } + }, "root": { "inputs": { + "crane": "crane", + "flake-parts": "flake-parts", + "import-tree": "import-tree", "nixpkgs": "nixpkgs", "rust-overlay": "rust-overlay" } @@ -29,11 +95,11 @@ ] }, "locked": { - "lastModified": 1770865833, - "narHash": "sha256-oiARqnlvaW6pVGheVi4ye6voqCwhg5hCcGish2ZvQzI=", + "lastModified": 1771297684, + "narHash": "sha256-wieWskQxZLPlNXX06JEB0bMoS/ZYQ89xBzF0RL9lyLs=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "c8cfbe26238638e2f3a2c0ae7e8d240f5e4ded85", + "rev": "755d3669699a7c62aef35af187d75dc2728cfd85", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index e708a37..ccc5bef 100644 --- a/flake.nix +++ b/flake.nix @@ -3,47 +3,18 @@ inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - rust-overlay.url = "github:oxalica/rust-overlay"; - rust-overlay.inputs.nixpkgs.follows = "nixpkgs"; + flake-parts.url = "github:hercules-ci/flake-parts"; + import-tree.url = "github:vic/import-tree"; + rust-overlay = { + url = "github:oxalica/rust-overlay"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + crane.url = "github:ipetkov/crane"; }; - outputs = { self, nixpkgs, rust-overlay }: - let - systems = [ "x86_64-linux" "aarch64-linux" ]; - forAllSystems = nixpkgs.lib.genAttrs systems; - in - { - devShells = forAllSystems (system: - let - pkgs = import nixpkgs { - inherit system; - overlays = [ rust-overlay.overlays.default ]; - }; - - rust = pkgs.rust-bin.stable.latest.default.override { - extensions = [ "rust-src" "rust-analyzer" ]; - }; - in - { - default = pkgs.mkShell { - buildInputs = with pkgs; [ - rust - pkg-config - - mold - clang - - python3 - go - ]; - - RUST_BACKTRACE = "1"; - - shellHook = '' - echo "evalbox dev environment" - echo 
"Rust: $(rustc --version)" - ''; - }; - }); - }; + outputs = inputs: + inputs.flake-parts.lib.mkFlake { inherit inputs; } + (inputs.import-tree ./nix // { + systems = [ "x86_64-linux" ]; + }); } diff --git a/nix/devshell.nix b/nix/devshell.nix new file mode 100644 index 0000000..001172d --- /dev/null +++ b/nix/devshell.nix @@ -0,0 +1,17 @@ +{ ... }: +{ + perSystem = { pkgs, toolchainWithExtensions, ... }: { + devShells.default = pkgs.mkShell { + name = "evalbox-dev"; + buildInputs = with pkgs; [ + toolchainWithExtensions + pkg-config + gcc + python3 + go + ]; + RUST_SRC_PATH = "${toolchainWithExtensions}/lib/rustlib/src/rust/library"; + RUST_BACKTRACE = "1"; + }; + }; +} diff --git a/nix/packages.nix b/nix/packages.nix new file mode 100644 index 0000000..54b4f4c --- /dev/null +++ b/nix/packages.nix @@ -0,0 +1,57 @@ +{ ... }: +{ + perSystem = { pkgs, craneLib, src, commonArgs, cargoArtifacts, ... }: + let + srcWithPayloads = pkgs.lib.cleanSourceWith { + src = ./..; + filter = path: type: + (craneLib.filterCargoSources path type) + || (builtins.match ".*\\.c$" path != null); + }; + in { + packages = { + default = craneLib.buildPackage (commonArgs // { + inherit cargoArtifacts; + }); + + security-test-bin = craneLib.mkCargoDerivation (commonArgs // { + inherit cargoArtifacts; + src = srcWithPayloads; + pnameSuffix = "-security-tests"; + doCheck = false; + nativeBuildInputs = (commonArgs.nativeBuildInputs or []) ++ [ pkgs.jq pkgs.gcc ]; + buildPhaseCargoCommand = '' + cargo test -p evalbox-sandbox --test security_tests \ + --no-run --release --message-format=json 2>/dev/null \ + | jq -r 'select(.executable != null) | .executable' \ + > /tmp/test-bins.txt + ''; + installPhaseCommand = '' + mkdir -p $out/bin/payloads + while IFS= read -r bin; do + [ -f "$bin" ] && cp "$bin" $out/bin/ + done < /tmp/test-bins.txt + for dir in target/release/build/evalbox-sandbox-*/out/payloads; do + [ -d "$dir" ] && cp "$dir"/* $out/bin/payloads/ + done + ''; + }); + }; + + checks = { + 
clippy = craneLib.cargoClippy (commonArgs // { + inherit cargoArtifacts; + cargoClippyExtraArgs = "--all-targets -- -D warnings"; + }); + fmt = craneLib.cargoFmt { inherit src; }; + test = craneLib.cargoTest (commonArgs // { + inherit cargoArtifacts; + cargoTestExtraArgs = "--lib"; + }); + doc = craneLib.cargoDoc (commonArgs // { + inherit cargoArtifacts; + RUSTDOCFLAGS = "-D warnings"; + }); + }; + }; +} diff --git a/nix/toolchain.nix b/nix/toolchain.nix new file mode 100644 index 0000000..3b90771 --- /dev/null +++ b/nix/toolchain.nix @@ -0,0 +1,27 @@ +{ inputs, ... }: +{ + perSystem = { system, ... }: + let + pkgs = import inputs.nixpkgs { + inherit system; + overlays = [ inputs.rust-overlay.overlays.default ]; + }; + toolchain = pkgs.rust-bin.stable.latest.default; + toolchainWithExtensions = toolchain.override { + extensions = [ "rust-src" "rust-analyzer" "clippy" "rustfmt" ]; + }; + craneLib = (inputs.crane.mkLib pkgs).overrideToolchain toolchain; + src = craneLib.cleanCargoSource ./..; + crateInfo = craneLib.crateNameFromCargoToml { cargoToml = ./../Cargo.toml; }; + commonArgs = { + inherit src; + inherit (crateInfo) pname version; + nativeBuildInputs = with pkgs; [ pkg-config ]; + }; + cargoArtifacts = craneLib.buildDepsOnly commonArgs; + in { + _module.args = { + inherit pkgs craneLib toolchainWithExtensions src commonArgs cargoArtifacts; + }; + }; +}