diff --git a/.cargo/config.toml b/.cargo/config.toml
deleted file mode 100644
index d47f4ee..0000000
--- a/.cargo/config.toml
+++ /dev/null
@@ -1,13 +0,0 @@
-[target.x86_64-unknown-linux-gnu]
-linker = "clang"
-rustflags = ["-C", "link-arg=-fuse-ld=mold"]
-
-[target.aarch64-unknown-linux-gnu]
-linker = "clang"
-rustflags = ["-C", "link-arg=-fuse-ld=mold"]
-
-[build]
-rustflags = ["-C", "target-cpu=native"]
-
-[term]
-color = "always"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c65f903..e7a297e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -6,45 +6,52 @@ on:
   pull_request:
     branches: [main]
 
-env:
-  CARGO_TERM_COLOR: always
-  RUSTFLAGS: -Dwarnings
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 jobs:
-  fmt:
-    name: Format
+  nix-checks:
+    name: Nix Checks
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+      contents: read
     steps:
       - uses: actions/checkout@v4
-      - run: rustup component add rustfmt
-      - run: cargo fmt --all --check
+      - uses: DeterminateSystems/determinate-nix-action@v3
+      - uses: DeterminateSystems/magic-nix-cache-action@main
+      - name: Run checks (clippy, fmt, test, doc)
+        run: nix flake check -L
 
-  clippy:
-    name: Clippy
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - run: rustup component add clippy
-      - uses: Swatinem/rust-cache@v2
-      - run: cargo clippy --all-targets --all-features
-
-  doc:
-    name: Documentation
-    runs-on: ubuntu-latest
-    env:
-      RUSTDOCFLAGS: -Dwarnings
-    steps:
-      - uses: actions/checkout@v4
-      - uses: Swatinem/rust-cache@v2
-      - run: cargo doc --no-deps --all-features
-
-  unit-tests:
-    name: Unit Tests
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: Swatinem/rust-cache@v2
-      - run: cargo test --lib --all-features
+  # E2E security tests require kernel 6.12+ (Landlock ABI v5).
+  # GHA ubuntu-latest currently ships 6.11; image 20260209 has 6.14 but hasn't propagated yet.
+  # Uncomment when runners get kernel 6.12+.
+  #
+  # e2e:
+  #   name: E2E (${{ matrix.distro }})
+  #   runs-on: ubuntu-latest
+  #   needs: nix-checks
+  #   permissions:
+  #     id-token: write
+  #     contents: read
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       distro: [ubuntu:24.04, fedora:41, alpine:3.21]
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - uses: DeterminateSystems/determinate-nix-action@v3
+  #     - uses: DeterminateSystems/magic-nix-cache-action@main
+  #     - name: Build security test binary
+  #       run: nix build -L .#security-test-bin
+  #     - name: Run security tests in ${{ matrix.distro }}
+  #       run: |
+  #         TEST_BIN=$(realpath result/bin/security_tests-*)
+  #         docker run --rm --privileged \
+  #           -v /nix/store:/nix/store:ro \
+  #           ${{ matrix.distro }} \
+  #           "$TEST_BIN" --ignored --test-threads=1
 
   semver-check:
     name: SemVer Check
@@ -52,7 +59,6 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: Swatinem/rust-cache@v2
-      - name: Check semver
-        uses: obi1kenobi/cargo-semver-checks-action@v2
+      - uses: obi1kenobi/cargo-semver-checks-action@v2
         with:
           package: evalbox
diff --git a/.gitignore b/.gitignore
index 73e2f03..a803b6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
 .direnv
 target/
-bindings/
\ No newline at end of file
+bindings/
+
+# Local cargo config
+.cargo/
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4408a55..fdf27b1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,40 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.1] - 2026-02-22
+
+### Changed
+
+- **Architecture: remove namespace isolation, use Landlock v5 as primary**
+  - Removed user, PID, network, mount, UTS, and IPC namespaces
+  - Removed `pivot_root` and bind mount rootfs setup
+  - Landlock v5 replaces namespaces for filesystem, network, signal, and IPC control
+  - Plain `fork()` instead of `clone()` with `CLONE_NEW*` flags
+  - Minimum kernel raised from 5.13 to 6.12 (Landlock ABI 5)
+
+- **Resource limits moved to dedicated module** (`isolation/rlimits.rs`)
+  - `RLIMIT_DATA` (256 MiB) replaces `RLIMIT_AS`, which breaks Go/Java/V8 runtimes
+  - Added `RLIMIT_CPU`, `RLIMIT_CORE`, `RLIMIT_STACK`
+
+- **Nix flake migrated to flake-parts + import-tree**
+  - Auto-discovery of modules via `import-tree ./nix`
+  - Removed manual `forAllSystems` boilerplate
+  - Restricted to `x86_64-linux` (arm not yet supported)
+
+### Added
+
+- Seccomp user notify support (`SECCOMP_RET_USER_NOTIF`) for optional syscall interception
+- `nix run .#test-all` to run the full security test suite
+- `SECURITY.md` — GitHub standard vulnerability reporting policy
+- `CONTRIBUTING.md` — development setup, testing guide
+- Security hardening roadmap (UDP filtering, /proc restriction, optional PID namespace)
+
+### Removed
+
+- `crates/evalbox-sandbox/src/isolation/namespace.rs` — namespace setup
+- `crates/evalbox-sandbox/src/isolation/rootfs.rs` — pivot_root + bind mounts
+- `nix/lib.nix`, `nix/checks.nix`, `nix/tests/` — replaced by flake-parts modules
+
 ## [0.1.0] - 2025-02-17
 
 ### Added
@@ -19,14 +53,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - Go runtime with compilation caching
   - Shell runtime for script execution
 
-- **Security isolation** (7 layers of defense)
-  - User namespaces (unprivileged containers)
-  - PID namespace (process isolation)
-  - Network namespace (network isolation)
-  - Mount namespace + pivot_root (filesystem isolation)
-  - Landlock LSM (kernel-enforced filesystem rules)
+- **Security isolation**
+  - Landlock v5 (filesystem, network, signal, IPC access control)
   - Seccomp BPF (syscall whitelist with ~100 allowed syscalls)
   - Resource limits (memory, PIDs, file descriptors, timeout)
+  - Privilege hardening (NO_NEW_PRIVS, securebits, capability drop)
 
 - **Seccomp filtering**
   - Whitelist-based syscall filter
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..dd919ef
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,80 @@
+# Contributing
+
+## Development Setup
+
+evalbox uses Nix for a reproducible dev environment:
+
+```bash
+nix develop
+```
+
+This provides the Rust toolchain, GCC (for test payloads), Python, and Go.
+
+## Building
+
+```bash
+cargo build
+```
+
+## Testing
+
+### Fast checks (CI)
+
+```bash
+# Runs via nix: clippy, fmt, unit tests, docs
+nix flake check
+```
+
+Or manually:
+
+```bash
+cargo clippy --all-targets -- -D warnings
+cargo fmt --check
+cargo test --lib
+cargo doc --no-deps
+```
+
+### Full test suite (requires Linux 6.12+ with Landlock ABI 5)
+
+```bash
+nix run .#test-all
+```
+
+Or manually:
+
+```bash
+cargo build -p evalbox-sandbox
+cargo test -p evalbox-sandbox --test security_tests -- --ignored --test-threads=1
+```
+
+The security tests require Linux kernel 6.12+ (Landlock ABI 5) with seccomp enabled; user namespaces are no longer needed. They compile C payloads that attempt real exploit techniques (CVEs, syscall abuse, escape vectors) and verify the sandbox blocks them.
+
+### Running specific test categories
+
+```bash
+cargo test -p evalbox-sandbox --test security_tests seccomp -- --ignored
+cargo test -p evalbox-sandbox --test security_tests filesystem -- --ignored
+cargo test -p evalbox-sandbox --test security_tests network -- --ignored
+cargo test -p evalbox-sandbox --test security_tests cve -- --ignored
+cargo test -p evalbox-sandbox --test security_tests resources -- --ignored
+```
+
+## Project Structure
+
+```
+evalbox/                  # Public API, language runtimes
+evalbox-sandbox/          # Sandbox orchestration, isolation
+evalbox-sys/              # Low-level Linux syscall wrappers
+```
+
+See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for details.
+
+## Pull Requests
+
+- Run `nix flake check` before submitting
+- Security-related changes should include tests in `crates/evalbox-sandbox/tests/security/`
+- Keep the seccomp whitelist minimal: don't add syscalls without justification
+
+## Security
+
+Found a vulnerability? See [SECURITY.md](SECURITY.md) for reporting instructions.
diff --git a/Cargo.lock b/Cargo.lock
index 9e06bdb..bb2e97b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -59,12 +59,6 @@ dependencies = [
  "parking_lot_core",
 ]
 
-[[package]]
-name = "either"
-version = "1.15.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
-
 [[package]]
 name = "env_home"
 version = "0.1.0"
@@ -101,7 +95,7 @@ dependencies = [
  "tempfile",
  "thiserror",
  "walkdir",
- "which 7.0.3",
+ "which",
 ]
 
 [[package]]
@@ -115,7 +109,7 @@ dependencies = [
  "rustix",
  "tempfile",
  "thiserror",
- "which 8.0.0",
+ "which",
 ]
 
 [[package]]
@@ -610,18 +604,6 @@ dependencies = [
  "semver",
 ]
 
-[[package]]
-name = "which"
-version = "7.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24d643ce3fd3e5b54854602a080f34fb10ab75e0b813ee32d00ca2b44fa74762"
-dependencies = [
- "either",
- "env_home",
- "rustix",
- "winsafe",
-]
-
 [[package]]
 name = "which"
 version = "8.0.0"
diff --git a/Cargo.toml b/Cargo.toml
index bc6800c..96b7910 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,9 @@ members = [
     "crates/evalbox-sandbox",
 ]
 
+[workspace.metadata.crane]
+name = "evalbox"
+
 [workspace.package]
 version = "0.1.0"
 edition = "2024"
@@ -30,7 +33,7 @@ rustix = { version = "1", features = ["event", "process", "system", "mount", "fs
 thiserror = "2"
 tempfile = "3"
 mio = { version = "1.0", features = ["os-poll", "os-ext"] }
-which = "7"
+which = "8"
 
 [workspace.lints.rust]
 unsafe_op_in_unsafe_fn = "warn"
@@ -39,7 +42,6 @@ unused_must_use = "warn"
 
 [workspace.lints.clippy]
 all = { level = "warn", priority = -1 }
-# Useful pedantic lints (not all)
 cast_possible_truncation = "warn"
 cast_sign_loss = "warn"
 cloned_instead_of_copied = "warn"
diff --git a/README.md b/README.md
index 379b8bf..9dae5a0 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Execute code like `eval()`, but safe. No containers, no VMs, no root.
 - **Simple** - One function call, security handled for you
 - **Multi-language** - Python, Go, and shell/terminal commands
 - **Fast** - Millisecond startup, no containers or VMs
-- **Secure** - 7 layers of isolation (namespaces, Landlock, seccomp, rlimits)
+- **Secure** - Landlock v5 + seccomp-BPF + rlimits, no namespaces needed
 
 ## Quick Start
 
@@ -38,8 +38,8 @@ let output = shell::run("curl https://example.com")
 
 ## Requirements
 
-- Linux kernel 5.13+ (Landlock ABI 1+)
-- User namespaces enabled
+- Linux kernel 6.12+ (Landlock ABI 5)
+- Seccomp enabled
 
 ## Installation
 
@@ -50,15 +50,16 @@ evalbox = { version = "0.1", features = ["python", "go", "shell"] }
 
 ## Security
 
-7 layers of isolation: user namespaces, PID namespace, network namespace, mount namespace + pivot_root, Landlock LSM, seccomp BPF, rlimits.
+Isolation via Landlock v5 (filesystem + network + signal + IPC scoping), seccomp-BPF (syscall whitelist), rlimits, and privilege hardening (NO_NEW_PRIVS, securebits, capability drop).
 
-See [SECURITY.md](docs/SECURITY.md) for threat model and CVE protections.
+See [Security Model](docs/SECURITY_MODEL.md) for threat model and CVE protections.
 
 ## Documentation
 
 - [Architecture](docs/ARCHITECTURE.md)
-- [Security Model](docs/SECURITY.md)
+- [Security Model](docs/SECURITY_MODEL.md)
 - [Roadmap](docs/ROADMAP.md)
+- [Contributing](CONTRIBUTING.md)
 
 ## License
 
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000..0ce141b
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,42 @@
+# Security Policy
+
+## Supported Versions
+
+| Version | Supported |
+|---------|-----------|
+| 0.1.x   | Yes       |
+
+## Reporting a Vulnerability
+
+If you discover a security vulnerability in evalbox, **please do not open a public issue.**
+
+Instead, report it privately via [GitHub Security Advisories](https://github.com/fullzer4/evalbox/security/advisories/new).
+
+Include:
+- Description of the vulnerability
+- Steps to reproduce
+- Which isolation mechanism is affected (Landlock, seccomp, rlimits, privilege hardening)
+- Impact assessment (sandbox escape, info leak, DoS, etc.)
+
+You should receive a response within **72 hours**. Critical sandbox escape vulnerabilities are treated as highest priority.
+
+## Scope
+
+evalbox provides isolation via Landlock v5, seccomp-BPF, rlimits, and privilege hardening. The following are in scope for security reports:
+
+- Sandbox escape (code executing outside isolation)
+- Filesystem access beyond Landlock-allowed paths
+- Network access when networking is disabled for the sandbox
+- Privilege escalation from sandbox
+- Seccomp filter bypass
+- Landlock rule bypass
+- Resource limit bypass (memory, PIDs, file descriptors)
+
+See [docs/SECURITY_MODEL.md](docs/SECURITY_MODEL.md) for the full threat model and isolation architecture.
+
+## Out of Scope
+
+- Kernel 0-day exploits (requires kernel hardening)
+- CPU side-channel attacks (Spectre/Meltdown)
+- Denial of service against the host kernel
+- Issues requiring non-default kernel configurations
diff --git a/crates/evalbox-sandbox/Cargo.toml b/crates/evalbox-sandbox/Cargo.toml
index 482e35e..7b3224e 100644
--- a/crates/evalbox-sandbox/Cargo.toml
+++ b/crates/evalbox-sandbox/Cargo.toml
@@ -14,7 +14,7 @@ rustix.workspace = true
 tempfile.workspace = true
 mio.workspace = true
 thiserror.workspace = true
-which = "8"
+which.workspace = true
 
 [build-dependencies]
 cc = "1.2"
diff --git a/crates/evalbox-sandbox/src/executor.rs b/crates/evalbox-sandbox/src/executor.rs
index 7d659de..57512fb 100644
--- a/crates/evalbox-sandbox/src/executor.rs
+++ b/crates/evalbox-sandbox/src/executor.rs
@@ -39,7 +39,7 @@ use std::collections::HashMap;
 use std::ffi::CString;
 use std::io::{self, Write as _};
 use std::os::fd::{AsRawFd, OwnedFd, RawFd};
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
 use std::time::{Duration, Instant};
 
 use mio::unix::SourceFd;
@@ -48,14 +48,16 @@ use rustix::io::Errno;
 use rustix::process::{Pid, PidfdFlags, Signal, pidfd_open, pidfd_send_signal};
 use thiserror::Error;
 
-use evalbox_sys::{check, last_errno};
-
-use crate::isolation::{
-    LockdownError, bind_mount, lockdown, make_rprivate, mount_minimal_dev, mount_proc,
-    pivot_root_and_cleanup, set_hostname, setup_id_maps,
+use evalbox_sys::seccomp::{
+    DEFAULT_WHITELIST, NOTIFY_FS_SYSCALLS, SockFprog, build_notify_filter, build_whitelist_filter,
 };
+use evalbox_sys::seccomp_notify::seccomp_set_mode_filter_listener;
+use evalbox_sys::{check, last_errno, seccomp::seccomp_set_mode_filter};
+
+use crate::isolation::{LockdownError, close_extra_fds, lockdown};
 use crate::monitor::{Output, Status, monitor, set_nonblocking, wait_for_exit, write_stdin};
-use crate::plan::{Mount, Plan};
+use crate::notify::scm_rights;
+use crate::plan::{Mount, NotifyMode, Plan};
 use crate::resolve::{ResolvedBinary, resolve_binary};
 use crate::validate::validate_cmd;
 use crate::workspace::Workspace;
@@ -75,15 +77,6 @@ pub enum ExecutorError {
     #[error("fork: {0}")]
     Fork(Errno),
 
-    #[error("unshare: {0}")]
-    Unshare(Errno),
-
-    #[error("id map: {0}")]
-    IdMap(io::Error),
-
-    #[error("rootfs: {0}")]
-    Rootfs(Errno),
-
     #[error("lockdown: {0}")]
     Lockdown(#[from] LockdownError),
 
@@ -102,6 +95,9 @@ pub enum ExecutorError {
     #[error("command not found: {0}")]
     CommandNotFound(String),
 
+    #[error("seccomp notify: {0}")]
+    SeccompNotify(String),
+
     #[error("io: {0}")]
     Io(#[from] io::Error),
 }
@@ -155,12 +151,17 @@ impl ExecutionInfo {
 }
 
 /// A spawned sandbox that hasn't been waited on yet.
+///
+/// Some fields are never read but kept alive for RAII (fd lifetime, temp dir cleanup).
+#[allow(dead_code)]
 struct SpawnedSandbox {
     pidfd: OwnedFd,
     stdin_fd: RawFd,
     stdout_fd: RawFd,
     stderr_fd: RawFd,
-    #[allow(dead_code)]
+    /// Seccomp listener fd kept alive for RAII; future supervisor integration.
+    notify_fd: Option<OwnedFd>,
+    /// Workspace kept alive so temp directory isn't deleted while sandbox runs.
     workspace: std::mem::ManuallyDrop<Workspace>,
 }
 
@@ -235,15 +236,22 @@ impl Executor {
 
         let workspace = Workspace::with_prefix("evalbox-").map_err(ExecutorError::Workspace)?;
 
+        workspace
+            .setup_sandbox_dirs()
+            .map_err(ExecutorError::Workspace)?;
         for file in &plan.user_files {
+            let work_path = format!("work/{}", file.path);
             workspace
-                .write_file(&file.path, &file.content, file.executable)
+                .write_file(&work_path, &file.content, file.executable)
                 .map_err(ExecutorError::Workspace)?;
         }
-        workspace
-            .setup_sandbox_dirs()
-            .map_err(ExecutorError::Workspace)?;
-        create_mount_dirs(&workspace, &exec_info, &plan)?;
+
+        // Create socketpair for notify fd transfer (if needed)
+        let notify_sockets = if plan.notify_mode != NotifyMode::Disabled {
+            Some(scm_rights::create_socketpair().map_err(ExecutorError::Workspace)?)
+        } else {
+            None
+        };
 
         let child_pid = unsafe { libc::fork() };
         if child_pid < 0 {
@@ -251,7 +259,9 @@ impl Executor {
         }
 
         if child_pid == 0 {
-            match child_process(&workspace, &plan, &exec_info) {
+            // In child: close parent's socket end
+            let child_socket = notify_sockets.map(|(_, child)| child);
+            match child_process(&workspace, &plan, &exec_info, child_socket.as_ref()) {
                 Ok(()) => unsafe { libc::_exit(127) },
                 Err(e) => {
                     writeln!(io::stderr(), "sandbox error: {e}").ok();
@@ -263,7 +273,22 @@ impl Executor {
         let pid = unsafe { Pid::from_raw_unchecked(child_pid) };
         let pidfd = pidfd_open(pid, PidfdFlags::empty()).map_err(ExecutorError::Pidfd)?;
 
-        blocking_parent(child_pid, pidfd, workspace, plan)
+        // Parent: receive notify fd if applicable
+        let notify_fd = if let Some((parent_socket, _)) = notify_sockets {
+            poll_or_kill(
+                parent_socket.as_raw_fd(),
+                child_pid,
+                "timeout waiting for notify fd",
+            )?;
+            Some(
+                scm_rights::recv_fd(parent_socket.as_raw_fd())
+                    .map_err(|e| ExecutorError::SeccompNotify(e.to_string()))?,
+            )
+        } else {
+            None
+        };
+
+        blocking_parent(child_pid, pidfd, notify_fd, workspace, plan)
     }
 
     /// Spawn a new sandbox. Returns immediately with a [`SandboxId`].
@@ -534,6 +559,50 @@ impl Executor {
     }
 }
 
+/// Close (by raw fd, ignoring `close` errors) the parent's copies of the pipe ends only the child uses: stdin read, stdout write, stderr write.
+fn close_parent_pipe_ends(workspace: &Workspace) {
+    unsafe {
+        libc::close(workspace.pipes.stdin.read.as_raw_fd());
+        libc::close(workspace.pipes.stdout.write.as_raw_fd());
+        libc::close(workspace.pipes.stderr.write.as_raw_fd());
+    } // NOTE(review): `workspace` is only borrowed; confirm these fds are not closed a second time when it is dropped.
+}
+
+/// Wait up to 30 s for `fd` to report an event; if `poll` times out or fails, SIGKILL `child_pid` and return `ChildSetup(msg)`.
+fn poll_or_kill(fd: RawFd, child_pid: libc::pid_t, msg: &str) -> Result<(), ExecutorError> {
+    let mut pfd = libc::pollfd {
+        fd,
+        events: libc::POLLIN,
+        revents: 0,
+    };
+    if unsafe { libc::poll(&mut pfd, 1, 30000) } <= 0 {
+        unsafe { libc::kill(child_pid, libc::SIGKILL) }; // reached on timeout (0) and on any poll error (-1), EINTR included
+        return Err(ExecutorError::ChildSetup(msg.into()));
+    }
+    Ok(()) // `revents` is not inspected: any event poll reports counts as ready
+}
+
+/// Two-way eventfd handshake: wait (via `poll_or_kill`) for the child's ready signal, then write 1 to `parent_done_fd`; SIGKILL the child and return `ChildSetup` on any failure.
+fn sync_with_child(workspace: &Workspace, child_pid: libc::pid_t) -> Result<(), ExecutorError> {
+    let child_ready_fd = workspace.pipes.sync.child_ready_fd();
+    poll_or_kill(child_ready_fd, child_pid, "timeout waiting for child")?;
+    // Consume the 8-byte value the child wrote; anything other than a full 8-byte read is treated as a failed setup.
+    let mut value: u64 = 0;
+    if unsafe { libc::read(child_ready_fd, (&mut value as *mut u64).cast(), 8) } != 8 {
+        unsafe { libc::kill(child_pid, libc::SIGKILL) };
+        return Err(ExecutorError::ChildSetup("eventfd read failed".into()));
+    }
+    // Release the child, which reads `parent_done_fd` before it continues towards exec.
+    let parent_done_fd = workspace.pipes.sync.parent_done_fd();
+    let signal_value: u64 = 1;
+    if unsafe { libc::write(parent_done_fd, (&signal_value as *const u64).cast(), 8) } != 8 {
+        unsafe { libc::kill(child_pid, libc::SIGKILL) };
+        return Err(ExecutorError::ChildSetup("eventfd write failed".into()));
+    }
+
+    Ok(())
+}
+
 fn spawn_sandbox(plan: Plan) -> Result<SpawnedSandbox, ExecutorError> {
     let cmd_refs: Vec<&str> = plan.cmd.iter().map(|s| s.as_str()).collect();
     validate_cmd(&cmd_refs).map_err(ExecutorError::Validation)?;
@@ -552,15 +621,22 @@ fn spawn_sandbox(plan: Plan) -> Result<SpawnedSandbox, ExecutorError> {
 
     let workspace = Workspace::with_prefix("evalbox-").map_err(ExecutorError::Workspace)?;
 
+    workspace
+        .setup_sandbox_dirs()
+        .map_err(ExecutorError::Workspace)?;
     for file in &plan.user_files {
+        let work_path = format!("work/{}", file.path);
         workspace
-            .write_file(&file.path, &file.content, file.executable)
+            .write_file(&work_path, &file.content, file.executable)
             .map_err(ExecutorError::Workspace)?;
     }
-    workspace
-        .setup_sandbox_dirs()
-        .map_err(ExecutorError::Workspace)?;
-    create_mount_dirs(&workspace, &exec_info, &plan)?;
+
+    // Create socketpair for notify fd transfer (if needed)
+    let notify_sockets = if plan.notify_mode != NotifyMode::Disabled {
+        Some(scm_rights::create_socketpair().map_err(ExecutorError::Workspace)?)
+    } else {
+        None
+    };
 
     let child_pid = unsafe { libc::fork() };
     if child_pid < 0 {
@@ -568,7 +644,8 @@ fn spawn_sandbox(plan: Plan) -> Result<SpawnedSandbox, ExecutorError> {
     }
 
     if child_pid == 0 {
-        match child_process(&workspace, &plan, &exec_info) {
+        let child_socket = notify_sockets.map(|(_, child)| child);
+        match child_process(&workspace, &plan, &exec_info, child_socket.as_ref()) {
             Ok(()) => unsafe { libc::_exit(127) },
             Err(e) => {
                 writeln!(io::stderr(), "sandbox error: {e}").ok();
@@ -580,47 +657,28 @@ fn spawn_sandbox(plan: Plan) -> Result<SpawnedSandbox, ExecutorError> {
     let pid = unsafe { Pid::from_raw_unchecked(child_pid) };
     let pidfd = pidfd_open(pid, PidfdFlags::empty()).map_err(ExecutorError::Pidfd)?;
 
-    // Parent: close unused pipe ends
     let stdin_write_fd = workspace.pipes.stdin.write.as_raw_fd();
     let stdout_read_fd = workspace.pipes.stdout.read.as_raw_fd();
     let stderr_read_fd = workspace.pipes.stderr.read.as_raw_fd();
 
-    unsafe {
-        libc::close(workspace.pipes.stdin.read.as_raw_fd());
-        libc::close(workspace.pipes.stdout.write.as_raw_fd());
-        libc::close(workspace.pipes.stderr.write.as_raw_fd());
-    }
+    close_parent_pipe_ends(&workspace);
 
-    // Wait for child to signal readiness
-    let child_ready_fd = workspace.pipes.sync.child_ready_fd();
-    let mut pfd = libc::pollfd {
-        fd: child_ready_fd,
-        events: libc::POLLIN,
-        revents: 0,
+    // Receive notify fd from child if applicable
+    let notify_fd = if let Some((parent_socket, _)) = notify_sockets {
+        poll_or_kill(
+            parent_socket.as_raw_fd(),
+            child_pid,
+            "timeout waiting for notify fd",
+        )?;
+        Some(
+            scm_rights::recv_fd(parent_socket.as_raw_fd())
+                .map_err(|e| ExecutorError::SeccompNotify(e.to_string()))?,
+        )
+    } else {
+        None
     };
 
-    if unsafe { libc::poll(&mut pfd, 1, 30000) } <= 0 {
-        unsafe { libc::kill(child_pid, libc::SIGKILL) };
-        return Err(ExecutorError::ChildSetup(
-            "timeout waiting for child".into(),
-        ));
-    }
-
-    let mut value: u64 = 0;
-    if unsafe { libc::read(child_ready_fd, (&mut value as *mut u64).cast(), 8) } != 8 {
-        unsafe { libc::kill(child_pid, libc::SIGKILL) };
-        return Err(ExecutorError::ChildSetup("eventfd read failed".into()));
-    }
-
-    setup_id_maps(child_pid).map_err(ExecutorError::IdMap)?;
-
-    // Signal child to continue
-    let parent_done_fd = workspace.pipes.sync.parent_done_fd();
-    let signal_value: u64 = 1;
-    if unsafe { libc::write(parent_done_fd, (&signal_value as *const u64).cast(), 8) } != 8 {
-        unsafe { libc::kill(child_pid, libc::SIGKILL) };
-        return Err(ExecutorError::ChildSetup("eventfd write failed".into()));
-    }
+    sync_with_child(&workspace, child_pid)?;
 
     // Write stdin if provided
     if let Some(ref stdin_data) = plan.stdin {
@@ -647,6 +705,7 @@ fn spawn_sandbox(plan: Plan) -> Result<SpawnedSandbox, ExecutorError> {
         },
         stdout_fd: stdout_read_fd,
         stderr_fd: stderr_read_fd,
+        notify_fd,
         workspace: std::mem::ManuallyDrop::new(workspace),
     })
 }
@@ -654,45 +713,15 @@ fn spawn_sandbox(plan: Plan) -> Result<SpawnedSandbox, ExecutorError> {
 fn blocking_parent(
     child_pid: libc::pid_t,
     pidfd: OwnedFd,
+    _notify_fd: Option<OwnedFd>,
     workspace: Workspace,
     plan: Plan,
 ) -> Result<Output, ExecutorError> {
     let workspace = std::mem::ManuallyDrop::new(workspace);
 
-    unsafe {
-        libc::close(workspace.pipes.stdin.read.as_raw_fd());
-        libc::close(workspace.pipes.stdout.write.as_raw_fd());
-        libc::close(workspace.pipes.stderr.write.as_raw_fd());
-    }
+    close_parent_pipe_ends(&workspace);
 
-    let child_ready_fd = workspace.pipes.sync.child_ready_fd();
-    let mut pfd = libc::pollfd {
-        fd: child_ready_fd,
-        events: libc::POLLIN,
-        revents: 0,
-    };
-
-    if unsafe { libc::poll(&mut pfd, 1, 30000) } <= 0 {
-        unsafe { libc::kill(child_pid, libc::SIGKILL) };
-        return Err(ExecutorError::ChildSetup(
-            "timeout waiting for child".into(),
-        ));
-    }
-
-    let mut value: u64 = 0;
-    if unsafe { libc::read(child_ready_fd, (&mut value as *mut u64).cast(), 8) } != 8 {
-        unsafe { libc::kill(child_pid, libc::SIGKILL) };
-        return Err(ExecutorError::ChildSetup("eventfd read failed".into()));
-    }
-
-    setup_id_maps(child_pid).map_err(ExecutorError::IdMap)?;
-
-    let parent_done_fd = workspace.pipes.sync.parent_done_fd();
-    let signal_value: u64 = 1;
-    if unsafe { libc::write(parent_done_fd, (&signal_value as *const u64).cast(), 8) } != 8 {
-        unsafe { libc::kill(child_pid, libc::SIGKILL) };
-        return Err(ExecutorError::ChildSetup("eventfd write failed".into()));
-    }
+    sync_with_child(&workspace, child_pid)?;
 
     if let Some(ref stdin_data) = plan.stdin {
         write_stdin(&workspace, stdin_data).map_err(ExecutorError::Monitor)?;
@@ -711,100 +740,93 @@ fn blocking_parent(
     result
 }
 
+/// Child process flow (runs after fork in the child).
+///
+/// 1. Close parent pipe ends
+/// 2. Setup stdio (dup2 stdin/stdout/stderr)
+/// 3. chdir(workspace/work)
+/// 4. Landlock v5 + rlimits + securebits + drop caps (lockdown)
+/// 5. If `notify_mode` != Disabled: install notify filter, send listener fd
+/// 6. Install kill seccomp filter (whitelist)
+/// 7. Signal parent readiness
+/// 8. Wait for parent signal
+/// 9. `close_range(3, MAX, 0)`
+/// 10. execve
 fn child_process(
     workspace: &Workspace,
     plan: &Plan,
     exec_info: &ExecutionInfo,
+    notify_socket: Option<&OwnedFd>,
 ) -> Result<(), ExecutorError> {
+    // 1. Close parent pipe ends
     unsafe {
         libc::close(workspace.pipes.stdin.write.as_raw_fd());
         libc::close(workspace.pipes.stdout.read.as_raw_fd());
         libc::close(workspace.pipes.stderr.read.as_raw_fd());
     }
 
-    if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
-        return Err(ExecutorError::Unshare(last_errno()));
+    // 2. Setup stdio
+    setup_stdio(workspace)?;
+
+    // 3. chdir to workspace/work
+    let work_dir = workspace.root().join("work");
+    let work_cstr = CString::new(work_dir.to_string_lossy().as_bytes())
+        .map_err(|_| ExecutorError::Exec(Errno::INVAL))?;
+    if unsafe { libc::chdir(work_cstr.as_ptr()) } != 0 {
+        return Err(ExecutorError::Exec(last_errno()));
+    }
+
+    // 4. Apply lockdown (Landlock v5 + rlimits + securebits + drop caps)
+    let extra_paths: Vec<&str> = exec_info
+        .extra_mounts
+        .iter()
+        .filter_map(|m| m.source.to_str())
+        .collect();
+    lockdown(plan, workspace.root(), &extra_paths).map_err(ExecutorError::Lockdown)?;
+
+    // 5. If notify mode != Disabled: install notify seccomp filter, send listener fd
+    if plan.notify_mode != NotifyMode::Disabled {
+        let notify_filter = build_notify_filter(NOTIFY_FS_SYSCALLS);
+        let fprog = SockFprog {
+            len: notify_filter.len() as u16,
+            filter: notify_filter.as_ptr(),
+        };
+        let listener_fd = unsafe { seccomp_set_mode_filter_listener(&fprog) }.map_err(|e| {
+            ExecutorError::SeccompNotify(format!("failed to install notify filter: {e}"))
+        })?;
+
+        // Send listener fd to parent via SCM_RIGHTS
+        if let Some(sock) = notify_socket {
+            scm_rights::send_fd(sock.as_raw_fd(), listener_fd.as_raw_fd()).map_err(|e| {
+                ExecutorError::SeccompNotify(format!("failed to send listener fd: {e}"))
+            })?;
+        }
     }
 
+    // 6. Install kill seccomp filter (whitelist)
+    apply_seccomp(plan)?;
+
+    // 7. Signal parent readiness
     let child_ready_fd = workspace.pipes.sync.child_ready_fd();
     let signal_value: u64 = 1;
     if unsafe { libc::write(child_ready_fd, (&signal_value as *const u64).cast(), 8) } != 8 {
         return Err(ExecutorError::ChildSetup("eventfd write failed".into()));
     }
 
+    // 8. Wait for parent signal
     let parent_done_fd = workspace.pipes.sync.parent_done_fd();
     let mut value: u64 = 0;
     if unsafe { libc::read(parent_done_fd, (&mut value as *mut u64).cast(), 8) } != 8 {
         return Err(ExecutorError::ChildSetup("eventfd read failed".into()));
     }
 
-    if unsafe { libc::unshare(libc::CLONE_NEWNS | libc::CLONE_NEWUTS | libc::CLONE_NEWIPC) } != 0 {
-        return Err(ExecutorError::Unshare(last_errno()));
-    }
-
-    setup_rootfs(workspace, plan, exec_info)?;
-    setup_stdio(workspace)?;
-
-    let extra_paths: Vec<&str> = exec_info
-        .extra_mounts
-        .iter()
-        .filter_map(|m| m.target.to_str())
-        .collect();
-    lockdown(plan, None, &extra_paths).map_err(ExecutorError::Lockdown)?;
-
-    let cwd = CString::new(plan.cwd.as_bytes()).map_err(|_| ExecutorError::Exec(Errno::INVAL))?;
-    if unsafe { libc::chdir(cwd.as_ptr()) } != 0 {
-        return Err(ExecutorError::Exec(last_errno()));
-    }
+    // 9. Close all fds except 0,1,2
+    close_extra_fds();
 
+    // 10. execve
     exec_command(plan, exec_info)
 }
 
-fn setup_rootfs(
-    workspace: &Workspace,
-    plan: &Plan,
-    exec_info: &ExecutionInfo,
-) -> Result<(), ExecutorError> {
-    let sandbox_root = workspace.root();
-
-    make_rprivate().map_err(ExecutorError::Rootfs)?;
-
-    for mount in &exec_info.extra_mounts {
-        let target = sandbox_root.join(mount.target.strip_prefix("/").unwrap_or(&mount.target));
-        if mount.source.exists() {
-            bind_mount(&mount.source, &target, !mount.writable).map_err(ExecutorError::Rootfs)?;
-        }
-    }
-
-    for mount in &plan.mounts {
-        let target = sandbox_root.join(mount.target.strip_prefix("/").unwrap_or(&mount.target));
-        if let Some(parent) = target.parent() {
-            std::fs::create_dir_all(parent).map_err(ExecutorError::Workspace)?;
-        }
-        std::fs::create_dir_all(&target).map_err(ExecutorError::Workspace)?;
-        if mount.source.exists() {
-            bind_mount(&mount.source, &target, !mount.writable).map_err(ExecutorError::Rootfs)?;
-        }
-    }
-
-    mount_proc(&sandbox_root.join("proc")).map_err(ExecutorError::Rootfs)?;
-    mount_minimal_dev(&sandbox_root.join("dev")).map_err(ExecutorError::Rootfs)?;
-
-    for file in &plan.user_files {
-        let target_path = if file.path.starts_with('/') {
-            file.path[1..].to_string()
-        } else {
-            format!("work/{}", file.path)
-        };
-        workspace
-            .write_file(&target_path, &file.content, file.executable)
-            .map_err(ExecutorError::Workspace)?;
-    }
-
-    set_hostname("sandbox").map_err(ExecutorError::Rootfs)?;
-    pivot_root_and_cleanup(sandbox_root).map_err(ExecutorError::Rootfs)
-}
-
 fn setup_stdio(workspace: &Workspace) -> Result<(), ExecutorError> {
     let stdin_fd = workspace.pipes.stdin.read.as_raw_fd();
     let stdout_fd = workspace.pipes.stdout.write.as_raw_fd();
@@ -827,6 +849,33 @@ fn setup_stdio(workspace: &Workspace) -> Result<(), ExecutorError> {
     Ok(())
 }
 
+/// Install the seccomp whitelist filter for the sandboxed child process.
+///
+/// Whitelist = `DEFAULT_WHITELIST` minus `plan.syscalls.denied`, plus `plan.syscalls.allowed`.
+fn apply_seccomp(plan: &Plan) -> Result<(), ExecutorError> {
+    let mut whitelist: Vec<i64> = DEFAULT_WHITELIST.to_vec();
+    if let Some(ref syscalls) = plan.syscalls {
+        whitelist.retain(|s| !syscalls.denied.contains(s));
+        for s in &syscalls.allowed {
+            if !whitelist.contains(s) {
+                whitelist.push(*s);
+            }
+        }
+    }
+
+    let filter = build_whitelist_filter(&whitelist);
+    // `SockFprog::len` is 16-bit: reject an oversized program instead of truncating it.
+    let len = u16::try_from(filter.len())
+        .map_err(|_| ExecutorError::Lockdown(LockdownError::Seccomp(Errno::INVAL)))?;
+    let fprog = SockFprog {
+        len,
+        filter: filter.as_ptr(),
+    };
+    unsafe { seccomp_set_mode_filter(&fprog) }
+        .map_err(|e| ExecutorError::Lockdown(LockdownError::Seccomp(e)))?;
+    Ok(())
+}
+
 fn exec_command(plan: &Plan, exec_info: &ExecutionInfo) -> Result<(), ExecutorError> {
     let cmd_path = CString::new(exec_info.binary_path.to_string_lossy().as_bytes())
         .map_err(|_| ExecutorError::Exec(Errno::INVAL))?;
@@ -861,36 +910,6 @@ fn exec_command(plan: &Plan, exec_info: &ExecutionInfo) -> Result<(), ExecutorEr
     Err(ExecutorError::Exec(last_errno()))
 }
 
-fn create_mount_dirs(
-    workspace: &Workspace,
-    exec_info: &ExecutionInfo,
-    plan: &Plan,
-) -> Result<(), ExecutorError> {
-    for mount in &exec_info.extra_mounts {
-        create_mount_dir(workspace, &mount.target)?;
-    }
-    for mount in &plan.mounts {
-        create_mount_dir(workspace, &mount.target)?;
-    }
-    Ok(())
-}
-
-fn create_mount_dir(workspace: &Workspace, target: &Path) -> Result<(), ExecutorError> {
-    if let Some(parent) = target.parent() {
-        if parent != Path::new("/") {
-            let target_dir = workspace
-                .root()
-                .join(parent.strip_prefix("/").unwrap_or(parent));
-            std::fs::create_dir_all(&target_dir).map_err(ExecutorError::Workspace)?;
-        }
-    }
-    let mount_point = workspace
-        .root()
-        .join(target.strip_prefix("/").unwrap_or(target));
-    std::fs::create_dir_all(&mount_point).map_err(ExecutorError::Workspace)?;
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/crates/evalbox-sandbox/src/isolation/lockdown.rs b/crates/evalbox-sandbox/src/isolation/lockdown.rs
index 227725a..3c0006e 100644
--- a/crates/evalbox-sandbox/src/isolation/lockdown.rs
+++ b/crates/evalbox-sandbox/src/isolation/lockdown.rs
@@ -1,41 +1,43 @@
 //! Security lockdown for sandboxed processes.
 //!
-//! Applies all security restrictions to the child process after `pivot_root`.
+//! Applies all security restrictions to the child process.
 //! The order of operations is critical for security:
 //!
-//! 1. **Landlock** - Filesystem and network access control (ABI 4+)
-//! 2. **Seccomp** - Syscall whitelist filter (BPF)
-//! 3. **Rlimits** - Resource limits (memory, CPU, files, processes)
-//! 4. **Capabilities** - Drop all capabilities, set `NO_NEW_PRIVS`
-//! 5. **Close FDs** - Close all file descriptors except stdin/stdout/stderr
+//! 0. **`NO_NEW_PRIVS`** - Required before Landlock and seccomp
+//! 1. **Landlock v5** - Filesystem, network, signal, and IPC access control
+//! 2. **Rlimits** - Resource limits (memory, CPU, files, processes)
+//! 3. **Securebits** - Lock capability state permanently
+//! 4. **Capabilities** - Drop all capabilities
+//!
+//! Note: Seccomp filters and fd closing are handled separately in `child_process()`
+//! because the notify filter must return a listener fd that gets sent to the parent.
 //!
 //! After lockdown, the process cannot:
 //! - Access files outside allowed paths
-//! - Make network connections (if landlock ABI >= 4)
-//! - Call restricted syscalls (ptrace, mount, reboot, etc.)
+//! - Make network connections (if network blocked, requires Landlock ABI 4+)
+//! - Send signals to processes outside the sandbox (Landlock ABI 6+)
+//! - Connect to abstract unix sockets outside the sandbox (Landlock ABI 6+)
 //! - Exceed resource limits
 //! - Gain new privileges
 
 use std::ffi::CString;
-use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
+use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};
 use std::os::unix::ffi::OsStrExt;
 use std::path::Path;
 
 use evalbox_sys::landlock::{
-    self, LANDLOCK_ACCESS_FS_EXECUTE, LANDLOCK_ACCESS_FS_MAKE_DIR, LANDLOCK_ACCESS_FS_MAKE_REG,
-    LANDLOCK_ACCESS_FS_READ_DIR, LANDLOCK_ACCESS_FS_READ_FILE, LANDLOCK_ACCESS_FS_REMOVE_DIR,
-    LANDLOCK_ACCESS_FS_REMOVE_FILE, LANDLOCK_ACCESS_FS_TRUNCATE, LANDLOCK_ACCESS_FS_WRITE_FILE,
-    LandlockPathBeneathAttr, LandlockRulesetAttr, fs_access_for_abi, landlock_add_rule_path,
-    landlock_create_ruleset, landlock_restrict_self, net_access_for_abi,
+    self, LANDLOCK_ACCESS_FS_EXECUTE, LANDLOCK_ACCESS_FS_MAKE_DIR, LANDLOCK_ACCESS_FS_MAKE_FIFO,
+    LANDLOCK_ACCESS_FS_MAKE_REG, LANDLOCK_ACCESS_FS_MAKE_SYM, LANDLOCK_ACCESS_FS_READ_DIR,
+    LANDLOCK_ACCESS_FS_READ_FILE, LANDLOCK_ACCESS_FS_REMOVE_DIR, LANDLOCK_ACCESS_FS_REMOVE_FILE,
+    LANDLOCK_ACCESS_FS_TRUNCATE, LANDLOCK_ACCESS_FS_WRITE_FILE, LandlockPathBeneathAttr,
+    LandlockRulesetAttr, fs_access_for_abi, landlock_add_rule_path, landlock_create_ruleset,
+    landlock_restrict_self, net_access_for_abi, scope_for_abi,
 };
 use evalbox_sys::last_errno;
-use evalbox_sys::seccomp::{
-    DEFAULT_WHITELIST, SockFprog, build_whitelist_filter, seccomp_set_mode_filter,
-};
 use rustix::io::Errno;
 use thiserror::Error;
 
-use super::rootfs::apply_rlimits;
+use super::rlimits::apply_rlimits;
 use crate::plan::Plan;
 
 /// Error during security lockdown.
@@ -53,26 +55,34 @@ pub enum LockdownError {
     #[error("capability: {0}")]
     Capability(Errno),
 
-    #[error("close fds: {0}")]
-    CloseFds(Errno),
+    #[error("securebits: {0}")]
+    Securebits(Errno),
 }
 
+/// Apply security lockdown to the current process.
+///
+/// Runs, in order: `NO_NEW_PRIVS`, Landlock, rlimits, securebits, capability drop.
+///
+/// `workspace_root` is the real absolute path to the workspace directory (no
+/// `pivot_root`, so real paths are used); `extra_readonly_paths` are additional
+/// paths that should be readable (e.g., resolved binary mount paths).
 pub fn lockdown(
     plan: &Plan,
-    workspace_path: Option<&Path>,
+    workspace_root: &Path,
     extra_readonly_paths: &[&str],
 ) -> Result<(), LockdownError> {
-    apply_landlock(plan, workspace_path, extra_readonly_paths)?;
-    apply_seccomp()?;
+    // NO_NEW_PRIVS must be set before landlock_restrict_self and seccomp.
+    set_no_new_privs()?;
+    apply_landlock_v5(plan, workspace_root, extra_readonly_paths)?;
     apply_rlimits(plan).map_err(LockdownError::Rlimit)?;
+    apply_securebits()?;
     drop_all_caps()?;
-    close_extra_fds()?;
     Ok(())
 }
 
-fn apply_landlock(
+fn apply_landlock_v5(
     plan: &Plan,
-    workspace_path: Option<&Path>,
+    workspace_root: &Path,
     extra_readonly_paths: &[&str],
 ) -> Result<(), LockdownError> {
     let abi = match landlock::landlock_abi_version() {
@@ -80,16 +90,22 @@ fn apply_landlock(
         Err(_) => return Ok(()), // Landlock not available
     };
 
+    if abi < 5 {
+        eprintln!("warning: landlock ABI {abi} < 5, signal/IPC scoping unavailable");
+    }
+
     let fs_access = fs_access_for_abi(abi);
     let net_access = if plan.network_blocked && abi >= 4 {
         net_access_for_abi(abi)
     } else {
         0
     };
+    let scoped = scope_for_abi(abi);
 
     let attr = LandlockRulesetAttr {
         handled_access_fs: fs_access,
         handled_access_net: net_access,
+        scoped,
     };
     let ruleset_fd = landlock_create_ruleset(&attr).map_err(LockdownError::Landlock)?;
 
@@ -99,11 +115,13 @@ fn apply_landlock(
         | LANDLOCK_ACCESS_FS_WRITE_FILE
         | LANDLOCK_ACCESS_FS_MAKE_REG
         | LANDLOCK_ACCESS_FS_MAKE_DIR
+        | LANDLOCK_ACCESS_FS_MAKE_SYM
+        | LANDLOCK_ACCESS_FS_MAKE_FIFO
         | LANDLOCK_ACCESS_FS_REMOVE_FILE
         | LANDLOCK_ACCESS_FS_REMOVE_DIR
         | LANDLOCK_ACCESS_FS_TRUNCATE;
 
-    // Read-only paths from plan.mounts (pre-computed by evalbox, includes system paths)
+    // Read-only mounts from plan (system paths computed by evalbox or user-specified)
     for mount in &plan.mounts {
         if !mount.writable {
             let access = if mount.executable {
@@ -111,32 +129,45 @@ fn apply_landlock(
             } else {
                 read_access & !LANDLOCK_ACCESS_FS_EXECUTE
             };
-            add_path_rule(&ruleset_fd, &mount.target, access);
+            add_path_rule(&ruleset_fd, &mount.source, access);
         }
     }
 
+    // Extra readonly paths (resolved binary mounts)
     for path in extra_readonly_paths {
         add_path_rule(&ruleset_fd, path, read_access);
     }
 
-    // Pre-pivot_root workspace path
-    if let Some(ws_path) = workspace_path {
-        add_path_rule(&ruleset_fd, ws_path, write_access);
+    // Writable workspace paths (real absolute paths, no pivot_root)
+    add_path_rule(&ruleset_fd, workspace_root.join("work"), write_access);
+    add_path_rule(&ruleset_fd, workspace_root.join("tmp"), write_access);
+    add_path_rule(&ruleset_fd, workspace_root.join("home"), write_access);
+
+    // System paths (read-only with execute)
+    for path in ["/usr", "/bin", "/lib", "/lib64", "/etc"] {
+        add_path_rule(&ruleset_fd, path, read_access);
     }
 
-    // Writable paths
-    for path in ["/work", "/tmp", "/home"] {
-        add_path_rule(&ruleset_fd, path, write_access);
+    // NixOS store
+    if Path::new("/nix/store").exists() {
+        add_path_rule(&ruleset_fd, "/nix/store", read_access);
+    }
+    if Path::new("/run/current-system").exists() {
+        add_path_rule(&ruleset_fd, "/run/current-system", read_access);
     }
 
     // Proc (read-only)
-    add_path_rule(&ruleset_fd, "/proc", read_access);
+    add_path_rule(
+        &ruleset_fd,
+        "/proc",
+        read_access & !LANDLOCK_ACCESS_FS_EXECUTE,
+    );
 
     // Dev (read + write for /dev/null etc.)
     add_path_rule(
         &ruleset_fd,
         "/dev",
-        read_access | LANDLOCK_ACCESS_FS_WRITE_FILE,
+        (read_access & !LANDLOCK_ACCESS_FS_EXECUTE) | LANDLOCK_ACCESS_FS_WRITE_FILE,
     );
 
     landlock_restrict_self(&ruleset_fd).map_err(LockdownError::Landlock)
@@ -144,9 +175,9 @@ fn apply_landlock(
 
 /// Add a path rule to the Landlock ruleset.
 ///
-/// Errors are logged to stderr but not propagated - the path simply won't be
-/// accessible in the sandbox. This is intentional: missing paths (like /nix/store
-/// on non-NixOS) should not prevent sandbox creation.
+/// Errors are logged but not propagated - the path simply won't be
+/// accessible in the sandbox. Missing paths (like /nix/store on non-NixOS)
+/// should not prevent sandbox creation.
 fn add_path_rule(ruleset_fd: &OwnedFd, path: impl AsRef<Path>, access: u64) {
     let path = path.as_ref();
     let fd = match open_path(path) {
@@ -159,7 +190,6 @@ fn add_path_rule(ruleset_fd: &OwnedFd, path: impl AsRef<Path>, access: u64) {
         parent_fd: fd.as_raw_fd(),
     };
     if let Err(e) = landlock_add_rule_path(ruleset_fd, &rule) {
-        // Log but don't fail - path won't be accessible in sandbox
         eprintln!("warning: landlock rule for {path:?} failed: {e}");
     }
 }
@@ -175,13 +205,45 @@ fn open_path(path: impl AsRef<Path>) -> Result<OwnedFd, Errno> {
     }
 }
 
-fn apply_seccomp() -> Result<(), LockdownError> {
-    let filter = build_whitelist_filter(DEFAULT_WHITELIST);
-    let fprog = SockFprog {
-        len: filter.len() as u16,
-        filter: filter.as_ptr(),
-    };
-    unsafe { seccomp_set_mode_filter(&fprog) }.map_err(LockdownError::Seccomp)
+// Securebits flag bits OR-ed together for `prctl(PR_SET_SECUREBITS)` (from <linux/securebits.h>)
+const SECBIT_NOROOT: u64 = 1 << 0;
+const SECBIT_NOROOT_LOCKED: u64 = 1 << 1;
+const SECBIT_NO_SETUID_FIXUP: u64 = 1 << 2;
+const SECBIT_NO_SETUID_FIXUP_LOCKED: u64 = 1 << 3;
+const SECBIT_KEEP_CAPS_LOCKED: u64 = 1 << 5; // bit 4 (KEEP_CAPS?) unused — confirm
+const SECBIT_NO_CAP_AMBIENT_RAISE: u64 = 1 << 6;
+const SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED: u64 = 1 << 7;
+
+/// Set the securebits flags (and their lock bits) to pin the capability state.
+///
+/// Best-effort: if `prctl(PR_SET_SECUREBITS)` fails, a warning is printed to
+/// stderr and `Ok(())` is still returned.
+fn apply_securebits() -> Result<(), LockdownError> {
+    // Each flag is OR-ed with its `_LOCKED` twin; KEEP_CAPS contributes only its lock bit.
+    let noroot = SECBIT_NOROOT | SECBIT_NOROOT_LOCKED;
+    let no_setuid_fixup = SECBIT_NO_SETUID_FIXUP | SECBIT_NO_SETUID_FIXUP_LOCKED;
+    let no_ambient_raise = SECBIT_NO_CAP_AMBIENT_RAISE | SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED;
+    let mask = noroot | no_setuid_fixup | SECBIT_KEEP_CAPS_LOCKED | no_ambient_raise;
+
+    let rc = unsafe { libc::prctl(libc::PR_SET_SECUREBITS, mask, 0, 0, 0) };
+    if rc == 0 {
+        return Ok(());
+    }
+
+    // Not fatal — securebits may require capabilities we don't have.
+    // The important thing is NO_NEW_PRIVS + dropping all caps.
+    eprintln!("warning: PR_SET_SECUREBITS failed: {}", last_errno());
+    Ok(())
+}
+
+/// Enable `PR_SET_NO_NEW_PRIVS`; must precede `landlock_restrict_self` and seccomp.
+///
+/// A non-zero `prctl` result is reported as `LockdownError::Capability`.
+fn set_no_new_privs() -> Result<(), LockdownError> {
+    match unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } {
+        0 => Ok(()),
+        _ => Err(LockdownError::Capability(last_errno())),
+    }
 }
 
 fn drop_all_caps() -> Result<(), LockdownError> {
@@ -197,33 +259,18 @@ fn drop_all_caps() -> Result<(), LockdownError> {
             libc::prctl(libc::PR_CAPBSET_DROP, cap, 0, 0, 0);
         }
     }
-
-    let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
-    if ret != 0 {
-        Err(LockdownError::Capability(last_errno()))
-    } else {
-        Ok(())
-    }
+    Ok(())
 }
 
-fn close_extra_fds() -> Result<(), LockdownError> {
-    let mut fds_to_close = Vec::new();
-
-    if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
-        for entry in entries.flatten() {
-            if let Ok(fd) = entry.file_name().to_string_lossy().parse::<RawFd>() {
-                if fd > 2 {
-                    fds_to_close.push(fd);
-                }
-            }
-        }
-    }
-
-    for fd in fds_to_close {
-        unsafe { libc::close(fd) };
+/// Close all file descriptors > 2 using the `close_range` syscall.
+///
+/// Called separately from `lockdown` because it must happen after seccomp filter
+/// installation and listener fd transfer. Failure is only reported on stderr.
+pub fn close_extra_fds() {
+    // close_range(3, MAX, 0) — close all fds from 3 to MAX
+    if unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, 0u32) } != 0 {
+        eprintln!("warning: close_range failed: {}", last_errno());
     }
-
-    Ok(())
 }
 
 #[cfg(test)]
diff --git a/crates/evalbox-sandbox/src/isolation/mod.rs b/crates/evalbox-sandbox/src/isolation/mod.rs
index 4b1c334..4e69f46 100644
--- a/crates/evalbox-sandbox/src/isolation/mod.rs
+++ b/crates/evalbox-sandbox/src/isolation/mod.rs
@@ -2,16 +2,10 @@
 //!
 //! This module contains all the security isolation layers:
 //!
-//! - **namespace** - User namespace and ID mapping setup
-//! - **rootfs** - Filesystem setup (bind mounts, `pivot_root`, rlimits)
-//! - **lockdown** - Security restrictions (Landlock, seccomp, capabilities)
+//! - **lockdown** - Security restrictions (`NO_NEW_PRIVS`, Landlock v5, securebits, capabilities); seccomp is installed separately in `child_process()`
+//! - **rlimits** - Resource limits (memory, CPU, files, processes)
 
 mod lockdown;
-mod namespace;
-mod rootfs;
+pub mod rlimits;
 
-pub use lockdown::{LockdownError, lockdown};
-pub use namespace::setup_id_maps;
-pub use rootfs::{
-    bind_mount, make_rprivate, mount_minimal_dev, mount_proc, pivot_root_and_cleanup, set_hostname,
-};
+pub use lockdown::{LockdownError, close_extra_fds, lockdown};
diff --git a/crates/evalbox-sandbox/src/isolation/namespace.rs b/crates/evalbox-sandbox/src/isolation/namespace.rs
deleted file mode 100644
index 23ad83b..0000000
--- a/crates/evalbox-sandbox/src/isolation/namespace.rs
+++ /dev/null
@@ -1,83 +0,0 @@
-//! User namespace and ID mapping setup.
-//!
-//! Sets up UID/GID mappings so the sandboxed process runs as root (UID 0)
-//! inside the namespace, but maps to the real user outside.
-//!
-//! ## How It Works
-//!
-//! ```text
-//! Outside namespace:  uid=1000 (real user)
-//!                          │
-//!                    ┌─────▼─────┐
-//!                    │  uid_map  │  "0 1000 1"
-//!                    └─────┬─────┘
-//!                          │
-//! Inside namespace:   uid=0 (appears as root)
-//! ```
-//!
-//! ## Security
-//!
-//! - `deny_setgroups` must be called BEFORE writing `gid_map` (kernel requirement)
-//! - The process appears as root inside but has no real privileges
-//! - This enables `pivot_root` and mount operations inside the namespace
-
-use std::fs;
-use std::io;
-
-/// Write UID mapping for a process in a user namespace.
-///
-/// Maps `inside_uid` (seen inside namespace) to `outside_uid` (real UID).
-/// The "1" at the end means we map exactly one UID.
-pub fn write_uid_map(pid: libc::pid_t, inside_uid: u32, outside_uid: u32) -> io::Result<()> {
-    fs::write(
-        format!("/proc/{pid}/uid_map"),
-        format!("{inside_uid} {outside_uid} 1\n"),
-    )
-}
-
-/// Write GID mapping for a process in a user namespace.
-///
-/// Maps `inside_gid` (seen inside namespace) to `outside_gid` (real GID).
-pub fn write_gid_map(pid: libc::pid_t, inside_gid: u32, outside_gid: u32) -> io::Result<()> {
-    fs::write(
-        format!("/proc/{pid}/gid_map"),
-        format!("{inside_gid} {outside_gid} 1\n"),
-    )
-}
-
-/// Deny setgroups syscall for a process.
-///
-/// # Safety Order
-///
-/// MUST be called before `write_gid_map`. The kernel requires this to prevent
-/// privilege escalation via group manipulation.
-pub fn deny_setgroups(pid: libc::pid_t) -> io::Result<()> {
-    fs::write(format!("/proc/{pid}/setgroups"), "deny\n")
-}
-
-/// Set up complete ID mappings for a child process.
-///
-/// Maps UID 0 and GID 0 inside the namespace to the current user's
-/// real UID/GID outside. This allows the sandboxed process to appear
-/// as root while having no actual privileges.
-pub fn setup_id_maps(child_pid: libc::pid_t) -> io::Result<()> {
-    // SAFETY: getuid/getgid are always safe to call
-    let uid = unsafe { libc::getuid() };
-    let gid = unsafe { libc::getgid() };
-
-    // SAFETY: deny_setgroups MUST come before write_gid_map
-    deny_setgroups(child_pid)?;
-    write_uid_map(child_pid, 0, uid)?;
-    write_gid_map(child_pid, 0, gid)
-}
-
-#[cfg(test)]
-mod tests {
-    #[test]
-    fn current_uid_gid() {
-        // SAFETY: getuid/getgid are always safe
-        let uid = unsafe { libc::getuid() };
-        let gid = unsafe { libc::getgid() };
-        assert!(uid > 0 || gid > 0);
-    }
-}
diff --git a/crates/evalbox-sandbox/src/isolation/rlimits.rs b/crates/evalbox-sandbox/src/isolation/rlimits.rs
new file mode 100644
index 0000000..67d97b8
--- /dev/null
+++ b/crates/evalbox-sandbox/src/isolation/rlimits.rs
@@ -0,0 +1,71 @@
+//! Resource limits for sandboxed processes.
+//!
+//! Sets kernel-enforced resource limits to prevent denial-of-service.
+//!
+//! ## Limits Applied
+//!
+//! | Limit | Purpose | Default |
+//! |-------|---------|---------|
+//! | `RLIMIT_DATA` | Memory usage | 256 MiB |
+//! | `RLIMIT_CPU` | CPU time | timeout * 2 + 60s |
+//! | `RLIMIT_FSIZE` | Output file size | 16 MiB |
+//! | `RLIMIT_NOFILE` | Open file descriptors | 256 |
+//! | `RLIMIT_NPROC` | Max processes | 64 |
+//! | `RLIMIT_CORE` | Core dump size | 0 (disabled) |
+//! | `RLIMIT_STACK` | Stack size | 8 MiB |
+//!
+//! ## Note on `RLIMIT_AS`
+//!
+//! We intentionally do NOT set `RLIMIT_AS` (virtual address space).
+//! Modern runtimes like Go, Java, and V8 pre-allocate large virtual address
+//! ranges but only commit small portions. `RLIMIT_AS` would break these
+//! runtimes. `RLIMIT_DATA` limits actual memory and is more appropriate.
+
+use evalbox_sys::last_errno;
+use rustix::io::Errno;
+
+use crate::plan::Plan;
+
+/// Apply the plan's resource limits (CPU = 2 × timeout + 60 s); stops at the first failure.
+pub fn apply_rlimits(plan: &Plan) -> Result<(), Errno> {
+    let limits = [
+        (libc::RLIMIT_DATA, plan.memory_limit),
+        (libc::RLIMIT_CPU, plan.timeout.as_secs().saturating_mul(2).saturating_add(60)),
+        (libc::RLIMIT_FSIZE, plan.max_output),
+        (libc::RLIMIT_NOFILE, 256),
+        (libc::RLIMIT_NPROC, u64::from(plan.max_pids)),
+        (libc::RLIMIT_CORE, 0),
+        (libc::RLIMIT_STACK, 8 * 1024 * 1024),
+    ];
+    limits.into_iter().try_for_each(|(r, v)| set_rlimit(r, v))
+}
+
+#[inline] // Sets both the soft and the hard limit of `resource` to `limit`.
+fn set_rlimit(resource: libc::__rlimit_resource_t, limit: u64) -> Result<(), Errno> {
+    let both = libc::rlimit {
+        rlim_cur: limit,
+        rlim_max: limit,
+    };
+    // SAFETY: `both` is a fully initialised `rlimit` that outlives the call;
+    // `resource` is one of the libc `RLIMIT_*` constants.
+    let rc = unsafe { libc::setrlimit(resource, &both) };
+    match rc {
+        0 => Ok(()),
+        _ => Err(last_errno()),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    /// `getrlimit(RLIMIT_NOFILE)` succeeds and reports a positive soft limit.
+    #[test]
+    fn get_current_nofile() {
+        let mut current = libc::rlimit {
+            rlim_cur: 0,
+            rlim_max: 0,
+        };
+        // SAFETY: `current` is a valid, writable `rlimit`.
+        let rc = unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut current) };
+        assert_eq!(rc, 0);
+        assert!(current.rlim_cur > 0);
+    }
+}
diff --git a/crates/evalbox-sandbox/src/isolation/rootfs.rs b/crates/evalbox-sandbox/src/isolation/rootfs.rs
deleted file mode 100644
index 09892ba..0000000
--- a/crates/evalbox-sandbox/src/isolation/rootfs.rs
+++ /dev/null
@@ -1,267 +0,0 @@
-//! Rootfs setup and resource limits for sandboxed processes.
-//!
-//! This module handles:
-//! - Bind mounts for the sandbox filesystem
-//! - Pivot root to isolate the filesystem
-//! - Resource limits (rlimits)
-//!
-//! ## Filesystem Layout (after `pivot_root`)
-//!
-//! ```text
-//! /
-//! ├── bin/      → bind mount from /bin (read-only)
-//! ├── dev/      → bind mounts: null, zero, urandom, random + symlinks
-//! ├── etc/      → bind mount from /etc (read-only)
-//! ├── home/     → empty, writable
-//! ├── lib/      → bind mount from /lib (read-only)
-//! ├── lib64/    → bind mount from /lib64 if exists (read-only)
-//! ├── nix/      → bind mount from /nix/store on NixOS (read-only)
-//! ├── proc/     → bind mount from /proc (read-only)
-//! ├── tmp/      → empty, writable
-//! ├── usr/      → bind mount from /usr (read-only)
-//! └── work/     → user code directory, writable
-//! ```
-
-use std::ffi::CString;
-use std::os::unix::ffi::OsStrExt;
-use std::path::Path;
-
-use evalbox_sys::last_errno;
-use rustix::io::Errno;
-use rustix::process::pivot_root;
-
-use crate::plan::Plan;
-
-/// Make all mounts private recursively.
-pub fn make_rprivate() -> Result<(), Errno> {
-    let ret = unsafe {
-        libc::mount(
-            std::ptr::null(),
-            c"/".as_ptr(),
-            std::ptr::null(),
-            libc::MS_REC | libc::MS_PRIVATE,
-            std::ptr::null(),
-        )
-    };
-    if ret != 0 { Err(last_errno()) } else { Ok(()) }
-}
-
-/// Mount proc filesystem (bind-mounted read-only from host).
-pub fn mount_proc(target: &Path) -> Result<(), Errno> {
-    let target_c = path_to_cstring(target)?;
-
-    let ret = unsafe {
-        libc::mount(
-            c"/proc".as_ptr(),
-            target_c.as_ptr(),
-            std::ptr::null(),
-            libc::MS_BIND | libc::MS_REC,
-            std::ptr::null(),
-        )
-    };
-    if ret != 0 {
-        return Err(last_errno());
-    }
-
-    let ret = unsafe {
-        libc::mount(
-            std::ptr::null(),
-            target_c.as_ptr(),
-            std::ptr::null(),
-            libc::MS_BIND
-                | libc::MS_REMOUNT
-                | libc::MS_RDONLY
-                | libc::MS_NOSUID
-                | libc::MS_NODEV
-                | libc::MS_NOEXEC,
-            std::ptr::null(),
-        )
-    };
-    if ret != 0 { Err(last_errno()) } else { Ok(()) }
-}
-
-/// Create minimal /dev with null, zero, urandom (bind-mounted from host).
-pub fn mount_minimal_dev(target: &Path) -> Result<(), Errno> {
-    for dev in ["null", "zero", "urandom", "random"] {
-        bind_mount_dev(target, dev)?;
-    }
-
-    let fd_path = target.join("fd");
-    let fd_c = path_to_cstring(&fd_path)?;
-    if unsafe { libc::symlink(c"/proc/self/fd".as_ptr(), fd_c.as_ptr()) } != 0 {
-        return Err(last_errno());
-    }
-
-    for (name, num) in [("stdin", 0), ("stdout", 1), ("stderr", 2)] {
-        let link_path = target.join(name);
-        let link_c = path_to_cstring(&link_path)?;
-        let target_str = CString::new(format!("/proc/self/fd/{num}")).map_err(|_| Errno::INVAL)?;
-        if unsafe { libc::symlink(target_str.as_ptr(), link_c.as_ptr()) } != 0 {
-            return Err(last_errno());
-        }
-    }
-
-    Ok(())
-}
-
-fn bind_mount_dev(target_dev: &Path, name: &str) -> Result<(), Errno> {
-    let source = Path::new("/dev").join(name);
-    let target = target_dev.join(name);
-    let target_c = path_to_cstring(&target)?;
-    let source_c = path_to_cstring(&source)?;
-
-    let fd = unsafe { libc::open(target_c.as_ptr(), libc::O_CREAT | libc::O_WRONLY, 0o644) };
-    if fd < 0 {
-        return Err(last_errno());
-    }
-    unsafe { libc::close(fd) };
-
-    let ret = unsafe {
-        libc::mount(
-            source_c.as_ptr(),
-            target_c.as_ptr(),
-            std::ptr::null(),
-            libc::MS_BIND,
-            std::ptr::null(),
-        )
-    };
-    if ret != 0 { Err(last_errno()) } else { Ok(()) }
-}
-
-/// Bind mount a path.
-pub fn bind_mount(source: &Path, target: &Path, readonly: bool) -> Result<(), Errno> {
-    let source_c = path_to_cstring(source)?;
-    let target_c = path_to_cstring(target)?;
-
-    let ret = unsafe {
-        libc::mount(
-            source_c.as_ptr(),
-            target_c.as_ptr(),
-            std::ptr::null(),
-            libc::MS_BIND | libc::MS_REC,
-            std::ptr::null(),
-        )
-    };
-    if ret != 0 {
-        return Err(last_errno());
-    }
-
-    if readonly {
-        let ret = unsafe {
-            libc::mount(
-                std::ptr::null(),
-                target_c.as_ptr(),
-                std::ptr::null(),
-                libc::MS_BIND | libc::MS_REMOUNT | libc::MS_RDONLY,
-                std::ptr::null(),
-            )
-        };
-        if ret != 0 {
-            return Err(last_errno());
-        }
-    }
-
-    Ok(())
-}
-
-/// Perform `pivot_root` and clean up the old root.
-pub fn pivot_root_and_cleanup(new_root: &Path) -> Result<(), Errno> {
-    let new_root_c = path_to_cstring(new_root)?;
-
-    let ret = unsafe {
-        libc::mount(
-            new_root_c.as_ptr(),
-            new_root_c.as_ptr(),
-            std::ptr::null(),
-            libc::MS_BIND | libc::MS_REC,
-            std::ptr::null(),
-        )
-    };
-    if ret != 0 {
-        return Err(last_errno());
-    }
-
-    let old_root = new_root.join(".old_root");
-    let old_root_c = path_to_cstring(&old_root)?;
-    unsafe { libc::mkdir(old_root_c.as_ptr(), 0o700) };
-
-    let new_root_cstr = CString::new(new_root_c.as_bytes()).map_err(|_| Errno::INVAL)?;
-    let old_root_cstr = CString::new(old_root_c.as_bytes()).map_err(|_| Errno::INVAL)?;
-    pivot_root(new_root_cstr.as_c_str(), old_root_cstr.as_c_str())?;
-
-    unsafe {
-        libc::chdir(c"/".as_ptr());
-        libc::umount2(c"/.old_root".as_ptr(), libc::MNT_DETACH);
-        libc::rmdir(c"/.old_root".as_ptr());
-    }
-
-    Ok(())
-}
-
-/// Set the hostname.
-pub fn set_hostname(name: &str) -> Result<(), Errno> {
-    let ret = unsafe { libc::sethostname(name.as_ptr().cast::<libc::c_char>(), name.len()) };
-    if ret != 0 { Err(last_errno()) } else { Ok(()) }
-}
-
-#[inline]
-fn path_to_cstring(path: &Path) -> Result<CString, Errno> {
-    CString::new(path.as_os_str().as_bytes()).map_err(|_| Errno::INVAL)
-}
-
-/// Apply resource limits based on the sandbox plan.
-pub fn apply_rlimits(plan: &Plan) -> Result<(), Errno> {
-    let cpu_secs = plan.timeout.as_secs().saturating_mul(2).saturating_add(60);
-
-    // Note: We intentionally do NOT set RLIMIT_AS (address space).
-    // RLIMIT_AS limits virtual memory, which can be much larger than actual usage.
-    // Modern runtimes like Go, Java, and V8 pre-allocate large virtual address ranges
-    // but only commit (use) small portions. RLIMIT_AS would break these runtimes.
-    // RLIMIT_DATA limits the data segment and is more appropriate for real memory control.
-    set_rlimit(libc::RLIMIT_DATA, plan.memory_limit)?;
-    set_rlimit(libc::RLIMIT_CPU, cpu_secs)?;
-    set_rlimit(libc::RLIMIT_FSIZE, plan.max_output)?;
-    set_rlimit(libc::RLIMIT_NOFILE, 256)?;
-    set_rlimit(libc::RLIMIT_NPROC, u64::from(plan.max_pids))?;
-    set_rlimit(libc::RLIMIT_CORE, 0)?;
-    set_rlimit(libc::RLIMIT_STACK, 8 * 1024 * 1024)?;
-    Ok(())
-}
-
-#[inline]
-fn set_rlimit(resource: libc::__rlimit_resource_t, limit: u64) -> Result<(), Errno> {
-    let rlim = libc::rlimit {
-        rlim_cur: limit,
-        rlim_max: limit,
-    };
-    // SAFETY: rlim is valid, resource is a valid constant.
-    if unsafe { libc::setrlimit(resource, &rlim) } != 0 {
-        Err(last_errno())
-    } else {
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn path_to_cstring_valid() {
-        let cstr = path_to_cstring(Path::new("/tmp/test")).unwrap();
-        assert_eq!(cstr.as_bytes(), b"/tmp/test");
-    }
-
-    #[test]
-    fn get_current_nofile() {
-        let mut rlim = libc::rlimit {
-            rlim_cur: 0,
-            rlim_max: 0,
-        };
-        assert_eq!(
-            unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) },
-            0
-        );
-        assert!(rlim.rlim_cur > 0);
-    }
-}
diff --git a/crates/evalbox-sandbox/src/lib.rs b/crates/evalbox-sandbox/src/lib.rs
index bfd4bfb..006b14a 100644
--- a/crates/evalbox-sandbox/src/lib.rs
+++ b/crates/evalbox-sandbox/src/lib.rs
@@ -3,12 +3,13 @@
 //! This crate provides secure sandboxed execution of untrusted code on Linux.
 //! It combines multiple isolation mechanisms for defense in depth:
 //!
-//! - **User namespaces** - Unprivileged containers, UID 0 inside = real user outside
-//! - **Mount namespaces** - Private filesystem view with minimal bind mounts
-//! - **Pivot root** - Change root directory, unmount host filesystem
-//! - **Landlock** - Filesystem and network access control (kernel 5.13+)
+//! - **Landlock v5** - Filesystem, network, signal, and IPC access control
 //! - **Seccomp-BPF** - Syscall whitelist (~40 allowed syscalls)
+//! - **Seccomp User Notify** - Optional syscall interception for FS virtualization
 //! - **Rlimits** - Resource limits (memory, CPU, files, processes)
+//! - **Capabilities** - All capabilities dropped, `NO_NEW_PRIVS` enforced
+//!
+//! No user namespaces required — works inside Docker with default seccomp profile.
 //!
 //! ## Quick Start
 //!
@@ -22,8 +23,7 @@
 //!
 //! ## Requirements
 //!
-//! - Linux kernel 5.13+ (for Landlock ABI 1+)
-//! - User namespaces enabled (`/proc/sys/kernel/unprivileged_userns_clone = 1`)
+//! - Linux kernel 6.12+ (for Landlock ABI 6, which adds signal and abstract-socket scoping)
 //! - Seccomp enabled in kernel
 
 #![allow(clippy::cast_possible_truncation)]
@@ -32,6 +32,7 @@
 pub mod executor;
 pub mod isolation;
 pub mod monitor;
+pub mod notify;
 pub mod plan;
 pub mod resolve;
 pub mod sysinfo;
@@ -40,10 +41,5 @@ pub mod workspace;
 
 pub use executor::{Event, Executor, ExecutorError, SandboxId};
 pub use monitor::{Output, Status};
-pub use plan::{Landlock, Mount, Plan, Syscalls, UserFile};
+pub use plan::{Landlock, Mount, NotifyMode, Plan, Syscalls, UserFile};
 pub use resolve::{ResolveError, ResolvedBinary, resolve_binary};
-
-// Backwards compatibility
-#[allow(deprecated)]
-#[doc(hidden)]
-pub use plan::SandboxPlan;
diff --git a/crates/evalbox-sandbox/src/notify/mod.rs b/crates/evalbox-sandbox/src/notify/mod.rs
new file mode 100644
index 0000000..d4912f2
--- /dev/null
+++ b/crates/evalbox-sandbox/src/notify/mod.rs
@@ -0,0 +1,17 @@
+//! Seccomp user notification support.
+//!
+//! This module provides the supervisor side of seccomp user notification,
+//! enabling syscall interception without Linux user namespaces.
+//!
+//! ## Modules
+//!
+//! - **supervisor** - Main notification loop that handles intercepted syscalls
+//! - **`virtual_fs`** - Path translation for filesystem virtualization
+//! - **`scm_rights`** - Unix socket fd passing (child → parent listener fd transfer)
+
+pub mod scm_rights;
+pub mod supervisor;
+pub mod virtual_fs;
+
+pub use supervisor::{NotifyEvent, Supervisor};
+pub use virtual_fs::VirtualFs;
diff --git a/crates/evalbox-sandbox/src/notify/scm_rights.rs b/crates/evalbox-sandbox/src/notify/scm_rights.rs
new file mode 100644
index 0000000..3acf6c4
--- /dev/null
+++ b/crates/evalbox-sandbox/src/notify/scm_rights.rs
@@ -0,0 +1,158 @@
+//! Unix socket fd passing via `SCM_RIGHTS`.
+//!
+//! After the child installs its seccomp notify filter, it receives a listener fd.
+//! This fd must be passed to the parent process so the parent can handle
+//! notifications. We use `SCM_RIGHTS` over an `AF_UNIX` socketpair to transfer
+//! the fd across the fork boundary.
+
+use std::io;
+use std::os::fd::{FromRawFd, OwnedFd, RawFd};
+
+/// Create an `AF_UNIX SOCK_STREAM` socketpair.
+///
+/// Returns `(parent_sock, child_sock)`. After fork, parent closes `child_sock`
+/// and child closes `parent_sock`.
+pub fn create_socketpair() -> io::Result<(OwnedFd, OwnedFd)> {
+    let mut fds = [0i32; 2];
+    let ret = unsafe {
+        libc::socketpair(
+            libc::AF_UNIX,
+            libc::SOCK_STREAM | libc::SOCK_CLOEXEC,
+            0,
+            fds.as_mut_ptr(),
+        )
+    };
+    if ret < 0 {
+        return Err(io::Error::last_os_error());
+    }
+    Ok(unsafe { (OwnedFd::from_raw_fd(fds[0]), OwnedFd::from_raw_fd(fds[1])) })
+}
+
+/// Send a file descriptor over a unix socket using `SCM_RIGHTS`.
+pub fn send_fd(socket: RawFd, fd: RawFd) -> io::Result<()> {
+    let data = [0u8; 1];
+    let iov = libc::iovec {
+        iov_base: data.as_ptr() as *mut libc::c_void,
+        iov_len: 1,
+    };
+
+    // cmsg buffer: header + one fd; u64 elements keep it aligned for `cmsghdr`
+    let cmsg_space = unsafe { libc::CMSG_SPACE(size_of::<RawFd>() as u32) } as usize;
+    let mut cmsg_buf = vec![0u64; cmsg_space.div_ceil(size_of::<u64>())];
+
+    let mut msg: libc::msghdr = unsafe { std::mem::zeroed() };
+    msg.msg_iov = &iov as *const _ as *mut _;
+    msg.msg_iovlen = 1;
+    msg.msg_control = cmsg_buf.as_mut_ptr().cast();
+    msg.msg_controllen = cmsg_space;
+
+    // Fill control message
+    let cmsg = unsafe { libc::CMSG_FIRSTHDR(&msg) };
+    if cmsg.is_null() {
+        return Err(io::Error::other("CMSG_FIRSTHDR null"));
+    }
+    unsafe {
+        (*cmsg).cmsg_level = libc::SOL_SOCKET;
+        (*cmsg).cmsg_type = libc::SCM_RIGHTS;
+        (*cmsg).cmsg_len = libc::CMSG_LEN(size_of::<RawFd>() as u32) as usize;
+        let data_ptr = libc::CMSG_DATA(cmsg);
+        std::ptr::copy_nonoverlapping(
+            (&fd as *const RawFd).cast::<u8>(),
+            data_ptr,
+            size_of::<RawFd>(),
+        );
+    }
+
+    // MSG_NOSIGNAL: a closed peer is reported as EPIPE, never as SIGPIPE.
+    let ret = unsafe { libc::sendmsg(socket, &msg, libc::MSG_NOSIGNAL) };
+    if ret < 0 {
+        return Err(io::Error::last_os_error());
+    }
+    Ok(())
+}
+
+/// Receive a file descriptor from a unix socket using `SCM_RIGHTS`.
+///
+/// The returned fd is close-on-exec. Fails with `InvalidData` if the message
+/// does not carry a complete `SCM_RIGHTS` payload.
+pub fn recv_fd(socket: RawFd) -> io::Result<OwnedFd> {
+    let invalid = |what: &'static str| io::Error::new(io::ErrorKind::InvalidData, what);
+    let mut data = [0u8; 1];
+    let mut iov = libc::iovec {
+        iov_base: data.as_mut_ptr().cast(),
+        iov_len: 1,
+    };
+
+    let cmsg_space = unsafe { libc::CMSG_SPACE(size_of::<RawFd>() as u32) } as usize;
+    let mut cmsg_buf = vec![0u64; cmsg_space.div_ceil(8)]; // u64: aligned for cmsghdr
+
+    let mut msg: libc::msghdr = unsafe { std::mem::zeroed() };
+    msg.msg_iov = &mut iov;
+    msg.msg_iovlen = 1;
+    msg.msg_control = cmsg_buf.as_mut_ptr().cast();
+    msg.msg_controllen = cmsg_space;
+
+    // MSG_CMSG_CLOEXEC sets close-on-exec atomically, so the received fd cannot
+    // leak into a program exec'd before we get a chance to set the flag.
+    let ret = unsafe { libc::recvmsg(socket, &mut msg, libc::MSG_CMSG_CLOEXEC) };
+    if ret < 0 {
+        return Err(io::Error::last_os_error());
+    }
+
+    let cmsg = unsafe { libc::CMSG_FIRSTHDR(&msg) };
+    if cmsg.is_null() {
+        return Err(invalid("no control message received"));
+    }
+
+    unsafe {
+        if (*cmsg).cmsg_level != libc::SOL_SOCKET || (*cmsg).cmsg_type != libc::SCM_RIGHTS {
+            return Err(invalid("unexpected control message type"));
+        }
+        // A short cmsg leaves the zeroed buffer untouched; reading it would make
+        // us adopt (and later close) fd 0, so insist on a full fd-sized payload.
+        if (*cmsg).cmsg_len < libc::CMSG_LEN(size_of::<RawFd>() as u32) as usize {
+            return Err(invalid("truncated SCM_RIGHTS payload"));
+        }
+        // CMSG_DATA carries no alignment guarantee for the payload.
+        let fd = libc::CMSG_DATA(cmsg).cast::<RawFd>().read_unaligned();
+        Ok(OwnedFd::from_raw_fd(fd))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::os::fd::AsRawFd;
+
+    use super::*;
+
+    #[test]
+    fn socketpair_creation() {
+        let (a, b) = create_socketpair().unwrap();
+        assert!(a.as_raw_fd() >= 0);
+        assert!(b.as_raw_fd() >= 0);
+        assert_ne!(a.as_raw_fd(), b.as_raw_fd());
+    }
+
+    #[test]
+    fn send_recv_fd() {
+        let (parent, child) = create_socketpair().unwrap();
+
+        // Create a pipe and send its read end
+        let mut pipe_fds = [0i32; 2];
+        unsafe { libc::pipe(pipe_fds.as_mut_ptr()) };
+        let pipe_read = pipe_fds[0];
+        let pipe_write = pipe_fds[1];
+
+        send_fd(child.as_raw_fd(), pipe_read).unwrap();
+        let received = recv_fd(parent.as_raw_fd()).unwrap();
+
+        // The received fd should be valid and different from the original
+        assert!(received.as_raw_fd() >= 0);
+
+        // Clean up
+        unsafe {
+            libc::close(pipe_read);
+            libc::close(pipe_write);
+        }
+    }
+}
diff --git a/crates/evalbox-sandbox/src/notify/supervisor.rs b/crates/evalbox-sandbox/src/notify/supervisor.rs
new file mode 100644
index 0000000..cf458d6
--- /dev/null
+++ b/crates/evalbox-sandbox/src/notify/supervisor.rs
@@ -0,0 +1,274 @@
+//! Seccomp notification supervisor.
+//!
+//! Runs in the parent process, handling intercepted syscalls from the sandboxed child.
+//! The supervisor receives notifications via the seccomp listener fd and decides
+//! how to respond based on the configured [`NotifyMode`].
+//!
+//! ## Modes
+//!
+//! - **Monitor**: Log syscall and return `SECCOMP_USER_NOTIF_FLAG_CONTINUE`
+//! - **Virtualize**: Translate filesystem paths via [`VirtualFs`], inject fds via `ADDFD`
+
+use std::fs::File;
+use std::io::{self, Read, Seek, SeekFrom};
+use std::os::fd::{AsRawFd, OwnedFd, RawFd};
+
+use evalbox_sys::seccomp_notify::{
+    SECCOMP_ADDFD_FLAG_SEND, SECCOMP_USER_NOTIF_FLAG_CONTINUE, SeccompNotif, SeccompNotifAddfd,
+    SeccompNotifResp, notif_addfd, notif_id_valid, notif_recv, notif_send,
+};
+
+use super::virtual_fs::VirtualFs;
+use crate::plan::NotifyMode;
+
+/// Events emitted by the supervisor for future user-facing notifications.
+#[derive(Debug)]
+pub enum NotifyEvent {
+    /// A syscall was intercepted and handled.
+    SyscallHandled {
+        /// PID of the process that made the syscall.
+        pid: u32,
+        /// Syscall number.
+        syscall_nr: i32,
+        /// Whether the syscall was allowed.
+        allowed: bool,
+    },
+}
+
+/// Seccomp notification supervisor.
+pub struct Supervisor {
+    listener_fd: OwnedFd,
+    mode: NotifyMode,
+    vfs: VirtualFs,
+}
+
+impl Supervisor {
+    /// Create a new supervisor.
+    pub fn new(listener_fd: OwnedFd, mode: NotifyMode, vfs: VirtualFs) -> Self {
+        Self {
+            listener_fd,
+            mode,
+            vfs,
+        }
+    }
+
+    /// Get the raw fd for registering with poll/mio.
+    pub fn fd(&self) -> RawFd {
+        self.listener_fd.as_raw_fd()
+    }
+
+    /// Handle a notification event. Call when the listener fd is readable.
+    ///
+    /// Returns `Some(NotifyEvent)` on success, `None` if the notification was
+    /// stale (child died or already handled).
+    pub fn handle_event(&self) -> io::Result<Option<NotifyEvent>> {
+        let mut notif = SeccompNotif::default();
+
+        if let Err(e) = notif_recv(self.listener_fd.as_raw_fd(), &mut notif) {
+            // ENOENT means the target process died before we could receive
+            if e == rustix::io::Errno::NOENT {
+                return Ok(None);
+            }
+            return Err(io::Error::from_raw_os_error(e.raw_os_error()));
+        }
+
+        match self.mode {
+            NotifyMode::Disabled => {
+                debug_assert!(
+                    false,
+                    "supervisor received notification with NotifyMode::Disabled"
+                );
+                self.respond_continue(&notif)?;
+                Ok(None)
+            }
+            NotifyMode::Monitor => self.handle_monitor(&notif),
+            NotifyMode::Virtualize => self.handle_virtualize(&notif),
+        }
+    }
+
+    fn handle_monitor(&self, notif: &SeccompNotif) -> io::Result<Option<NotifyEvent>> {
+        let syscall_name = syscall_name(notif.data.nr);
+        eprintln!(
+            "[notify] pid={} syscall={}({}) args=[{:#x}, {:#x}, {:#x}]",
+            notif.pid,
+            syscall_name,
+            notif.data.nr,
+            notif.data.args[0],
+            notif.data.args[1],
+            notif.data.args[2],
+        );
+
+        self.respond_continue(notif)?;
+
+        Ok(Some(NotifyEvent::SyscallHandled {
+            pid: notif.pid,
+            syscall_nr: notif.data.nr,
+            allowed: true,
+        }))
+    }
+
+    /// Handle one notification in `Virtualize` mode.
+    ///
+    /// `open`/`openat`/`creat` on a path covered by the [`VirtualFs`] are served by
+    /// opening the translated host path and injecting the fd into the child.
+    /// Everything else (unmapped path, other syscalls, failed injection) is told
+    /// to continue with the child's original arguments.
+    ///
+    /// Returns `Ok(None)` if the path could not be read or the notification went
+    /// stale, otherwise a [`NotifyEvent::SyscallHandled`].
+    fn handle_virtualize(&self, notif: &SeccompNotif) -> io::Result<Option<NotifyEvent>> {
+        let syscall_nr = notif.data.nr;
+        let nr = syscall_nr as i64;
+        // Both the injected and the continued outcome are reported identically.
+        let handled = || NotifyEvent::SyscallHandled {
+            pid: notif.pid,
+            syscall_nr,
+            allowed: true,
+        };
+
+        // The `*at` variants take a dirfd first, so the pathname pointer is
+        // args[1]; for the legacy calls (open, creat, stat, ...) it is args[0].
+        let path_addr = match nr {
+            libc::SYS_openat
+            | libc::SYS_newfstatat
+            | libc::SYS_faccessat
+            | libc::SYS_faccessat2
+            | libc::SYS_readlinkat
+            | libc::SYS_statx => notif.data.args[1],
+            _ => notif.data.args[0],
+        };
+
+        // Read path from child's memory
+        let path = match self.read_child_string(notif.pid, path_addr) {
+            Ok(p) => p,
+            Err(_) => {
+                // Can't read memory, let syscall proceed
+                self.respond_continue(notif)?;
+                return Ok(None);
+            }
+        };
+
+        // TOCTOU check: verify notification is still valid after reading memory
+        if notif_id_valid(self.listener_fd.as_raw_fd(), notif.id).is_err() {
+            return Ok(None); // Notification is stale
+        }
+
+        // Flags for the open we perform on the child's behalf. `creat(path, mode)`
+        // has no flags argument (args[1] is the mode): it is defined as
+        // `open(path, O_CREAT | O_WRONLY | O_TRUNC, mode)`.
+        let open_flags = match nr {
+            libc::SYS_openat => Some(notif.data.args[2] as i32),
+            libc::SYS_open => Some(notif.data.args[1] as i32),
+            libc::SYS_creat => Some(libc::O_CREAT | libc::O_WRONLY | libc::O_TRUNC),
+            _ => None,
+        };
+
+        if let (Some(real_path), Some(flags)) = (self.vfs.translate(&path), open_flags) {
+            // On failure fall through and let the original syscall proceed.
+            if self.open_and_inject(notif, &real_path, flags).is_ok() {
+                return Ok(Some(handled()));
+            }
+        }
+
+        // No translation or non-open syscall: let it proceed as-is
+        self.respond_continue(notif)?;
+        Ok(Some(handled()))
+    }
+
+    fn respond_continue(&self, notif: &SeccompNotif) -> io::Result<()> {
+        let resp = SeccompNotifResp {
+            id: notif.id,
+            val: 0,
+            error: 0,
+            flags: SECCOMP_USER_NOTIF_FLAG_CONTINUE,
+        };
+        notif_send(self.listener_fd.as_raw_fd(), &resp)
+            .map_err(|e| io::Error::from_raw_os_error(e.raw_os_error()))
+    }
+
+    /// Open `real_path` ourselves and inject the fd into the child as the result
+    /// of the intercepted syscall (`ADDFD` with `SECCOMP_ADDFD_FLAG_SEND`). The
+    /// child's `O_CLOEXEC` request is forwarded through `newfd_flags`; our own
+    /// temporary copy is always close-on-exec and is closed before returning.
+    fn open_and_inject(
+        &self,
+        notif: &SeccompNotif,
+        real_path: &std::path::Path,
+        flags: i32,
+    ) -> io::Result<()> {
+        use std::ffi::CString;
+        use std::os::unix::ffi::OsStrExt;
+
+        let path_c = CString::new(real_path.as_os_str().as_bytes())
+            .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "invalid path"))?;
+
+        let fd = unsafe { libc::open(path_c.as_ptr(), flags | libc::O_CLOEXEC, 0o666) };
+        if fd < 0 {
+            return Err(io::Error::last_os_error());
+        }
+
+        let addfd = SeccompNotifAddfd {
+            id: notif.id,
+            flags: SECCOMP_ADDFD_FLAG_SEND,
+            srcfd: fd as u32,
+            newfd: 0,
+            // The source fd's own flag is not inherited; pass the child's choice.
+            newfd_flags: (flags & libc::O_CLOEXEC) as _,
+        };
+
+        let result = notif_addfd(self.listener_fd.as_raw_fd(), &addfd)
+            .map_err(|e| io::Error::from_raw_os_error(e.raw_os_error()));
+        unsafe { libc::close(fd) };
+        result.map(|_| ())
+    }
+
+    /// Read a NUL-terminated string from the child's memory via `/proc/pid/mem`.
+    ///
+    /// Fails if no NUL is found in the (up to 4096) bytes read: the data would be
+    /// a truncated prefix rather than the path the kernel is going to resolve.
+    fn read_child_string(&self, pid: u32, addr: u64) -> io::Result<String> {
+        let mut file = File::open(format!("/proc/{pid}/mem"))?;
+        file.seek(SeekFrom::Start(addr))?;
+
+        let mut buf = vec![0u8; 4096];
+        let n = file.read(&mut buf)?;
+        let nul_pos = buf[..n]
+            .iter()
+            .position(|&b| b == 0)
+            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "unterminated path"))?;
+        buf.truncate(nul_pos);
+        String::from_utf8(buf)
+            .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid UTF-8 in path"))
+    }
+}
+
+/// Map syscall number to name for logging.
+fn syscall_name(nr: i32) -> &'static str {
+    match nr as i64 {
+        libc::SYS_openat => "openat",
+        libc::SYS_open => "open",
+        libc::SYS_creat => "creat",
+        libc::SYS_access => "access",
+        libc::SYS_faccessat => "faccessat",
+        libc::SYS_faccessat2 => "faccessat2",
+        libc::SYS_stat => "stat",
+        libc::SYS_lstat => "lstat",
+        libc::SYS_newfstatat => "newfstatat",
+        libc::SYS_statx => "statx",
+        libc::SYS_readlink => "readlink",
+        libc::SYS_readlinkat => "readlinkat",
+        _ => "unknown",
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn syscall_names() {
+        assert_eq!(syscall_name(libc::SYS_openat as i32), "openat");
+        assert_eq!(syscall_name(libc::SYS_stat as i32), "stat");
+        assert_eq!(syscall_name(9999), "unknown");
+    }
+}
diff --git a/crates/evalbox-sandbox/src/notify/virtual_fs.rs b/crates/evalbox-sandbox/src/notify/virtual_fs.rs
new file mode 100644
index 0000000..7c7df5b
--- /dev/null
+++ b/crates/evalbox-sandbox/src/notify/virtual_fs.rs
@@ -0,0 +1,139 @@
+//! Virtual filesystem path translation.
+//!
+//! Maps paths from the child's perspective to real paths on the host.
+//! Used by the supervisor in `Virtualize` mode to translate filesystem
+//! syscalls to the correct workspace paths.
+//!
+//! ## Default Mappings
+//!
+//! | Child sees | Host path |
+//! |-----------|-----------|
+//! | `/work` | `{workspace}/work` |
+//! | `/tmp` | `{workspace}/tmp` |
+//! | `/home` | `{workspace}/home` |
+
+use std::collections::HashMap;
+use std::path::{Path, PathBuf};
+
+/// Virtual filesystem with path translation.
+#[derive(Debug, Clone)]
+pub struct VirtualFs {
+    /// Maps virtual prefix → real prefix.
+    mappings: HashMap<PathBuf, PathBuf>,
+}
+
+impl VirtualFs {
+    /// Create a new `VirtualFs` with default mappings for the given workspace root.
+    pub fn new(workspace_root: &Path) -> Self {
+        let mut vfs = Self::empty();
+        for dir in ["work", "tmp", "home"] {
+            vfs.add_mapping(Path::new("/").join(dir), workspace_root.join(dir));
+        }
+        vfs
+    }
+
+    /// Create an empty `VirtualFs` with no mappings.
+    pub fn empty() -> Self {
+        Self {
+            mappings: HashMap::new(),
+        }
+    }
+
+    /// Add a path mapping.
+    pub fn add_mapping(&mut self, virtual_path: impl Into<PathBuf>, real_path: impl Into<PathBuf>) {
+        self.mappings.insert(virtual_path.into(), real_path.into());
+    }
+
+    /// Translate a path from child's view to host's view.
+    ///
+    /// Returns `Some(real_path)` using the longest matching mapping, `None` if
+    /// the path should be accessed as-is (passthrough). Paths containing `..`
+    /// are never translated, so `/work/../../etc/shadow` cannot be turned into
+    /// a host path outside the mapped directory.
+    pub fn translate(&self, path: &str) -> Option<PathBuf> {
+        let path = Path::new(path);
+        if path.components().any(|c| c == std::path::Component::ParentDir) {
+            return None;
+        }
+        // Longest prefix wins so overlapping mappings resolve deterministically
+        // (HashMap iteration order is unspecified).
+        self.mappings
+            .iter()
+            .filter(|(virt, _)| path.starts_with(virt))
+            .max_by_key(|(virt, _)| virt.components().count())
+            .and_then(|(virt, real)| path.strip_prefix(virt).ok().map(|rest| real.join(rest)))
+    }
+
+    /// Check if a path is within any allowed scope.
+    ///
+    /// In `Virtualize` mode, only paths within mappings or system paths are
+    /// allowed. Paths containing `..` are rejected: a matching prefix says
+    /// nothing about where such a path ends up.
+    pub fn is_allowed(&self, path: &str) -> bool {
+        // Allow common system paths (read-only, handled by Landlock)
+        const SYSTEM_PREFIXES: [&str; 7] =
+            ["/usr", "/bin", "/lib", "/lib64", "/etc", "/proc", "/dev"];
+
+        let path = Path::new(path);
+        if path.components().any(|c| c == std::path::Component::ParentDir) {
+            return false;
+        }
+        self.mappings.keys().any(|virt| path.starts_with(virt))
+            || SYSTEM_PREFIXES.iter().any(|prefix| path.starts_with(prefix))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn default_mappings() {
+        let vfs = VirtualFs::new(Path::new("/tmp/evalbox-abc123"));
+
+        assert_eq!(
+            vfs.translate("/work/main.py"),
+            Some(PathBuf::from("/tmp/evalbox-abc123/work/main.py"))
+        );
+        assert_eq!(
+            vfs.translate("/tmp/output.txt"),
+            Some(PathBuf::from("/tmp/evalbox-abc123/tmp/output.txt"))
+        );
+        assert_eq!(
+            vfs.translate("/home/.bashrc"),
+            Some(PathBuf::from("/tmp/evalbox-abc123/home/.bashrc"))
+        );
+    }
+
+    #[test]
+    fn no_translation_for_system_paths() {
+        let vfs = VirtualFs::new(Path::new("/tmp/evalbox-abc123"));
+        assert_eq!(vfs.translate("/usr/bin/python3"), None);
+        assert_eq!(vfs.translate("/etc/passwd"), None);
+    }
+
+    #[test]
+    fn is_allowed_checks() {
+        let vfs = VirtualFs::new(Path::new("/tmp/evalbox-abc123"));
+
+        assert!(vfs.is_allowed("/work/test.py"));
+        assert!(vfs.is_allowed("/tmp/output"));
+        assert!(vfs.is_allowed("/usr/bin/python3"));
+        assert!(vfs.is_allowed("/etc/passwd"));
+        assert!(vfs.is_allowed("/proc/self/status"));
+        assert!(!vfs.is_allowed("/root/.ssh/id_rsa"));
+        assert!(!vfs.is_allowed("/var/log/syslog"));
+    }
+
+    #[test]
+    fn custom_mapping() {
+        let mut vfs = VirtualFs::empty();
+        vfs.add_mapping("/data", "/mnt/shared/data");
+
+        assert_eq!(
+            vfs.translate("/data/file.csv"),
+            Some(PathBuf::from("/mnt/shared/data/file.csv"))
+        );
+        assert_eq!(vfs.translate("/work/test"), None);
+    }
+}
diff --git a/crates/evalbox-sandbox/src/plan.rs b/crates/evalbox-sandbox/src/plan.rs
index 31dc2a6..62a31ae 100644
--- a/crates/evalbox-sandbox/src/plan.rs
+++ b/crates/evalbox-sandbox/src/plan.rs
@@ -41,6 +41,22 @@ use std::collections::{HashMap, HashSet};
 use std::path::PathBuf;
 use std::time::Duration;
 
+/// Seccomp user notification mode.
+///
+/// Controls how the supervisor handles intercepted syscalls from the sandboxed child.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum NotifyMode {
+    /// No seccomp notify filter installed. Zero overhead. Default.
+    #[default]
+    Disabled,
+    /// Supervisor logs syscalls and returns `SECCOMP_USER_NOTIF_FLAG_CONTINUE`.
+    /// Minimal overhead. For debugging/auditing.
+    Monitor,
+    /// Supervisor intercepts FS syscalls, translates paths via `VirtualFs`,
+    /// opens files at translated paths, injects fd via `SECCOMP_IOCTL_NOTIF_ADDFD`.
+    Virtualize,
+}
+
 /// Mount point configuration.
 ///
 /// This is the canonical Mount type used throughout evalbox.
@@ -273,7 +289,7 @@ impl UserFile {
 #[derive(Debug, Clone)]
 pub struct Plan {
     pub cmd: Vec<String>,
-    /// Pre-resolved binary path. If set, sandbox uses this instead of resolving `cmd\[0\]`.
+    /// Pre-resolved binary path. If set, sandbox uses this instead of resolving `cmd[0]`.
     /// This allows evalbox to do binary resolution before calling sandbox.
     pub binary_path: Option<PathBuf>,
     pub env: HashMap<String, String>,
@@ -291,12 +307,10 @@ pub struct Plan {
     pub syscalls: Option<Syscalls>,
     /// Custom Landlock configuration.
     pub landlock: Option<Landlock>,
+    /// Seccomp user notification mode.
+    pub notify_mode: NotifyMode,
 }
 
-/// Type alias for backwards compatibility.
-#[deprecated(since = "0.2.0", note = "Use `Plan` instead")]
-pub type SandboxPlan = Plan;
-
 impl Default for Plan {
     fn default() -> Self {
         Self {
@@ -315,6 +329,7 @@ impl Default for Plan {
             network_blocked: true,
             syscalls: None,
             landlock: None,
+            notify_mode: NotifyMode::Disabled,
         }
     }
 }
@@ -425,6 +440,16 @@ impl Plan {
         self
     }
 
+    /// Set the seccomp user notification mode.
+    ///
+    /// - `Disabled` (default): No notify filter, zero overhead.
+    /// - `Monitor`: Log intercepted syscalls for debugging.
+    /// - `Virtualize`: Full filesystem virtualization via path translation.
+    pub fn notify_mode(mut self, mode: NotifyMode) -> Self {
+        self.notify_mode = mode;
+        self
+    }
+
     /// Execute this plan (convenience method).
     ///
     /// Equivalent to `Executor::run(self)`.
diff --git a/crates/evalbox-sandbox/src/resolve.rs b/crates/evalbox-sandbox/src/resolve.rs
index 1a7b757..7fd934d 100644
--- a/crates/evalbox-sandbox/src/resolve.rs
+++ b/crates/evalbox-sandbox/src/resolve.rs
@@ -101,7 +101,8 @@ mod tests {
         let sys_paths = &*SYSTEM_PATHS;
         let mounts = detect_mounts(Path::new("/usr/bin/echo"), sys_paths);
 
-        if sys_paths.system_type == SystemType::Fhs {
+        // Only check for /usr mount if we're on an actual FHS system with /usr
+        if sys_paths.system_type == SystemType::Fhs && Path::new("/usr").exists() {
             assert!(mounts.iter().any(|m| m.source == Path::new("/usr")));
         }
     }
diff --git a/crates/evalbox-sandbox/src/workspace.rs b/crates/evalbox-sandbox/src/workspace.rs
index 68b2001..dcf5d42 100644
--- a/crates/evalbox-sandbox/src/workspace.rs
+++ b/crates/evalbox-sandbox/src/workspace.rs
@@ -1,14 +1,14 @@
 //! Workspace and pipe management for sandboxed execution.
 //!
-//! The workspace is a temporary directory that becomes the sandbox root after `pivot_root`.
-//! It contains all the pipes for parent-child communication.
+//! The workspace is a temporary directory containing the sandbox's writable areas
+//! and all the pipes for parent-child communication.
 //!
 //! ## Pipes
 //!
 //! - **stdin**: Parent writes → Child reads
 //! - **stdout**: Child writes → Parent reads
 //! - **stderr**: Child writes → Parent reads
-//! - **sync**: Eventfd pair for parent-child synchronization (UID map setup)
+//! - **sync**: Eventfd for parent-child synchronization
 //!
 //! ## Important: Pipe Hygiene
 //!
@@ -59,6 +59,9 @@ impl Pipe {
 }
 
 /// Eventfd-based parent-child synchronization.
+///
+/// Used when `NotifyMode::Disabled` — the child signals readiness via eventfd
+/// after completing setup, and the parent writes back to let it proceed to exec.
 #[derive(Debug)]
 pub struct SyncPair {
     pub child_ready: OwnedFd,
@@ -150,7 +153,6 @@ impl Workspace {
         fs::write(&full, content)?;
 
         if executable {
-            // Set executable permission (rwxr-xr-x)
             fs::set_permissions(&full, fs::Permissions::from_mode(0o755))?;
         }
 
@@ -163,57 +165,14 @@ impl Workspace {
         Ok(full)
     }
 
+    /// Create standard sandbox directories.
+    ///
+    /// Only creates the writable workspace directories (work, tmp, home).
+    /// No rootfs directories (proc, dev, etc.) needed since we don't use `pivot_root`.
     pub fn setup_sandbox_dirs(&self) -> io::Result<()> {
-        for dir in [
-            "proc", "dev", "tmp", "home", "work", "usr", "bin", "lib", "lib64", "etc",
-        ] {
+        for dir in ["work", "tmp", "home"] {
             self.create_dir(dir)?;
         }
-        self.setup_minimal_etc()?;
-        Ok(())
-    }
-
-    /// Create minimal /etc files to prevent information leakage.
-    ///
-    /// Instead of mounting the host's /etc (which contains sensitive info like
-    /// /etc/passwd, /etc/shadow), we create a minimal /etc with only essential files.
-    pub fn setup_minimal_etc(&self) -> io::Result<()> {
-        let etc = self.root.join("etc");
-
-        // Minimal /etc/passwd - just nobody user
-        fs::write(
-            etc.join("passwd"),
-            "nobody:x:65534:65534:Unprivileged user:/nonexistent:/usr/sbin/nologin\n",
-        )?;
-
-        // Minimal /etc/group - just nobody group
-        fs::write(etc.join("group"), "nogroup:x:65534:\n")?;
-
-        // Minimal /etc/hosts - localhost only
-        fs::write(etc.join("hosts"), "127.0.0.1 localhost\n::1 localhost\n")?;
-
-        // Minimal /etc/nsswitch.conf - required for name resolution
-        fs::write(
-            etc.join("nsswitch.conf"),
-            "passwd: files\ngroup: files\nhosts: files dns\n",
-        )?;
-
-        // Copy /etc/ld.so.cache from host if it exists (needed for dynamic linking)
-        let host_ldcache = Path::new("/etc/ld.so.cache");
-        if host_ldcache.exists() {
-            if let Ok(content) = fs::read(host_ldcache) {
-                fs::write(etc.join("ld.so.cache"), content)?;
-            }
-        }
-
-        // Create /etc/ssl directory for certificates
-        let ssl_dir = etc.join("ssl");
-        fs::create_dir_all(&ssl_dir)?;
-
-        // Minimal /etc/resolv.conf - empty (network is blocked by default)
-        // When network is enabled, Landlock will allow DNS
-        fs::write(etc.join("resolv.conf"), "# DNS disabled in sandbox\n")?;
-
         Ok(())
     }
 }
@@ -252,4 +211,13 @@ mod tests {
         let perms = std::fs::metadata(&path).unwrap().permissions();
         assert_eq!(perms.mode() & 0o777, 0o755);
     }
+
+    #[test]
+    fn workspace_sandbox_dirs() {
+        let ws = Workspace::new().unwrap();
+        ws.setup_sandbox_dirs().unwrap();
+        assert!(ws.root().join("work").exists());
+        assert!(ws.root().join("tmp").exists());
+        assert!(ws.root().join("home").exists());
+    }
 }
diff --git a/crates/evalbox-sandbox/tests/common/mod.rs b/crates/evalbox-sandbox/tests/common/mod.rs
index d863f1b..f125ca5 100644
--- a/crates/evalbox-sandbox/tests/common/mod.rs
+++ b/crates/evalbox-sandbox/tests/common/mod.rs
@@ -16,43 +16,40 @@ pub fn payload(name: &str) -> Vec<u8> {
 
 /// Find payload in cargo's build directory structure.
 fn find_payload(name: &str) -> Option<PathBuf> {
-    // Get the workspace root by looking for Cargo.toml
+    // 1. Next to the test executable (Nix builds)
+    if let Ok(exe) = std::env::current_exe() {
+        if let Some(exe_dir) = exe.parent() {
+            let path = exe_dir.join("payloads").join(name);
+            if path.exists() {
+                return Some(path);
+            }
+        }
+    }
+
+    // 2. Cargo build directory (development)
     let manifest_dir = std::env::var("CARGO_MANIFEST_DIR")
         .map(PathBuf::from)
         .unwrap_or_else(|_| PathBuf::from("."));
 
-    // The workspace root is two levels up from crates/evalbox-sandbox
     let workspace_root = manifest_dir.parent()?.parent()?;
-    let target_dir = workspace_root.join("target");
 
-    // Look in both debug and release builds
-    for profile in ["debug", "release"] {
-        let build_dir = target_dir.join(profile).join("build");
-        if let Ok(entries) = std::fs::read_dir(&build_dir) {
-            for entry in entries.flatten() {
-                let dir_name = entry.file_name();
-                if dir_name.to_string_lossy().starts_with("evalbox-sandbox-") {
-                    let payload_path = entry.path().join("out").join("payloads").join(name);
-                    if payload_path.exists() {
-                        return Some(payload_path);
-                    }
-                }
-            }
-        }
-    }
+    let target_dirs: Vec<PathBuf> = std::iter::once(workspace_root.join("target"))
+        .chain(std::env::var("CARGO_TARGET_DIR").ok().map(PathBuf::from))
+        .collect();
 
-    // Also try CARGO_TARGET_DIR if set
-    if let Ok(target) = std::env::var("CARGO_TARGET_DIR") {
-        let target_dir = PathBuf::from(target);
+    for target_dir in target_dirs {
         for profile in ["debug", "release"] {
             let build_dir = target_dir.join(profile).join("build");
             if let Ok(entries) = std::fs::read_dir(&build_dir) {
                 for entry in entries.flatten() {
-                    let dir_name = entry.file_name();
-                    if dir_name.to_string_lossy().starts_with("evalbox-sandbox-") {
-                        let payload_path = entry.path().join("out").join("payloads").join(name);
-                        if payload_path.exists() {
-                            return Some(payload_path);
+                    if entry
+                        .file_name()
+                        .to_string_lossy()
+                        .starts_with("evalbox-sandbox-")
+                    {
+                        let path = entry.path().join("out").join("payloads").join(name);
+                        if path.exists() {
+                            return Some(path);
                         }
                     }
                 }
@@ -63,33 +60,6 @@ fn find_payload(name: &str) -> Option<PathBuf> {
     None
 }
 
-/// Check if we have permission to create user namespaces.
-pub fn can_create_namespaces() -> bool {
-    // Check kernel parameter
-    if let Ok(content) = std::fs::read_to_string("/proc/sys/kernel/unprivileged_userns_clone") {
-        if content.trim() == "0" {
-            return false;
-        }
-    }
-
-    // Try to actually create a namespace
-    let result = std::process::Command::new("unshare")
-        .args(["--user", "--map-root-user", "true"])
-        .output();
-
-    result.map(|o| o.status.success()).unwrap_or(false)
-}
-
-/// Skip test if namespaces aren't available. Call at start of test.
-pub fn skip_if_no_namespaces() -> bool {
-    if !can_create_namespaces() {
-        eprintln!("Skipping: Cannot create user namespaces");
-        true
-    } else {
-        false
-    }
-}
-
 /// SIGSYS signal number (seccomp violation).
 pub const SIGSYS: i32 = 31;
 
diff --git a/crates/evalbox-sandbox/tests/security/cve.rs b/crates/evalbox-sandbox/tests/security/cve.rs
index a46007e..933c0a9 100644
--- a/crates/evalbox-sandbox/tests/security/cve.rs
+++ b/crates/evalbox-sandbox/tests/security/cve.rs
@@ -7,7 +7,7 @@ use std::time::Duration;
 
 use evalbox_sandbox::{Executor, Plan};
 
-use crate::common::{SIGSYS, payload, skip_if_no_namespaces};
+use crate::common::{SIGSYS, payload};
 
 // =============================================================================
 // CVE-2024-1086: nf_tables Use-After-Free
@@ -23,13 +23,10 @@ use crate::common::{SIGSYS, payload, skip_if_no_namespaces};
 #[test]
 #[ignore]
 fn test_cve_2024_1086_nftables_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("cve_2024_1086_nftables"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -57,13 +54,10 @@ fn test_cve_2024_1086_nftables_blocked() {
 #[test]
 #[ignore]
 fn test_cve_2022_0185_fsconfig_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("cve_2022_0185_fsconfig"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -91,13 +85,10 @@ fn test_cve_2022_0185_fsconfig_blocked() {
 #[test]
 #[ignore]
 fn test_cve_2017_5226_tiocsti_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("cve_2017_5226_tiocsti"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -125,13 +116,10 @@ fn test_cve_2017_5226_tiocsti_blocked() {
 #[test]
 #[ignore]
 fn test_cve_2022_0492_cgroups_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("cve_2022_0492_cgroups"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -159,13 +147,10 @@ fn test_cve_2022_0492_cgroups_blocked() {
 #[test]
 #[ignore]
 fn test_fileless_memfd_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("fileless_memfd"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -190,13 +175,10 @@ fn test_fileless_memfd_blocked() {
 #[test]
 #[ignore]
 fn test_ioctl_tioclinux_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("ioctl_tioclinux"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -218,13 +200,10 @@ fn test_ioctl_tioclinux_blocked() {
 #[test]
 #[ignore]
 fn test_ioctl_tiocsetd_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("ioctl_tiocsetd"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -250,13 +229,10 @@ fn test_ioctl_tiocsetd_blocked() {
 #[test]
 #[ignore]
 fn test_userns_creation_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("userns_escape"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -284,13 +260,10 @@ fn test_userns_creation_blocked() {
 #[test]
 #[ignore]
 fn test_ptrace_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("ptrace_escape"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -317,13 +290,10 @@ fn test_ptrace_blocked() {
 #[test]
 #[ignore]
 fn test_cve_2019_10063_ioctl_bypass_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("cve_2019_10063_ioctl_bypass"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
diff --git a/crates/evalbox-sandbox/tests/security/filesystem.rs b/crates/evalbox-sandbox/tests/security/filesystem.rs
index 93ed59d..fe08618 100644
--- a/crates/evalbox-sandbox/tests/security/filesystem.rs
+++ b/crates/evalbox-sandbox/tests/security/filesystem.rs
@@ -1,23 +1,20 @@
 //! Filesystem isolation tests.
 //!
 //! These tests verify that sandboxed processes cannot access
-//! files outside their allowed mounts.
+//! files outside their Landlock-allowed paths.
+//!
+//! Without `pivot_root`, the child process chdir's to `{workspace}/work`.
+//! Landlock restricts filesystem access to only allowed paths.
 
 use std::time::Duration;
 
 use evalbox_sandbox::{Executor, Plan};
 
-use crate::common::skip_if_no_namespaces;
-
 /// Test that /etc/shadow is not accessible.
-/// This file contains password hashes and should never be readable.
+/// Landlock only grants read access to /etc, and /etc/shadow requires root.
 #[test]
 #[ignore]
 fn test_cannot_read_etc_shadow() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(Plan::new(["cat", "/etc/shadow"]).timeout(Duration::from_secs(5)))
         .expect("Executor should run");
 
@@ -31,13 +28,10 @@ fn test_cannot_read_etc_shadow() {
 }
 
 /// Test that /etc/passwd cannot be written to.
+/// Landlock grants read-only access to /etc, so writes should be blocked.
 #[test]
 #[ignore]
 fn test_cannot_write_etc_passwd() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
         Plan::new(["sh", "-c", "echo 'hacked:x:0:0::/:/bin/sh' >> /etc/passwd"])
             .timeout(Duration::from_secs(5)),
@@ -48,143 +42,109 @@ fn test_cannot_write_etc_passwd() {
 }
 
 /// Test that /root is not accessible.
+/// Landlock has no rule for /root, so access should be denied.
 #[test]
 #[ignore]
 fn test_cannot_access_root_home() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(Plan::new(["ls", "/root"]).timeout(Duration::from_secs(5)))
         .expect("Executor should run");
 
     assert!(!output.success(), "/root should not be accessible");
 }
 
-/// Test that the work directory is writable.
+/// Test that the work directory (CWD) is writable.
+/// The child chdir's to {workspace}/work, which Landlock grants read/write access to.
 #[test]
 #[ignore]
 fn test_work_dir_is_writable() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
         Plan::new([
             "sh",
             "-c",
-            "echo 'test content' > /work/test.txt && cat /work/test.txt",
+            "echo 'test content' > ./test.txt && cat ./test.txt",
         ])
         .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
 
-    assert!(output.success(), "Should be able to write to /work");
+    assert!(
+        output.success(),
+        "Should be able to write to CWD (work dir)"
+    );
     assert_eq!(output.stdout_str().trim(), "test content");
 }
 
-/// Test that /tmp is writable.
+/// Test that the workspace tmp directory is writable.
+/// The workspace tmp dir is at ../tmp relative to CWD ({workspace}/work).
+/// Landlock grants read/write access to the workspace root.
 #[test]
 #[ignore]
 fn test_tmp_is_writable() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
         Plan::new([
             "sh",
             "-c",
-            "echo 'temp data' > /tmp/test.txt && cat /tmp/test.txt",
+            "echo 'temp data' > ../tmp/test.txt && cat ../tmp/test.txt",
         ])
         .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
 
-    assert!(output.success(), "Should be able to write to /tmp");
+    assert!(
+        output.success(),
+        "Should be able to write to workspace tmp (../tmp)"
+    );
     assert_eq!(output.stdout_str().trim(), "temp data");
 }
 
-/// Test that path traversal attempts are blocked.
+/// Test that path traversal attempts are blocked by Landlock.
 ///
-/// The sandbox creates a minimal /etc with only essential files (passwd, group, hosts).
-/// Path traversal should only see the sandbox's minimal /etc, not the host's.
+/// Without `pivot_root`, path traversal from CWD goes up the real filesystem.
+/// Landlock should block access to paths outside the allowed set.
 #[test]
 #[ignore]
 fn test_path_traversal_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
-    let output = Executor::run(
-        Plan::new(["cat", "/work/../../../etc/passwd"]).timeout(Duration::from_secs(5)),
-    )
-    .expect("Executor should run");
+    let output =
+        Executor::run(Plan::new(["cat", "../../../etc/shadow"]).timeout(Duration::from_secs(5)))
+            .expect("Executor should run");
 
-    // The path resolves to /etc/passwd which is the sandbox's minimal passwd
-    if output.success() {
-        let content = output.stdout_str();
-
-        // Verify this is NOT the real host passwd
-        // Real passwd would have many entries (root, daemon, bin, etc.)
-        let line_count = content.lines().count();
-        let has_nixbld = content.contains("nixbld"); // NixOS specific
-        let has_root = content.contains("root:");
-        let has_real_users = content.contains("daemon:") || content.contains("bin:");
-
-        assert!(
-            !has_nixbld && !has_real_users && line_count <= 5,
-            "Path traversal should not leak real /etc/passwd.\n\
-             Expected minimal sandbox passwd, got {line_count} lines:\n{content}"
-        );
-
-        // If there's root: it should be the sandbox's nobody-only passwd
-        if has_root {
-            panic!("Path traversal leaked real /etc/passwd with root entry:\n{content}");
-        }
-    }
+    // Reading /etc/shadow must fail even via traversal (/etc is read-only and shadow requires root)
+    assert!(
+        !output.success(),
+        "Path traversal to /etc/shadow should be blocked by Landlock"
+    );
 }
 
 /// Test that symlink attacks are prevented.
+/// Landlock controls access at the kernel level, so symlinks to restricted
+/// paths should still be blocked.
 #[test]
 #[ignore]
 fn test_symlink_escape_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new([
-            "sh",
-            "-c",
-            "ln -s /etc/shadow /work/shadow && cat /work/shadow",
-        ])
-        .timeout(Duration::from_secs(5)),
+        Plan::new(["sh", "-c", "ln -s /etc/shadow ./shadow && cat ./shadow"])
+            .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
 
-    // Either symlink creation fails or reading it fails
+    // Either symlink creation fails or reading the target fails due to Landlock
     assert!(!output.success(), "Symlink escape should be blocked");
 }
 
-/// Test that /proc/self/exe cannot be used to escape.
+/// Test that /proc/self/exe is safe.
+/// Without `pivot_root`, /proc/self/exe reveals the real binary path on the host.
+/// This is expected behavior -- we just verify the sandbox doesn't crash.
 #[test]
 #[ignore]
 fn test_proc_self_exe_safe() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output =
         Executor::run(Plan::new(["readlink", "/proc/self/exe"]).timeout(Duration::from_secs(5)))
             .expect("Executor should run");
 
-    // Should not reveal host paths
-    if output.success() {
-        let exe_path = output.stdout_str();
-        assert!(
-            !exe_path.contains("/home/") && !exe_path.contains("/usr/"),
-            "/proc/self/exe should not reveal host paths: {exe_path}"
-        );
-    }
+    // Without pivot_root, /proc/self/exe will show the real host path.
+    // This is expected -- just verify the command runs without crashing.
+    assert!(
+        output.exit_code.is_some(),
+        "/proc/self/exe readlink should complete without crashing"
+    );
 }
diff --git a/crates/evalbox-sandbox/tests/security/network.rs b/crates/evalbox-sandbox/tests/security/network.rs
index ef884cf..2603c6a 100644
--- a/crates/evalbox-sandbox/tests/security/network.rs
+++ b/crates/evalbox-sandbox/tests/security/network.rs
@@ -7,17 +7,11 @@ use std::time::Duration;
 
 use evalbox_sandbox::{Executor, Plan};
 
-use crate::common::skip_if_no_namespaces;
-
 /// Test that network is blocked by default.
 /// curl should fail to connect to any external host.
 #[test]
 #[ignore]
 fn test_network_blocked_by_default() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
         Plan::new(["sh", "-c", "curl -s --connect-timeout 2 http://example.com || wget -q -O- --timeout=2 http://example.com"])
             .timeout(Duration::from_secs(5)),
@@ -31,29 +25,21 @@ fn test_network_blocked_by_default() {
 #[test]
 #[ignore]
 fn test_localhost_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
         Plan::new(["sh", "-c", "echo test | nc -w1 127.0.0.1 80 2>/dev/null"])
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
 
-    // Should fail - network namespace isolates us
+    // Should fail - seccomp blocks socket creation
     assert!(!output.success(), "Localhost should not be reachable");
 }
 
 /// Test that external DNS resolution fails when network is blocked.
-/// Note: /etc/hosts lookups may still work since the file exists in sandbox.
+/// Note: /etc/hosts lookups may still work since the file exists on the host.
 #[test]
 #[ignore]
 fn test_external_dns_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     // Use a domain that definitely isn't in /etc/hosts
     let output = Executor::run(
         Plan::new([
@@ -83,10 +69,6 @@ fn test_external_dns_blocked() {
 #[test]
 #[ignore]
 fn test_network_flag_enabled() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     // Just verify that enabling network doesn't break sandbox execution
     let output = Executor::run(
         Plan::new(["sh", "-c", "echo 'network flag test'"])
@@ -109,10 +91,6 @@ fn test_network_flag_enabled() {
 #[test]
 #[ignore]
 fn test_loopback_isolated() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
         Plan::new([
             "sh",
@@ -123,14 +101,14 @@ fn test_loopback_isolated() {
     )
     .expect("Executor should run");
 
-    // The loopback might exist in the network namespace but be isolated
-    // This is more of a sanity check
+    // Without network namespaces, the host loopback is visible but
+    // seccomp blocks socket creation so it can't be used to connect.
+    // This is more of a sanity check that the command runs.
     if output.success() {
-        // If lo exists, verify it's the sandbox's own interface
         let stdout = output.stdout_str();
         assert!(
             stdout.contains("lo") || stdout.contains("127.0.0.1"),
-            "Loopback should be visible if network namespace is active"
+            "Loopback should be visible"
         );
     }
 }
diff --git a/crates/evalbox-sandbox/tests/security/resources.rs b/crates/evalbox-sandbox/tests/security/resources.rs
index 2075527..fa51dd0 100644
--- a/crates/evalbox-sandbox/tests/security/resources.rs
+++ b/crates/evalbox-sandbox/tests/security/resources.rs
@@ -7,16 +7,12 @@ use std::time::Duration;
 
 use evalbox_sandbox::{Executor, Plan, Status};
 
-use crate::common::{payload, skip_if_no_namespaces};
+use crate::common::payload;
 
 /// Test that timeout is enforced.
 #[test]
 #[ignore]
 fn test_timeout_enforced() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let start = std::time::Instant::now();
 
     let output = Executor::run(Plan::new(["sleep", "60"]).timeout(Duration::from_millis(500)))
@@ -36,10 +32,6 @@ fn test_timeout_enforced() {
 #[test]
 #[ignore]
 fn test_infinite_loop_timeout() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
         Plan::new(["sh", "-c", "while true; do :; done"]).timeout(Duration::from_millis(500)),
     )
@@ -53,13 +45,10 @@ fn test_infinite_loop_timeout() {
 #[test]
 #[ignore]
 fn test_max_pids_enforced() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("fork_bomb"))
+            .binary_path("./payload")
             .max_pids(10)
             .timeout(Duration::from_secs(5)),
     )
@@ -80,10 +69,6 @@ fn test_max_pids_enforced() {
 #[test]
 #[ignore]
 fn test_output_limit_enforced() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
         Plan::new(["sh", "-c", "yes | head -c 100000"]) // 100KB of 'y'
             .max_output(1024) // 1KB limit
@@ -113,10 +98,6 @@ fn test_output_limit_enforced() {
 #[test]
 #[ignore]
 fn test_memory_limit_set() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     // Check that the memory rlimit is set correctly
     let output = Executor::run(
         Plan::new([
@@ -146,10 +127,6 @@ fn test_memory_limit_set() {
 #[test]
 #[ignore]
 fn test_fd_limit_set() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     // Check the fd limit using ulimit
     let output =
         Executor::run(Plan::new(["sh", "-c", "ulimit -n"]).timeout(Duration::from_secs(5)))
@@ -173,10 +150,6 @@ fn test_fd_limit_set() {
 #[test]
 #[ignore]
 fn test_cpu_intensive_timeout() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let start = std::time::Instant::now();
 
     // CPU-intensive work that doesn't sleep
diff --git a/crates/evalbox-sandbox/tests/security/seccomp.rs b/crates/evalbox-sandbox/tests/security/seccomp.rs
index bbfdc00..937dd68 100644
--- a/crates/evalbox-sandbox/tests/security/seccomp.rs
+++ b/crates/evalbox-sandbox/tests/security/seccomp.rs
@@ -7,20 +7,17 @@ use std::time::Duration;
 
 use evalbox_sandbox::{Executor, Plan};
 
-use crate::common::{SIGSYS, payload, skip_if_no_namespaces};
+use crate::common::{SIGSYS, payload};
 
 /// Test that a simple payload can execute successfully.
 /// This is a control test to verify the sandbox is working.
 #[test]
 #[ignore]
 fn test_payload_can_execute() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("success"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -39,13 +36,10 @@ fn test_payload_can_execute() {
 #[test]
 #[ignore]
 fn test_ptrace_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("syscall_ptrace"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -64,13 +58,10 @@ fn test_ptrace_blocked() {
 #[test]
 #[ignore]
 fn test_mount_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("syscall_mount"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -89,13 +80,10 @@ fn test_mount_blocked() {
 #[test]
 #[ignore]
 fn test_reboot_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("syscall_reboot"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -114,13 +102,10 @@ fn test_reboot_blocked() {
 #[test]
 #[ignore]
 fn test_clone_newuser_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("syscall_clone_ns"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -139,13 +124,10 @@ fn test_clone_newuser_blocked() {
 #[test]
 #[ignore]
 fn test_socket_netlink_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("socket_netlink"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -164,13 +146,10 @@ fn test_socket_netlink_blocked() {
 #[test]
 #[ignore]
 fn test_socket_raw_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("socket_raw"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -190,13 +169,10 @@ fn test_socket_raw_blocked() {
 #[test]
 #[ignore]
 fn test_keyctl_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("syscall_keyctl"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
@@ -216,13 +192,10 @@ fn test_keyctl_blocked() {
 #[test]
 #[ignore]
 fn test_bpf_blocked() {
-    if skip_if_no_namespaces() {
-        return;
-    }
-
     let output = Executor::run(
-        Plan::new(["/work/payload"])
+        Plan::new(["./payload"])
             .executable("payload", payload("syscall_bpf"))
+            .binary_path("./payload")
             .timeout(Duration::from_secs(5)),
     )
     .expect("Executor should run");
diff --git a/crates/evalbox-sys/src/check.rs b/crates/evalbox-sys/src/check.rs
index f88e3c2..6076036 100644
--- a/crates/evalbox-sys/src/check.rs
+++ b/crates/evalbox-sys/src/check.rs
@@ -7,9 +7,8 @@
 //!
 //! | Feature | Minimum | Check Method |
 //! |---------|---------|--------------|
-//! | Kernel | 5.13 | `uname` syscall |
-//! | Landlock | ABI 1 | `landlock_create_ruleset` with VERSION flag |
-//! | User NS | enabled | `/proc/sys/kernel/unprivileged_userns_clone` or fork+unshare test |
+//! | Kernel | 6.12 | `uname` syscall |
+//! | Landlock | ABI 5 | `landlock_create_ruleset` with VERSION flag |
 //! | Seccomp | enabled | `prctl(PR_GET_SECCOMP)` |
 //!
 //! ## Usage
@@ -20,13 +19,6 @@
 //!     Err(e) => eprintln!("System not supported: {}", e),
 //! }
 //! ```
-//!
-//! ## User Namespaces
-//!
-//! User namespace support varies by distribution:
-//! - **Debian/Ubuntu**: `/proc/sys/kernel/unprivileged_userns_clone`
-//! - **NixOS/Fedora**: `/proc/sys/user/max_user_namespaces`
-//! - **Fallback**: Fork + unshare test
 
 use std::sync::OnceLock;
 
@@ -41,7 +33,6 @@ use crate::seccomp;
 pub struct SystemInfo {
     pub kernel_version: (u32, u32, u32),
     pub landlock_abi: u32,
-    pub user_ns_enabled: bool,
     pub seccomp_enabled: bool,
 }
 
@@ -57,8 +48,8 @@ pub enum CheckError {
     #[error("landlock is not available")]
     LandlockNotAvailable,
 
-    #[error("user namespaces are disabled")]
-    UserNamespacesDisabled,
+    #[error("landlock ABI {found} is too old, need at least ABI {required}")]
+    LandlockAbiTooOld { required: u32, found: u32 },
 
     #[error("seccomp is not available")]
     SeccompNotAvailable,
@@ -67,8 +58,9 @@ pub enum CheckError {
     KernelVersionReadFailed,
 }
 
-// Minimum kernel version: 5.13 (first with Landlock)
-const MIN_KERNEL_VERSION: (u32, u32, u32) = (5, 13, 0);
+// Minimum kernel version: 6.12 (Landlock ABI 6 with SCOPE_SIGNAL + SCOPE_ABSTRACT_UNIX_SOCKET)
+const MIN_KERNEL_VERSION: (u32, u32, u32) = (6, 12, 0);
+const MIN_LANDLOCK_ABI: u32 = 6;
 
 static SYSTEM_INFO: OnceLock<Result<SystemInfo, CheckError>> = OnceLock::new();
 
@@ -93,10 +85,11 @@ fn check_impl() -> Result<SystemInfo, CheckError> {
     if landlock_abi == 0 {
         return Err(CheckError::LandlockNotAvailable);
     }
-
-    let user_ns_enabled = check_user_namespaces();
-    if !user_ns_enabled {
-        return Err(CheckError::UserNamespacesDisabled);
+    if landlock_abi < MIN_LANDLOCK_ABI {
+        return Err(CheckError::LandlockAbiTooOld {
+            required: MIN_LANDLOCK_ABI,
+            found: landlock_abi,
+        });
     }
 
     let seccomp_enabled = seccomp::seccomp_available();
@@ -107,7 +100,6 @@ fn check_impl() -> Result<SystemInfo, CheckError> {
     Ok(SystemInfo {
         kernel_version,
         landlock_abi,
-        user_ns_enabled,
         seccomp_enabled,
     })
 }
@@ -145,36 +137,6 @@ fn parse_kernel_version(release: &str) -> Result<(u32, u32, u32), CheckError> {
     Ok((major, minor, patch))
 }
 
-fn check_user_namespaces() -> bool {
-    // Check sysctl first (Debian/Ubuntu)
-    if let Ok(content) = std::fs::read_to_string("/proc/sys/kernel/unprivileged_userns_clone") {
-        return content.trim() == "1";
-    }
-
-    // Check max_user_namespaces (NixOS and others)
-    if let Ok(content) = std::fs::read_to_string("/proc/sys/user/max_user_namespaces")
-        && content.trim().parse::<u32>().unwrap_or(0) > 0
-    {
-        return true;
-    }
-
-    // Last resort: fork + unshare test (must fork to avoid polluting parent)
-    // SAFETY: fork/unshare/waitpid are safe when used correctly. Child exits immediately.
-    unsafe {
-        let pid = libc::fork();
-        if pid < 0 {
-            return false;
-        }
-        if pid == 0 {
-            let ret = libc::unshare(libc::CLONE_NEWUSER);
-            libc::_exit(if ret == 0 { 0 } else { 1 });
-        }
-        let mut status: i32 = 0;
-        libc::waitpid(pid, &mut status, 0);
-        libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -187,6 +149,7 @@ mod tests {
             parse_kernel_version("5.4.0-150-generic").unwrap(),
             (5, 4, 0)
         );
+        assert_eq!(parse_kernel_version("6.12.0").unwrap(), (6, 12, 0));
     }
 
     #[test]
@@ -195,7 +158,6 @@ mod tests {
             Ok(info) => {
                 println!("Kernel version: {:?}", info.kernel_version);
                 println!("Landlock ABI: {}", info.landlock_abi);
-                println!("User NS enabled: {}", info.user_ns_enabled);
                 println!("Seccomp enabled: {}", info.seccomp_enabled);
             }
             Err(e) => {
diff --git a/crates/evalbox-sys/src/landlock.rs b/crates/evalbox-sys/src/landlock.rs
index 6a7599d..c862cd4 100644
--- a/crates/evalbox-sys/src/landlock.rs
+++ b/crates/evalbox-sys/src/landlock.rs
@@ -12,6 +12,7 @@
 //! | 2 | 5.19 | `REFER` (cross-directory rename/link) |
 //! | 3 | 6.2 | `TRUNCATE` (file truncation) |
 //! | 4 | 6.7 | `IOCTL_DEV`, TCP network access |
+//! | 6 | 6.12 | `SCOPE_SIGNAL`, `SCOPE_ABSTRACT_UNIX_SOCKET` |
 //!
 //! ## Usage
 //!
@@ -75,11 +76,19 @@ pub const LANDLOCK_ACCESS_FS_IOCTL_DEV: u64 = 1 << 15;
 pub const LANDLOCK_ACCESS_NET_BIND_TCP: u64 = 1 << 0;
 pub const LANDLOCK_ACCESS_NET_CONNECT_TCP: u64 = 1 << 1;
 
+// ABI v6 - Scoped restrictions
+/// Block abstract unix socket connections outside the sandbox.
+pub const LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET: u64 = 1 << 0;
+/// Block signals to processes outside the sandbox.
+pub const LANDLOCK_SCOPE_SIGNAL: u64 = 1 << 1;
+
 #[repr(C)]
 #[derive(Debug, Default)]
 pub struct LandlockRulesetAttr {
     pub handled_access_fs: u64,
     pub handled_access_net: u64,
+    /// ABI 6+: Scoped restrictions (signal and abstract unix socket isolation).
+    pub scoped: u64,
 }
 
 #[repr(C)]
@@ -205,6 +214,18 @@ pub fn net_access_for_abi(abi: u32) -> u64 {
     }
 }
 
+/// Returns the scoped restriction flags for the given ABI version.
+///
+/// ABI 6+ (Linux 6.12) supports signal isolation and abstract unix socket
+/// isolation, replacing the need for PID and IPC namespaces.
+pub fn scope_for_abi(abi: u32) -> u64 {
+    if abi >= 6 {
+        LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET | LANDLOCK_SCOPE_SIGNAL
+    } else {
+        0
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/crates/evalbox-sys/src/lib.rs b/crates/evalbox-sys/src/lib.rs
index 59a11d7..78a4688 100644
--- a/crates/evalbox-sys/src/lib.rs
+++ b/crates/evalbox-sys/src/lib.rs
@@ -5,8 +5,9 @@
 //!
 //! ## Modules
 //!
-//! - **landlock** - Landlock LSM for filesystem/network access control (kernel 5.13+)
+//! - **landlock** - Landlock LSM for filesystem/network/scope access control (kernel 5.13+; network needs 6.7, scopes 6.12)
 //! - **seccomp** - Seccomp-BPF syscall filtering
+//! - **`seccomp_notify`** - Seccomp user notification (`SECCOMP_RET_USER_NOTIF`)
 //! - **check** - Runtime system capability detection
 //!
 //! ## Landlock
@@ -16,6 +17,7 @@
 //! - ABI 2: File truncation (kernel 5.19)
 //! - ABI 3: File permission changes (kernel 6.2)
 //! - ABI 4: Network TCP access control (kernel 6.7)
+//! - ABI 6: Scoped signals and abstract unix sockets (kernel 6.12)
 //!
 //! ## Seccomp-BPF
 //!
@@ -23,6 +25,11 @@
 //! a whitelist-based filter that allows ~40 safe syscalls and kills the process
 //! on any other syscall.
 //!
+//! ## Seccomp User Notify
+//!
+//! Seccomp user notification allows a supervisor process to intercept syscalls
+//! from a sandboxed child, enabling filesystem virtualization without namespaces.
+//!
 //! # Safety
 //!
 //! This crate contains raw syscall wrappers. Casts between integer types
@@ -34,6 +41,7 @@
 pub mod check;
 pub mod landlock;
 pub mod seccomp;
+pub mod seccomp_notify;
 
 pub use check::{CheckError, SystemInfo, check};
 
diff --git a/crates/evalbox-sys/src/seccomp.rs b/crates/evalbox-sys/src/seccomp.rs
index b2b57d6..1fd50d4 100644
--- a/crates/evalbox-sys/src/seccomp.rs
+++ b/crates/evalbox-sys/src/seccomp.rs
@@ -47,14 +47,14 @@
 //! - `memfd_create` + `execveat` - Enables fileless execution (bypass Landlock)
 //! - `setresuid`/`setresgid` - No reason to change UID in sandbox
 //! - `setsid`/`setpgid` - Session manipulation, unnecessary
-//! - `ioctl` - Too powerful without argument filtering (TODO: whitelist specific codes)
+//! - `ioctl` - Allowed with argument filtering (TIOCSTI, TIOCSETD, TIOCLINUX blocked)
 //!
 //! ## Security Notes
 //!
 //! - Filter is permanent - cannot be removed once applied
 //! - Requires `PR_SET_NO_NEW_PRIVS` first
 //! - Blocked syscall = immediate process termination (SIGSYS)
-//! - `kill`/`tgkill` are safe because we use PID namespace (`CLONE_NEWPID`)
+//! - `kill`/`tgkill` are safe due to Landlock v5 `SCOPE_SIGNAL` isolation (ABI 5+ only; kernel 6.12+)
 //! - `prctl` allowed but `PR_SET_SECCOMP` has no effect (filter already applied)
 
 use rustix::io::Errno;
@@ -64,6 +64,7 @@ use crate::last_errno;
 // Seccomp constants
 const SECCOMP_SET_MODE_FILTER: u32 = 1;
 const SECCOMP_RET_KILL_PROCESS: u32 = 0x80000000;
+const SECCOMP_RET_USER_NOTIF: u32 = 0x7fc00000;
 const SECCOMP_RET_ALLOW: u32 = 0x7fff0000;
 // Return ENOSYS (38) to allow graceful fallback
 const SECCOMP_RET_ERRNO_ENOSYS: u32 = 0x00050000 | 38;
@@ -172,7 +173,7 @@ pub struct SockFprog {
 /// - `setsid`/`setpgid` - Session manipulation unnecessary
 ///
 /// ## Notes:
-/// - `kill`/`tgkill` safe due to PID namespace isolation
+/// - `kill`/`tgkill` safe due to Landlock v5 `SCOPE_SIGNAL` isolation
 /// - `prctl` kept for runtime needs (`PR_SET_NAME`, etc.)
 pub const DEFAULT_WHITELIST: &[i64] = &[
     // === Basic I/O ===
@@ -291,7 +292,7 @@ pub const DEFAULT_WHITELIST: &[i64] = &[
     libc::SYS_fchdir,
     libc::SYS_readlink,
     libc::SYS_readlinkat,
-    // === Signals (safe due to PID namespace) ===
+    // === Signals (safe due to Landlock SCOPE_SIGNAL) ===
     libc::SYS_rt_sigaction,
     libc::SYS_rt_sigprocmask,
     libc::SYS_rt_sigreturn,
@@ -299,9 +300,9 @@ pub const DEFAULT_WHITELIST: &[i64] = &[
     libc::SYS_rt_sigpending,
     libc::SYS_rt_sigtimedwait,
     libc::SYS_sigaltstack,
-    libc::SYS_kill,   // Safe: PID namespace isolates
-    libc::SYS_tgkill, // Safe: PID namespace isolates
-    libc::SYS_tkill,  // Safe: PID namespace isolates
+    libc::SYS_kill,   // Safe: Landlock SCOPE_SIGNAL isolates
+    libc::SYS_tgkill, // Safe: Landlock SCOPE_SIGNAL isolates
+    libc::SYS_tkill,  // Safe: Landlock SCOPE_SIGNAL isolates
     // === Process control ===
     libc::SYS_execve,
     // execveat REMOVED - with memfd_create enables fileless execution
@@ -567,6 +568,81 @@ pub fn seccomp_available() -> bool {
     unsafe { libc::prctl(libc::PR_GET_SECCOMP, 0, 0, 0, 0) >= 0 }
 }
 
+/// Builds the notify BPF program: each syscall in `syscalls` yields
+/// `SECCOMP_RET_USER_NOTIF`, and everything else yields `SECCOMP_RET_ALLOW`.
+///
+/// Program layout: architecture check, syscall-number load, one equality
+/// test per entry, the default ALLOW return, then the NOTIFY return.
+///
+/// This filter is installed *before* the kill filter. The kernel evaluates all
+/// stacked filters and returns the strictest verdict, so:
+/// - Syscall in both ALLOW lists → ALLOW
+/// - Syscall in NOTIFY + ALLOW → NOTIFY (supervisor decides)
+/// - Syscall not in kill filter whitelist → KILL (regardless of notify filter)
+///
+/// # Panics
+///
+/// Panics if `syscalls.len()` exceeds `MAX_WHITELIST_SIZE` (BPF jump offsets are u8).
+pub fn build_notify_filter(syscalls: &[i64]) -> Vec<SockFilter> {
+    let count = syscalls.len();
+    assert!(
+        count <= MAX_WHITELIST_SIZE,
+        "notify syscall list too large: {} > {}",
+        count,
+        MAX_WHITELIST_SIZE
+    );
+
+    let mut program = Vec::with_capacity(count + 8);
+
+    // Architecture gate: on x86_64 skip the next instruction, otherwise
+    // fall through to it and return ALLOW.
+    program.push(SockFilter::stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARCH));
+    program.push(SockFilter::jump(
+        BPF_JMP | BPF_JEQ | BPF_K,
+        AUDIT_ARCH_X86_64,
+        1,
+        0,
+    ));
+    program.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
+
+    // Put the syscall number in the accumulator for the comparisons below.
+    program.push(SockFilter::stmt(
+        BPF_LD | BPF_W | BPF_ABS,
+        OFFSET_SYSCALL_NR,
+    ));
+
+    // One equality test per syscall; a hit jumps past the remaining tests
+    // and the default ALLOW, landing on the NOTIFY return.
+    program.extend(syscalls.iter().enumerate().map(|(idx, &nr)| {
+        let to_notify = (count - idx) as u8;
+        SockFilter::jump(BPF_JMP | BPF_JEQ | BPF_K, nr as u32, to_notify, 0)
+    }));
+
+    // No test matched: ALLOW.
+    program.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW));
+
+    // Jump target for matched syscalls: NOTIFY.
+    program.push(SockFilter::stmt(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF));
+
+    program
+}
+
+/// Filesystem syscalls that the notify filter intercepts for filesystem virtualization.
+pub const NOTIFY_FS_SYSCALLS: &[i64] = &[
+    libc::SYS_openat,
+    libc::SYS_open,
+    libc::SYS_creat,
+    libc::SYS_access,
+    libc::SYS_faccessat,
+    libc::SYS_faccessat2,
+    libc::SYS_stat,
+    libc::SYS_lstat,
+    libc::SYS_newfstatat,
+    libc::SYS_statx,
+    libc::SYS_readlink,
+    libc::SYS_readlinkat,
+];
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -659,4 +735,20 @@ mod tests {
         let huge: Vec<i64> = (0..300).map(|i| i as i64).collect();
         build_whitelist_filter(&huge);
     }
+
+    #[test]
+    fn notify_filter_structure() {
+        let syscalls = [libc::SYS_openat, libc::SYS_open, libc::SYS_stat];
+        // Fixed part: 3 (arch) + 1 (load) + 1 (allow) + 1 (notify) = 6,
+        // plus one check per listed syscall.
+        assert_eq!(build_notify_filter(&syscalls).len(), 6 + syscalls.len());
+    }
+
+    #[test]
+    fn notify_fs_syscalls_present() {
+        // Representative open/stat/readlink entries must be in the list.
+        let listed = |nr: i64| NOTIFY_FS_SYSCALLS.contains(&nr);
+        assert!(listed(libc::SYS_openat) && listed(libc::SYS_open));
+        assert!(listed(libc::SYS_stat) && listed(libc::SYS_readlink));
+    }
 }
diff --git a/crates/evalbox-sys/src/seccomp_notify.rs b/crates/evalbox-sys/src/seccomp_notify.rs
new file mode 100644
index 0000000..91a2357
--- /dev/null
+++ b/crates/evalbox-sys/src/seccomp_notify.rs
@@ -0,0 +1,246 @@
+//! Seccomp user notification (`SECCOMP_RET_USER_NOTIF`) support.
+//!
+//! Seccomp user notification allows a supervisor process to intercept
+//! syscalls from a sandboxed child and make decisions on its behalf.
+//! This enables filesystem virtualization without user namespaces.
+//!
+//! ## Architecture
+//!
+//! 1. Child installs a seccomp filter with `SECCOMP_FILTER_FLAG_NEW_LISTENER`
+//! 2. This returns a "listener fd" which is passed to the parent via `SCM_RIGHTS`
+//! 3. Parent polls the listener fd; when readable, calls `SECCOMP_IOCTL_NOTIF_RECV`
+//! 4. Parent inspects the syscall and either:
+//!    - Returns `SECCOMP_USER_NOTIF_FLAG_CONTINUE` to let it proceed
+//!    - Returns an error code to deny it
+//!    - Uses `SECCOMP_IOCTL_NOTIF_ADDFD` to inject a file descriptor
+//!
+//! ## TOCTOU Protection
+//!
+//! Between receiving a notification and responding, the child's memory may change.
+//! Always call `SECCOMP_IOCTL_NOTIF_ID_VALID` after reading child memory to verify
+//! the notification is still valid.
+
+use std::os::fd::{FromRawFd, OwnedFd};
+
+use rustix::io::Errno;
+
+use crate::last_errno;
+use crate::seccomp::SockFprog;
+
+// seccomp(2) operation and flag used to install a filter that returns a listener fd.
+const SECCOMP_SET_MODE_FILTER: u32 = 1;
+pub const SECCOMP_FILTER_FLAG_NEW_LISTENER: u32 = 1 << 3;
+
+/// Response flag: let the intercepted syscall proceed as-is (supervisor approves).
+pub const SECCOMP_USER_NOTIF_FLAG_CONTINUE: u32 = 1;
+
+/// Addfd flag: atomically inject the fd and respond to the notification.
+pub const SECCOMP_ADDFD_FLAG_SEND: u32 = 1 << 0;
+/// Addfd flag: replace an existing fd in the target process.
+pub const SECCOMP_ADDFD_FLAG_SETFD: u32 = 1 << 1;
+
+// ioctl request numbers for seccomp notify (from kernel headers).
+// These are architecture-dependent; the values below are for x86_64.
+// SECCOMP_IOCTL_NOTIF_RECV = SECCOMP_IOWR(0, struct seccomp_notif)
+// SECCOMP_IOCTL_NOTIF_SEND = SECCOMP_IOWR(1, struct seccomp_notif_resp)
+// SECCOMP_IOCTL_NOTIF_ID_VALID = SECCOMP_IOW(2, __u64)
+// SECCOMP_IOCTL_NOTIF_ADDFD = SECCOMP_IOW(3, struct seccomp_notif_addfd)
+
+/// ioctl request: receive a notification from the seccomp listener fd.
+pub const SECCOMP_IOCTL_NOTIF_RECV: u64 = 0xc0502100;
+/// ioctl request: send a response to a seccomp notification.
+pub const SECCOMP_IOCTL_NOTIF_SEND: u64 = 0xc0182101;
+/// ioctl request: check if a notification ID is still valid (TOCTOU protection).
+pub const SECCOMP_IOCTL_NOTIF_ID_VALID: u64 = 0x40082102;
+/// ioctl request: inject a file descriptor into the notifying process.
+pub const SECCOMP_IOCTL_NOTIF_ADDFD: u64 = 0x40182103;
+
+/// Description of the intercepted syscall (mirrors kernel `struct seccomp_data`).
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Default)]
+pub struct SeccompData {
+    /// Syscall number.
+    pub nr: i32,
+    /// Architecture the syscall was made under (`AUDIT_ARCH_*`).
+    pub arch: u32,
+    /// Instruction pointer at time of syscall.
+    pub instruction_pointer: u64,
+    /// The six raw syscall arguments, in order.
+    pub args: [u64; 6],
+}
+
+/// Seccomp notification received from the child via `notif_recv` (mirrors `struct seccomp_notif`).
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+pub struct SeccompNotif {
+    /// Unique notification ID; echoed back in responses and validity checks.
+    pub id: u64,
+    /// PID of the notifying process (in supervisor's PID namespace).
+    pub pid: u32,
+    /// Flags (currently unused, must be 0).
+    pub flags: u32,
+    /// The intercepted syscall: number, architecture, instruction pointer, arguments.
+    pub data: SeccompData,
+}
+
+impl Default for SeccompNotif {
+    fn default() -> Self {
+        // SAFETY: every field is an integer or an integer array, so all-zero bytes are valid.
+        unsafe { std::mem::zeroed() }
+    }
+}
+
+/// Response to a seccomp notification (mirrors kernel `struct seccomp_notif_resp`).
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Default)]
+pub struct SeccompNotifResp {
+    /// Must match the notification ID.
+    pub id: u64,
+    /// Return value for the syscall.
+    pub val: i64,
+    /// Error result; seccomp_unotify(2) documents it as 0 or a *negative* errno — verify callers.
+    pub error: i32,
+    /// Flags (e.g., `SECCOMP_USER_NOTIF_FLAG_CONTINUE`).
+    pub flags: u32,
+}
+
+/// Request to inject a file descriptor into the notifying process
+/// (mirrors kernel `struct seccomp_notif_addfd`).
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Default)]
+pub struct SeccompNotifAddfd {
+    /// Must match the notification ID.
+    pub id: u64,
+    /// Flags (e.g., `SECCOMP_ADDFD_FLAG_SEND`).
+    pub flags: u32,
+    /// The fd in the supervisor to inject.
+    pub srcfd: u32,
+    /// Target fd number (0 = kernel picks). NOTE(review): presumably needs `SECCOMP_ADDFD_FLAG_SETFD`.
+    pub newfd: u32,
+    /// Flags for the new fd (e.g., `O_CLOEXEC`).
+    pub newfd_flags: u32,
+}
+
+/// Install a seccomp filter and obtain its user-notification listener fd.
+///
+/// Sets `PR_SET_NO_NEW_PRIVS` itself, then loads `fprog` with
+/// `SECCOMP_FILTER_FLAG_NEW_LISTENER`. The returned listener fd can be used
+/// to receive notifications.
+///
+/// # Safety
+///
+/// The filter must be a valid BPF program. This permanently restricts
+/// syscalls for this thread.
+///
+/// # Errors
+///
+/// Returns `Errno` if `prctl` fails or the filter cannot be installed.
+pub unsafe fn seccomp_set_mode_filter_listener(fprog: &SockFprog) -> Result<OwnedFd, Errno> {
+    // SAFETY: `prctl` is called with plain integer arguments only.
+    if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
+        return Err(last_errno());
+    }
+    // SAFETY: the caller guarantees `fprog` describes a valid BPF program.
+    let listener = unsafe {
+        libc::syscall(
+            libc::SYS_seccomp,
+            SECCOMP_SET_MODE_FILTER,
+            SECCOMP_FILTER_FLAG_NEW_LISTENER,
+            fprog as *const SockFprog,
+        )
+    };
+    if listener < 0 {
+        return Err(last_errno());
+    }
+    // SAFETY: On success, the return value is a valid listener file descriptor.
+    Ok(unsafe { OwnedFd::from_raw_fd(listener as i32) })
+}
+
+/// Receive a notification from the seccomp listener fd into `notif`.
+///
+/// Blocks until a notification is available (or use poll/epoll first).
+/// `notif` is zeroed before the call: since Linux 5.5 the kernel rejects a
+/// non-zero buffer with `EINVAL`, which would break reusing it in a loop.
+///
+/// # Errors
+///
+/// Returns `Errno` on failure (e.g., `ENOENT` if the target died).
+pub fn notif_recv(listener_fd: i32, notif: &mut SeccompNotif) -> Result<(), Errno> {
+    // Clear whatever a previous notification left behind.
+    *notif = SeccompNotif::default();
+    let buf: *mut SeccompNotif = notif;
+    // SAFETY: `buf` points to a live, writable `SeccompNotif` for the whole call.
+    let ret = unsafe { libc::ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_RECV, buf) };
+    if ret < 0 { Err(last_errno()) } else { Ok(()) }
+}
+
+/// Deliver the supervisor's response for a pending seccomp notification
+/// via `SECCOMP_IOCTL_NOTIF_SEND`.
+///
+/// # Errors
+///
+/// Returns `Errno` on failure.
+pub fn notif_send(listener_fd: i32, resp: &SeccompNotifResp) -> Result<(), Errno> {
+    let resp_ptr: *const SeccompNotifResp = resp;
+    // SAFETY: `resp_ptr` comes from a live reference that outlives the call.
+    let ret = unsafe { libc::ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_SEND, resp_ptr) };
+    if ret < 0 {
+        return Err(last_errno());
+    }
+    Ok(())
+}
+
+/// Check whether a notification ID is still valid.
+///
+/// Call this after reading the child's `/proc/pid/mem` to guard against TOCTOU attacks.
+///
+/// # Errors
+///
+/// Returns `Errno::NOENT` if the notification is no longer valid.
+pub fn notif_id_valid(listener_fd: i32, id: u64) -> Result<(), Errno> {
+    let id_ptr: *const u64 = &id;
+    let ret = unsafe { libc::ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_ID_VALID, id_ptr) };
+    if ret >= 0 { Ok(()) } else { Err(last_errno()) }
+}
+
+/// Install a supervisor-owned file descriptor into the notifying process.
+///
+/// When `SECCOMP_ADDFD_FLAG_SEND` is set, the fd is injected and the
+/// notification is answered in one atomic step (the return value becomes
+/// the new fd number in the target process).
+///
+/// # Errors
+///
+/// Returns `Errno` on failure.
+pub fn notif_addfd(listener_fd: i32, addfd: &SeccompNotifAddfd) -> Result<i32, Errno> {
+    let addfd_ptr: *const SeccompNotifAddfd = addfd;
+    // SAFETY: `addfd_ptr` comes from a live reference that outlives the call.
+    let ret = unsafe { libc::ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_ADDFD, addfd_ptr) };
+    if ret < 0 {
+        return Err(last_errno());
+    }
+    // Non-negative: hand the kernel's return value back unchanged.
+    Ok(ret)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn struct_sizes() {
+        // Layout sizes must agree with what the kernel expects.
+        assert_eq!(std::mem::size_of::<SeccompData>(), 64);
+        assert_eq!(std::mem::size_of::<SeccompNotif>(), 80);
+        assert_eq!(std::mem::size_of::<SeccompNotifResp>(), 24);
+        assert_eq!(std::mem::size_of::<SeccompNotifAddfd>(), 24);
+    }
+
+    #[test]
+    fn default_notif_is_zeroed() {
+        let SeccompNotif { id, pid, data, .. } = SeccompNotif::default();
+        assert_eq!(id, 0);
+        assert_eq!(pid, 0);
+        assert_eq!(data.nr, 0);
+    }
+}
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 9f22c66..adeca15 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -2,7 +2,7 @@
 
 ## Overview
 
-evalbox is a secure sandbox for executing untrusted code on Linux. It provides millisecond-startup isolation using Linux namespaces, Landlock LSM, and seccomp-BPF.
+evalbox is a secure sandbox for executing untrusted code on Linux. It provides millisecond-startup isolation using Landlock LSM v5, seccomp-BPF, and rlimits — no namespaces, no containers, no root.
 
 ```
 ┌─────────────────────────────────────────────────────────────────┐
@@ -29,23 +29,23 @@ evalbox is a secure sandbox for executing untrusted code on Linux. It provides m
 │  └──────────────────────────────────────────────────────────┘   │
 │  ┌──────────────────────────────────────────────────────────┐   │
 │  │                     Isolation                             │   │
-│  │   • Namespaces (user, pid, net, mount, uts, ipc)         │   │
-│  │   • pivot_root + minimal rootfs                          │   │
-│  │   • Landlock filesystem rules                            │   │
-│  │   • Seccomp syscall filter                               │   │
+│  │   • Landlock v5 (filesystem, network, signal, IPC)       │   │
+│  │   • Seccomp-BPF (syscall whitelist)                      │   │
+│  │   • rlimits (memory, CPU, PIDs, fds)                     │   │
+│  │   • Privilege hardening (securebits, capability drop)    │   │
 │  └──────────────────────────────────────────────────────────┘   │
 └─────────────────────────────────────────────────────────────────┘
                                 │
                                 ▼
 ┌─────────────────────────────────────────────────────────────────┐
 │                      evalbox-sys                                 │
-│   Raw Linux syscalls: clone3, pidfd, seccomp, landlock          │
+│   Raw Linux syscalls: seccomp, landlock, seccomp_notify          │
 └─────────────────────────────────────────────────────────────────┘
                                 │
                                 ▼
 ┌─────────────────────────────────────────────────────────────────┐
 │                       Linux Kernel                               │
-│   namespaces │ seccomp-bpf │ landlock │ cgroups │ rlimits       │
+│           seccomp-bpf │ landlock │ rlimits                       │
 └─────────────────────────────────────────────────────────────────┘
 ```
 
@@ -68,17 +68,18 @@ evalbox/
 │       ├── workspace.rs     # Temporary filesystem setup
 │       ├── monitor.rs       # Process monitoring, output capture
 │       ├── isolation/       # Isolation mechanisms
-│       │   ├── namespace.rs # User/PID/Net namespace setup
-│       │   ├── rootfs.rs    # Mount namespace, pivot_root
-│       │   └── lockdown.rs  # Landlock + seccomp application
+│       │   ├── lockdown.rs  # Landlock v5 + securebits + cap drop
+│       │   └── rlimits.rs   # Resource limits
+│       ├── notify/          # Seccomp user notify (optional)
 │       ├── validate.rs      # Input validation
-│       └── sysinfo.rs       # System detection (Nix, paths)
+│       └── resolve.rs       # Binary resolution
 │
 └── evalbox-sys/             # Low-level syscalls
     └── src/
         ├── seccomp.rs       # BPF filter generation
+        ├── seccomp_notify.rs # Seccomp user notify support
         ├── landlock.rs      # Landlock ruleset API
-        └── check.rs         # Capability detection
+        └── check.rs         # System capability detection
 ```
 
 ---
@@ -132,13 +133,6 @@ loop {
 }
 ```
 
-### Platform Behavior
-
-| Platform | Process Monitoring | I/O Multiplexing |
-|----------|-------------------|------------------|
-| Linux    | pidfd + epoll     | mio (epoll)      |
-| macOS    | vsock to VM       | mio (kqueue)     |
-
 ---
 
 ## Sandbox Lifecycle
@@ -156,85 +150,93 @@ loop {
 ┌──────────────────────────────────────────────────────────────────┐
 │  2. WORKSPACE PREPARATION                                        │
 │     • Create tempdir (/tmp/evalbox-XXXXX)                       │
-│     • Setup directory structure (/work, /tmp, /etc)             │
-│     • Write user files                                           │
-│     • Create pipes (stdin, stdout, stderr)                      │
+│     • Create writable directories: /work, /tmp, /home           │
+│     • Write user files to /work                                  │
+│     • Create pipes (stdin, stdout, stderr) + eventfd sync       │
 └──────────────────────────────────────────────────────────────────┘
                                 │
                                 ▼
 ┌──────────────────────────────────────────────────────────────────┐
-│  3. CLONE WITH NAMESPACES                                        │
-│     clone3(CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET |        │
-│            CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)           │
+│  3. FORK                                                         │
+│     fork() — plain fork, no CLONE_NEW* flags                    │
 │                                                                  │
 │     Parent                          Child                        │
 │       │                               │                          │
-│       ├─ Write UID/GID maps           ├─ Wait for parent        │
-│       ├─ Signal ready ────────────────►                          │
-│       │                               ├─ Setup isolation        │
-│       │                               │   (see step 4)          │
-│       ▼                               ▼                          │
+│       ├─ Open pidfd                   ├─ Close parent pipe ends  │
+│       ├─ Wait for child ready         ├─ Setup stdio (dup2)     │
+│       ├─ Signal to proceed            ├─ chdir(workspace/work)  │
+│       ▼                               ├─ Apply lockdown (step 4)│
+│                                       ▼                          │
 └──────────────────────────────────────────────────────────────────┘
                                 │
                                 ▼
 ┌──────────────────────────────────────────────────────────────────┐
-│  4. CHILD ISOLATION SETUP                                        │
+│  4. CHILD LOCKDOWN (irreversible)                                │
 │                                                                  │
 │     ┌─────────────────────────────────────────────────────────┐ │
-│     │  a) Mount namespace                                      │ │
-│     │     • Bind mount /usr, /lib, /lib64 (read-only)         │ │
-│     │     • Bind mount workspace to /work                      │ │
-│     │     • Mount minimal /dev (null, zero, urandom)          │ │
-│     │     • pivot_root to new root                            │ │
-│     │     • Unmount old root                                   │ │
+│     │  a) NO_NEW_PRIVS                                        │ │
+│     │     prctl(PR_SET_NO_NEW_PRIVS) — required before        │ │
+│     │     Landlock and seccomp                                │ │
 │     └─────────────────────────────────────────────────────────┘ │
 │     ┌─────────────────────────────────────────────────────────┐ │
-│     │  b) Landlock (kernel 5.13+)                              │ │
-│     │     • Create ruleset with FS restrictions               │ │
-│     │     • Allow read-only: /usr, /lib, /bin, /etc           │ │
-│     │     • Allow read-write: /work, /tmp                     │ │
-│     │     • Enforce ruleset                                    │ │
-│     │     (See SECURITY.md for details)                       │ │
+│     │  b) Landlock v5                                         │ │
+│     │     • Filesystem: read-only /usr, /lib, /etc, /bin      │ │
+│     │       read-write workspace/work, /tmp, /home            │ │
+│     │     • Network: block TCP bind + connect (ABI 4+)        │ │
+│     │     • Signals: block cross-sandbox signals (ABI 5)      │ │
+│     │     • IPC: block abstract unix sockets (ABI 5)          │ │
 │     └─────────────────────────────────────────────────────────┘ │
 │     ┌─────────────────────────────────────────────────────────┐ │
-│     │  c) Seccomp BPF                                          │ │
-│     │     • Load syscall whitelist filter                     │ │
-│     │     • Block dangerous syscalls (ptrace, mount, etc.)    │ │
-│     │     • Filter clone() flags, socket() domains            │ │
-│     │     • Filter dangerous ioctls (TIOCSTI, etc.)           │ │
-│     │     (See SECURITY.md for full policy)                   │ │
+│     │  c) Resource limits (rlimits)                           │ │
+│     │     • RLIMIT_DATA: 256 MiB memory                       │ │
+│     │     • RLIMIT_CPU: timeout * 2 + 60s                     │ │
+│     │     • RLIMIT_NPROC: 64 processes                        │ │
+│     │     • RLIMIT_NOFILE: 256 file descriptors               │ │
+│     │     • RLIMIT_FSIZE: 16 MiB output                       │ │
+│     │     • RLIMIT_CORE: 0 (disabled)                         │ │
+│     │     • RLIMIT_STACK: 8 MiB                               │ │
 │     └─────────────────────────────────────────────────────────┘ │
 │     ┌─────────────────────────────────────────────────────────┐ │
-│     │  d) Resource limits (rlimits)                           │ │
-│     │     • RLIMIT_AS: Memory limit                           │ │
-│     │     • RLIMIT_NPROC: Process limit                       │ │
-│     │     • RLIMIT_NOFILE: File descriptor limit              │ │
-│     │     • RLIMIT_FSIZE: Output file size limit              │ │
+│     │  d) Securebits + capability drop                        │ │
+│     │     • Lock NOROOT, NO_SETUID_FIXUP, KEEP_CAPS,          │ │
+│     │       NO_CAP_AMBIENT_RAISE                              │ │
+│     │     • Drop all 64 capabilities                          │ │
 │     └─────────────────────────────────────────────────────────┘ │
 └──────────────────────────────────────────────────────────────────┘
                                 │
                                 ▼
 ┌──────────────────────────────────────────────────────────────────┐
-│  5. EXECVE TARGET PROGRAM                                        │
-│     execve("/usr/bin/python", ["python", "-c", code], env)      │
+│  5. SECCOMP FILTERS                                              │
+│     • [Optional] Install notify filter for FS syscall            │
+│       interception, send listener fd to parent via SCM_RIGHTS   │
+│     • Install kill filter — whitelist of ~100 safe syscalls     │
+│     • Argument filtering: clone flags, socket domains, ioctls   │
+│     • Violation = SECCOMP_RET_KILL_PROCESS (SIGSYS)             │
+└──────────────────────────────────────────────────────────────────┘
+                                │
+                                ▼
+┌──────────────────────────────────────────────────────────────────┐
+│  6. SIGNAL PARENT + WAIT + EXEC                                  │
+│     • Signal parent readiness (eventfd)                          │
+│     • Wait for parent go-ahead (eventfd)                        │
+│     • close_range(3, MAX, 0) — close all fds except 0,1,2      │
+│     • execve(binary, args, env)                                  │
 │                                                                  │
-│     • All isolation is now permanent                            │
-│     • Seccomp filter cannot be removed                          │
-│     • Landlock rules cannot be relaxed                          │
+│     All isolation is now permanent and cannot be undone.         │
 └──────────────────────────────────────────────────────────────────┘
                                 │
                                 ▼
 ┌──────────────────────────────────────────────────────────────────┐
-│  6. PARENT MONITORS                                              │
+│  7. PARENT MONITORS                                              │
 │     • Poll pidfd for process exit                               │
 │     • Read stdout/stderr via pipes                              │
 │     • Enforce timeout (kill if exceeded)                        │
-│     • Track output size (truncate if exceeded)                  │
+│     • Track output size (kill if exceeded)                      │
 └──────────────────────────────────────────────────────────────────┘
                                 │
                                 ▼
 ┌──────────────────────────────────────────────────────────────────┐
-│  7. CLEANUP                                                      │
+│  8. CLEANUP                                                      │
 │     • Collect exit status                                        │
 │     • Remove workspace tempdir                                   │
 │     • Return Output { stdout, stderr, exit_code, signal }       │
@@ -245,7 +247,7 @@ loop {
 
 ## Security Architecture
 
-evalbox implements **defense in depth** with 7 independent isolation layers:
+evalbox implements **defense in depth** with independent isolation mechanisms:
 
 ```
 ┌─────────────────────────────────────────────────────────────┐
@@ -254,42 +256,16 @@ evalbox implements **defense in depth** with 7 independent isolation layers:
           │
           ▼
 ┌─────────────────────────────────────────────────────────────┐
-│  Layer 1: User Namespace                                     │
-│  • UID 0 inside = real user outside                         │
-│  • No capabilities in parent namespace                      │
-└─────────────────────────────────────────────────────────────┘
-          │
-          ▼
-┌─────────────────────────────────────────────────────────────┐
-│  Layer 2: PID Namespace                                      │
-│  • Isolated process tree (PID 1 inside)                     │
-│  • Cannot see/signal host processes                         │
-└─────────────────────────────────────────────────────────────┘
-          │
-          ▼
-┌─────────────────────────────────────────────────────────────┐
-│  Layer 3: Network Namespace                                  │
-│  • Empty by default (no interfaces)                         │
-│  • Cannot access host network                               │
+│  Landlock v5                                                 │
+│  • Filesystem: read-only system paths, read-write workspace │
+│  • Network: block TCP bind + connect (ABI 4+)               │
+│  • Signals: block cross-sandbox signals (ABI 5)             │
+│  • IPC: block abstract unix sockets (ABI 5)                 │
 └─────────────────────────────────────────────────────────────┘
           │
           ▼
 ┌─────────────────────────────────────────────────────────────┐
-│  Layer 4: Mount Namespace + pivot_root                       │
-│  • Minimal rootfs (no /proc, /sys, /home)                   │
-│  • Host filesystem completely unmounted                     │
-└─────────────────────────────────────────────────────────────┘
-          │
-          ▼
-┌─────────────────────────────────────────────────────────────┐
-│  Layer 5: Landlock LSM                                       │
-│  • Kernel-enforced filesystem rules                         │
-│  • Read-only binaries, read-write workspace only            │
-└─────────────────────────────────────────────────────────────┘
-          │
-          ▼
-┌─────────────────────────────────────────────────────────────┐
-│  Layer 6: Seccomp BPF                                        │
+│  Seccomp BPF                                                 │
 │  • ~100 allowed syscalls (whitelist)                        │
 │  • Blocks ptrace, mount, clone(NEWUSER), AF_NETLINK         │
 │  • SIGSYS on violation (immediate termination)              │
@@ -297,12 +273,20 @@ evalbox implements **defense in depth** with 7 independent isolation layers:
           │
           ▼
 ┌─────────────────────────────────────────────────────────────┐
-│  Layer 7: Resource Limits                                    │
+│  Resource Limits                                             │
 │  • Memory, CPU, processes, file descriptors                 │
 │  • Prevents DoS attacks                                      │
+└─────────────────────────────────────────────────────────────┘
+          │
+          ▼
+┌─────────────────────────────────────────────────────────────┐
+│  Privilege Hardening                                         │
+│  • NO_NEW_PRIVS — cannot gain privileges via exec           │
+│  • Securebits locked — cannot regain capabilities           │
+│  • All 64 capabilities dropped                              │
 └─────────────────────────────────────────────────────────────┘
 
-For detailed security policy and threat model, see SECURITY.md
+For detailed security policy and threat model, see SECURITY_MODEL.md
 ```
 
 ---
@@ -344,7 +328,7 @@ BPF Program Flow:
                KILL ALLOW  KILL ALLOW  KILL ALLOW ALLOW KILL
 ```
 
-For the complete syscall policy, see [SECURITY.md](SECURITY.md#syscall-policy).
+For the complete syscall policy, see [SECURITY_MODEL.md](SECURITY_MODEL.md#syscall-policy).
 
 ---
 
@@ -352,31 +336,15 @@ For the complete syscall policy, see [SECURITY.md](SECURITY.md#syscall-policy).
 
 ```
 /tmp/evalbox-XXXXX/           Workspace root (tmpdir)
-├── root/                     New root filesystem
-│   ├── work/                 User workspace (read-write)
-│   │   ├── script.py         User files
-│   │   └── data.json
-│   ├── tmp/                  Temporary files (read-write)
-│   ├── etc/                  Minimal config
-│   │   ├── passwd            nobody user
-│   │   ├── group             nogroup
-│   │   ├── hosts             localhost
-│   │   └── resolv.conf       DNS (if network enabled)
-│   ├── dev/                  Minimal devices
-│   │   ├── null
-│   │   ├── zero
-│   │   ├── urandom
-│   │   └── fd → /proc/self/fd
-│   ├── usr/ ──────────────── Bind mount (read-only)
-│   ├── lib/ ──────────────── Bind mount (read-only)
-│   ├── lib64/ ────────────── Bind mount (read-only)
-│   └── bin/ ──────────────── Symlink to /usr/bin
-│
-├── stdin                     Input pipe
-├── stdout                    Output pipe
-└── stderr                    Error pipe
+├── work/                     User workspace (read-write via Landlock)
+│   ├── script.py             User files
+│   └── data.json
+├── tmp/                      Temporary files (read-write via Landlock)
+└── home/                     Home directory (read-write via Landlock)
 ```
 
+The workspace is a plain tempdir. No `pivot_root`, no bind mounts, no special rootfs. Landlock rules control which real filesystem paths are accessible.
+
 ---
 
 ## Design Principles
@@ -384,26 +352,26 @@ For the complete syscall policy, see [SECURITY.md](SECURITY.md#syscall-policy).
 ### 1. Simple as eval()
 ```rust
 // One function call to run code safely
-let output = python::run("print('hello')", &config)?;
+let output = python::run("print('hello')").exec()?;
 ```
 
 ### 2. Defense in Depth
-Every isolation mechanism works independently. A bypass of one layer doesn't compromise the sandbox. See [SECURITY.md](SECURITY.md#defense-in-depth).
+Each isolation mechanism works independently. Landlock controls filesystem and network access, seccomp blocks dangerous syscalls, rlimits prevent resource exhaustion. See [SECURITY_MODEL.md](SECURITY_MODEL.md#defense-in-depth).
 
 ### 3. Unprivileged
 - No root required
 - No daemon/service
-- Uses user namespaces
+- No namespaces needed — Landlock + seccomp work unprivileged with `NO_NEW_PRIVS`
 
 ### 4. Minimal Attack Surface
 - Small syscall whitelist (~100 syscalls)
-- Minimal filesystem
-- No /proc, /sys by default
+- Landlock restricts filesystem to minimal paths
+- All capabilities dropped
 
 ### 5. Fast
 - ~5ms sandbox creation
-- No VM boot
-- No container image pull
+- No VM boot, no container image pull
+- Plain `fork()` + lockdown
 
 ### 6. Embeddable
 - Library, not a service
@@ -414,12 +382,11 @@ Every isolation mechanism works independently. A bypass of one layer doesn't com
 
 ## System Requirements
 
-| Requirement | Minimum | Recommended |
-|-------------|---------|-------------|
-| Linux Kernel | 5.13 | 6.1+ |
-| User Namespaces | Required | - |
-| Landlock | Required (ABI 1) | ABI 4 |
-| Seccomp | Required | - |
+| Requirement | Minimum |
+|-------------|---------|
+| Linux Kernel | 6.12 |
+| Landlock | ABI 5 |
+| Seccomp | Required |
 
 Check compatibility:
 ```bash
@@ -430,8 +397,7 @@ evalbox check
 
 ## References
 
-- [SECURITY.md](SECURITY.md) - Detailed security model and threat analysis
-- [ROADMAP.md](ROADMAP.md) - Planned features
-- [Linux namespaces](https://man7.org/linux/man-pages/man7/namespaces.7.html)
+- [Security Model](SECURITY_MODEL.md) - Detailed security model and threat analysis
+- [Roadmap](ROADMAP.md) - Planned features
 - [Landlock LSM](https://docs.kernel.org/userspace-api/landlock.html)
 - [Seccomp BPF](https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html)
diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md
index fb8fba7..32e49f2 100644
--- a/docs/ROADMAP.md
+++ b/docs/ROADMAP.md
@@ -1,5 +1,32 @@
 # Roadmap
 
+## Security Hardening
+
+### Block UDP exfiltration via seccomp
+
+Landlock only controls TCP (`LANDLOCK_ACCESS_NET_{BIND,CONNECT}_TCP`). A sandboxed process can create a `SOCK_DGRAM` socket and `sendto()` data to any IP without Landlock blocking it.
+
+**Fix:** Block `SOCK_DGRAM` in the seccomp socket filter when `plan.network_blocked`. DNS inside the sandbox already doesn't work, so this breaks nothing.
+
+**Tracking:** Landlock ABI v8 RFC patches (Dec 2025) propose `LANDLOCK_ACCESS_NET_{BIND,CONNECT,SENDTO}_UDP`. Once merged, seccomp filtering can be relaxed.
+
+### Restrict /proc access
+
+`/proc` is currently Landlock read-only. Landlock's ptrace scoping already blocks access to `environ`, `maps`, `fd/` of processes outside the sandbox domain. However, `/proc/[pid]/cmdline` is world-readable (`0444`, no ptrace check) — any host process's command line is visible.
+
+**Options:**
+- Remove `/proc` from Landlock entirely (breaks programs that read `/proc/self/`)
+- Accept `cmdline` leak as residual risk (low impact for single-user)
+- Wait for kernel support: `hidepid=` requires mount namespace, Landlock can't target `/proc/self` (magic symlink resolves to fixed inode at `open()` time)
+
+### PID namespace (optional)
+
+Without a PID namespace, the sandbox can enumerate host PIDs via `/proc`. Combined with `cmdline` being world-readable, this is an information leak. Adding `CLONE_NEWPID` back would fully isolate the process tree, but requires re-introducing namespace setup code.
+
+**Trade-off:** Adds ~0.5ms and complexity. Not needed for single-user code execution, useful for multi-tenant deployments.
+
+---
+
 ## Supervised Execution Mode
 
 Intercept syscalls before execution for AI CLI tools and interactive approval.
diff --git a/docs/SECURITY.md b/docs/SECURITY.md
deleted file mode 100644
index e3ebdf2..0000000
--- a/docs/SECURITY.md
+++ /dev/null
@@ -1,354 +0,0 @@
-# evalbox Security Model
-
-## Defense in Depth
-
-evalbox uses **7 independent isolation layers**. Each layer provides protection even if another layer is bypassed.
-
-```
-┌─────────────────────────────────────────────────────────────┐
-│  Layer 1  │         User Namespaces          │  Identity    │
-├───────────┼──────────────────────────────────┼──────────────┤
-│  Layer 2  │          PID Namespace           │  Process     │
-├───────────┼──────────────────────────────────┼──────────────┤
-│  Layer 3  │        Network Namespace         │  Network     │
-├───────────┼──────────────────────────────────┼──────────────┤
-│  Layer 4  │   Mount Namespace + pivot_root   │  Filesystem  │
-├───────────┼──────────────────────────────────┼──────────────┤
-│  Layer 5  │          Landlock LSM            │  FS Rules    │
-├───────────┼──────────────────────────────────┼──────────────┤
-│  Layer 6  │          Seccomp BPF             │  Syscalls    │
-├───────────┼──────────────────────────────────┼──────────────┤
-│  Layer 7  │           rlimits                │  Resources   │
-└───────────┴──────────────────────────────────┴──────────────┘
-```
-
----
-
-## Isolation Layers
-
-### Layer 1: User Namespaces
-
-User namespaces provide identity isolation.
-
-| Inside Sandbox | Outside Sandbox |
-|----------------|-----------------|
-| UID 0 (root) | Real user UID |
-| GID 0 (root) | Real user GID |
-| Full capabilities | No capabilities |
-
-**Security properties:**
-- Cannot access host user's files (different UID)
-- Capabilities only valid inside namespace
-- Cannot escalate to real root
-
-### Layer 2: PID Namespace
-
-Process isolation prevents interference with host processes.
-
-```
-Host PID Namespace          Sandbox PID Namespace
-┌───────────────────┐      ┌───────────────────┐
-│  PID 1 (init)     │      │  PID 1 (sandbox)  │
-│  PID 1234 (shell) │      │  PID 2 (python)   │
-│  PID 5678 (...)   │      │  PID 3 (child)    │
-└───────────────────┘      └───────────────────┘
-         │                          │
-         │    ✗ Cannot see          │
-         │◄─────────────────────────┤
-         │    ✗ Cannot signal       │
-```
-
-**Security properties:**
-- Sandbox sees only its own processes
-- Cannot enumerate host processes
-- Cannot send signals to host processes
-- kill() safe inside namespace
-
-### Layer 3: Network Namespace
-
-Network isolation blocks all network access by default.
-
-```
-┌─────────────────────────────────────────┐
-│           Host Network                   │
-│  eth0: 192.168.1.100                    │
-│  lo: 127.0.0.1                          │
-│  docker0: 172.17.0.1                    │
-└─────────────────────────────────────────┘
-              ✗ No access
-┌─────────────────────────────────────────┐
-│         Sandbox Network                  │
-│  (empty - no interfaces)                │
-│                                         │
-│  • No loopback                          │
-│  • No external access                   │
-│  • socket() works but connect() fails  │
-└─────────────────────────────────────────┘
-```
-
-**Security properties:**
-- Cannot connect to localhost services
-- Cannot access local network
-- Cannot exfiltrate data via network
-- Optional: Enable with `.network(true)`
-
-### Layer 4: Mount Namespace + pivot_root
-
-Filesystem isolation provides a minimal, controlled view.
-
-```
-Host Filesystem              Sandbox Filesystem
-/                            /
-├── home/                    ├── work/        ← User workspace (rw)
-│   └── user/ ✗              ├── tmp/         ← Temp files (rw)
-├── etc/                     ├── etc/         ← Minimal config
-│   └── shadow ✗             │   ├── passwd   (nobody)
-├── root/ ✗                  │   └── hosts    (localhost)
-├── proc/ ✗                  ├── dev/         ← Minimal devices
-├── sys/ ✗                   │   ├── null
-├── usr/  ───────────────────┼── usr/         ← Bind mount (ro)
-├── lib/  ───────────────────┼── lib/         ← Bind mount (ro)
-└── lib64/ ──────────────────┼── lib64/       ← Bind mount (ro)
-                             └── (host root unmounted)
-```
-
-**Security properties:**
-- Cannot access /home, /root
-- Cannot read /etc/shadow, /etc/passwd (host)
-- Cannot access /proc (no process info)
-- Cannot access /sys (no kernel info)
-- Host filesystem completely unmounted
-
-### Layer 5: Landlock LSM
-
-Kernel-enforced filesystem access control (requires Linux 5.13+).
-
-```rust
-// Landlock ruleset
-Ruleset {
-    read_only: ["/usr", "/lib", "/lib64", "/bin", "/etc"],
-    read_write: ["/work", "/tmp"],
-    execute: ["/usr/bin", "/bin"],
-    no_access: [everything else],
-}
-```
-
-**Landlock ABI versions:**
-| ABI | Kernel | Features |
-|-----|--------|----------|
-| 1 | 5.13 | Basic filesystem |
-| 2 | 5.19 | Truncate control |
-| 3 | 6.2 | File permissions |
-| 4 | 6.7 | Network TCP control |
-
-**Security properties:**
-- Enforced at kernel level (bypass-resistant)
-- Works even if mount namespace bypassed
-- Cannot be disabled after application
-
-### Layer 6: Seccomp BPF
-
-Syscall filtering with immediate termination on violation.
-
-**Filter approach:** Whitelist (allow known-safe syscalls, kill on others)
-
-**Blocked syscall categories:**
-| Category | Syscalls | Reason |
-|----------|----------|--------|
-| Namespaces | `clone(CLONE_NEW*)`, `unshare`, `setns` | Prevent new namespaces |
-| Mounting | `mount`, `umount`, `pivot_root` | Prevent FS manipulation |
-| Debugging | `ptrace`, `process_vm_*` | Prevent process injection |
-| Kernel | `reboot`, `kexec_load`, `init_module` | Prevent system damage |
-| Privilege | `setuid`, `setgid`, `setgroups` | Prevent escalation |
-| Keyring | `keyctl` | Not namespaced |
-| eBPF | `bpf` | Kernel attack surface |
-
-**Argument filtering:**
-| Syscall | Blocked Arguments | Reason |
-|---------|-------------------|--------|
-| `clone` | `CLONE_NEWUSER`, `CLONE_NEWNET`, etc. | Block namespace creation |
-| `socket` | `AF_NETLINK`, `SOCK_RAW` | Block kernel interfaces |
-| `ioctl` | `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` | Block terminal injection |
-
-**Violation behavior:** `SECCOMP_RET_KILL_PROCESS` (SIGSYS, signal 31)
-
-### Layer 7: Resource Limits
-
-Prevent denial-of-service attacks.
-
-| Resource | Limit | Purpose |
-|----------|-------|---------|
-| `RLIMIT_AS` | 256 MB | Memory limit |
-| `RLIMIT_NPROC` | 64 | Fork bomb prevention |
-| `RLIMIT_NOFILE` | 256 | File descriptor limit |
-| `RLIMIT_FSIZE` | 10 MB | Output file size |
-| Timeout | 30s | CPU time limit |
-
----
-
-## Syscall Policy
-
-### Allowed Syscalls (~100)
-
-```
-Basic I/O:     read, write, close, lseek, pread64, pwrite64
-File ops:      openat, stat, fstat, access, readlink
-Memory:        mmap, mprotect, munmap, brk, mremap
-Process:       fork, vfork, execve, exit, exit_group, wait4
-Signals:       rt_sigaction, rt_sigprocmask, rt_sigreturn
-Time:          clock_gettime, nanosleep, gettimeofday
-Sockets:       socket*, connect, bind, listen, accept, send*, recv*
-Events:        epoll_*, poll, select
-```
-
-### Blocked Syscalls (examples)
-
-```
-Dangerous:     ptrace, mount, reboot, kexec_load, init_module
-Namespaces:    clone3, unshare, setns (blocked or filtered)
-Privilege:     setuid, setgid, setresuid, setresgid
-Kernel:        bpf, perf_event_open, keyctl
-Fileless:      memfd_create, execveat (together enable fileless exec)
-```
-
-### Special Handling
-
-| Syscall | Handling |
-|---------|----------|
-| `clone` | Allowed, but `CLONE_NEW*` flags blocked |
-| `clone3` | Returns `ENOSYS` (glibc falls back to `clone`) |
-| `socket` | Allowed, but `AF_NETLINK` and `SOCK_RAW` blocked |
-| `ioctl` | Allowed, but `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` blocked |
-
----
-
-## Threat Model
-
-### In Scope (Protected Against)
-
-| Threat | Mitigation |
-|--------|------------|
-| **Arbitrary code execution** | Sandboxed environment |
-| **Filesystem escape** | Namespaces + Landlock + pivot_root |
-| **Network access** | Network namespace (empty) |
-| **Process injection** | PID namespace + ptrace blocked |
-| **Privilege escalation** | User namespace + seccomp |
-| **Resource exhaustion** | rlimits + timeouts |
-| **Fork bombs** | RLIMIT_NPROC |
-| **Terminal injection** | TIOCSTI/TIOCLINUX blocked |
-| **Fileless malware** | memfd_create + execveat blocked |
-
-### Out of Scope
-
-| Threat | Reason |
-|--------|--------|
-| **Kernel exploits** | Requires kernel hardening (grsecurity, etc.) |
-| **Side-channel attacks** | Spectre/Meltdown require CPU mitigations |
-| **Container breakout via 0-day** | Defense in depth limits impact |
-| **Covert channels** | Timing-based data exfiltration possible |
-
-### CVE Protection
-
-evalbox's seccomp policy blocks attack vectors for many kernel CVEs:
-
-| CVE | Attack Vector | Blocked By |
-|-----|---------------|------------|
-| CVE-2024-1086 | AF_NETLINK + nf_tables | Socket filtering |
-| CVE-2022-0185 | fsconfig + user namespace | CLONE_NEWUSER blocked |
-| CVE-2022-0492 | cgroups + user namespace | CLONE_NEWUSER blocked |
-| CVE-2017-5226 | TIOCSTI terminal injection | ioctl filtering |
-| CVE-2019-13272 | ptrace PTRACE_TRACEME | ptrace blocked |
-| CVE-2021-3490 | eBPF verifier bypass | bpf blocked |
-
----
-
-## Filesystem Access
-
-### Default Mounts
-
-| Path | Access | Source | Purpose |
-|------|--------|--------|---------|
-| `/work` | Read-Write | Workspace | User files |
-| `/tmp` | Read-Write | tmpfs | Temporary files |
-| `/usr` | Read-Only | Host | Binaries, libraries |
-| `/lib` | Read-Only | Host | Shared libraries |
-| `/lib64` | Read-Only | Host | 64-bit libraries |
-| `/etc` | Read-Only | Generated | Minimal config |
-| `/dev` | Read-Only | Generated | null, zero, urandom |
-
-### Not Mounted (Blocked)
-
-| Path | Contains | Risk if Accessible |
-|------|----------|-------------------|
-| `/home` | User data | Data theft |
-| `/root` | Root home | Credential theft |
-| `/proc` | Process info | Info leak, escape vectors |
-| `/sys` | Kernel interfaces | Kernel manipulation |
-| `/var` | System state | Log manipulation |
-| `/run` | Runtime data | Socket access |
-
----
-
-## Verification
-
-### Security Tests
-
-Run the security test suite to verify isolation:
-
-```bash
-# Run all security tests
-cargo test -p evalbox-sandbox --test security_tests -- --ignored
-
-# Run specific category
-cargo test -p evalbox-sandbox --test security_tests seccomp -- --ignored
-cargo test -p evalbox-sandbox --test security_tests filesystem -- --ignored
-cargo test -p evalbox-sandbox --test security_tests network -- --ignored
-cargo test -p evalbox-sandbox --test security_tests cve -- --ignored
-```
-
-### Test Coverage
-
-| Category | Tests | Coverage |
-|----------|-------|----------|
-| Seccomp | 9 | ptrace, mount, reboot, clone, socket, keyctl, bpf |
-| Filesystem | 8 | /etc/shadow, /root, path traversal, symlinks |
-| Network | 5 | External, localhost, loopback, DNS |
-| Resources | 7 | Timeout, memory, PIDs, output limit |
-| CVE | 10 | Real-world exploits blocked |
-
-### Manual Verification
-
-```bash
-# Try to read /etc/shadow (should fail)
-evalbox shell "cat /etc/shadow"
-
-# Try to access network (should fail)
-evalbox shell "curl https://example.com"
-
-# Try ptrace (should be killed with SIGSYS)
-evalbox shell "strace ls"
-```
-
----
-
-## Production Requirements
-
-To deploy evalbox securely, ensure your system meets these requirements:
-
-| Requirement | How to Verify |
-|-------------|---------------|
-| Kernel 5.13+ with Landlock | `cat /sys/kernel/security/lsm` should include `landlock` |
-| User namespaces enabled | `cat /proc/sys/kernel/unprivileged_userns_clone` should be `1` |
-| Seccomp enabled | `grep SECCOMP /boot/config-$(uname -r)` |
-| Unprivileged BPF disabled | `sysctl kernel.unprivileged_bpf_disabled=1` (recommended) |
-
-Run `evalbox check` to verify all requirements automatically.
-
----
-
-## References
-
-- [Architecture Overview](ARCHITECTURE.md)
-- [Linux Namespaces](https://man7.org/linux/man-pages/man7/namespaces.7.html)
-- [Landlock Documentation](https://docs.kernel.org/userspace-api/landlock.html)
-- [Seccomp BPF](https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html)
diff --git a/docs/SECURITY_MODEL.md b/docs/SECURITY_MODEL.md
new file mode 100644
index 0000000..21c12a2
--- /dev/null
+++ b/docs/SECURITY_MODEL.md
@@ -0,0 +1,286 @@
+# evalbox Security Model
+
+## Defense in Depth
+
+evalbox uses **independent isolation mechanisms**. Each provides protection even if another is bypassed.
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│           │      Landlock v5          │  Filesystem, Network│
+│           │                           │  Signal, IPC        │
+├───────────┼───────────────────────────┼─────────────────────┤
+│           │      Seccomp BPF          │  Syscalls            │
+├───────────┼───────────────────────────┼─────────────────────┤
+│           │      rlimits              │  Resources           │
+├───────────┼───────────────────────────┼─────────────────────┤
+│           │   Privilege Hardening     │  NO_NEW_PRIVS,       │
+│           │                           │  securebits, caps    │
+└───────────┴───────────────────────────┴─────────────────────┘
+```
+
+---
+
+## Isolation Mechanisms
+
+### Landlock v5
+
+Kernel-enforced access control (requires Linux 6.12+, Landlock ABI 5).
+
+No namespaces or `pivot_root` needed — Landlock operates on real filesystem paths.
+
+**Filesystem rules:**
+```
+read-only:   /usr, /lib, /lib64, /bin, /etc, /proc, /nix/store (if present)
+read-write:  workspace/work, workspace/tmp, workspace/home
+read+write:  /dev (for /dev/null, /dev/zero, /dev/urandom)
+no access:   everything else
+```
+
+**Network control (ABI 4+):**
+- Blocks `LANDLOCK_ACCESS_NET_BIND_TCP`
+- Blocks `LANDLOCK_ACCESS_NET_CONNECT_TCP`
+- Optional: enable with `.network(true)`
+
+**Signal isolation (ABI 5):**
+- `LANDLOCK_SCOPE_SIGNAL` — blocks signals to processes outside the sandbox
+
+**IPC isolation (ABI 5):**
+- `LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET` — blocks connections to abstract unix sockets outside the sandbox
+
+**Landlock ABI versions:**
+| ABI | Kernel | Features |
+|-----|--------|----------|
+| 1 | 5.13 | Basic filesystem |
+| 2 | 5.19 | File reparenting across directories (`LANDLOCK_ACCESS_FS_REFER`) |
+| 3 | 6.2 | Truncate control (`LANDLOCK_ACCESS_FS_TRUNCATE`) |
+| 4 | 6.7 | Network TCP control |
+| 5 | 6.12 | Signal + abstract unix socket scoping |
+
+**Security properties:**
+- Enforced at kernel level (bypass-resistant)
+- Cannot be disabled after application
+- Works unprivileged with `NO_NEW_PRIVS`
+
+### Seccomp BPF
+
+Syscall filtering with immediate termination on violation.
+
+**Filter approach:** Whitelist (allow known-safe syscalls, kill on others)
+
+**Blocked syscall categories:**
+| Category | Syscalls | Reason |
+|----------|----------|--------|
+| Namespaces | `clone(CLONE_NEW*)`, `unshare`, `setns` | Prevent new namespaces |
+| Mounting | `mount`, `umount`, `pivot_root` | Prevent FS manipulation |
+| Debugging | `ptrace`, `process_vm_*` | Prevent process injection |
+| Kernel | `reboot`, `kexec_load`, `init_module` | Prevent system damage |
+| Privilege | `setuid`, `setgid`, `setgroups` | Prevent escalation |
+| Keyring | `keyctl` | Not namespaced |
+| eBPF | `bpf` | Kernel attack surface |
+| Fileless | `memfd_create`, `execveat` | Bypass Landlock |
+
+**Argument filtering:**
+| Syscall | Blocked Arguments | Reason |
+|---------|-------------------|--------|
+| `clone` | `CLONE_NEWUSER`, `CLONE_NEWNET`, `CLONE_NEWNS`, `CLONE_NEWPID`, `CLONE_NEWIPC`, `CLONE_NEWUTS`, `CLONE_NEWCGROUP` | Block namespace creation |
+| `clone3` | Entirely blocked (returns `ENOSYS`) | Cannot inspect flags in userspace struct |
+| `socket` | `AF_NETLINK`, `SOCK_RAW` | Block kernel interfaces |
+| `ioctl` | `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` | Block terminal injection |
+
+**Violation behavior:** `SECCOMP_RET_KILL_PROCESS` (SIGSYS, signal 31)
+
+### Resource Limits
+
+Prevent denial-of-service attacks via kernel-enforced rlimits.
+
+| Resource | Limit | Purpose |
+|----------|-------|---------|
+| `RLIMIT_DATA` | 256 MiB | Memory usage |
+| `RLIMIT_CPU` | timeout * 2 + 60s | CPU time limit |
+| `RLIMIT_FSIZE` | 16 MiB | Output file size |
+| `RLIMIT_NOFILE` | 256 | File descriptor limit |
+| `RLIMIT_NPROC` | 64 | Fork bomb prevention |
+| `RLIMIT_CORE` | 0 | Core dumps disabled |
+| `RLIMIT_STACK` | 8 MiB | Stack size |
+
+Note: `RLIMIT_AS` (virtual address space) is intentionally not set. Modern runtimes like Go, Java, and V8 pre-allocate large virtual ranges but only commit small portions.
+
+### Privilege Hardening
+
+Permanent privilege reduction applied before seccomp:
+
+| Mechanism | Effect |
+|-----------|--------|
+| `PR_SET_NO_NEW_PRIVS` | Cannot gain privileges via exec (setuid, file caps) |
+| `SECBIT_NOROOT` (locked) | Root has no special privilege |
+| `SECBIT_NO_SETUID_FIXUP` (locked) | Capabilities not adjusted on UID change |
+| `SECBIT_KEEP_CAPS` (locked) | Cannot retain capabilities when switching from UID 0 to a non-zero UID |
+| `SECBIT_NO_CAP_AMBIENT_RAISE` (locked) | Cannot set ambient capabilities |
+| Drop all 64 capabilities | No capability-based operations possible |
+
+---
+
+## Syscall Policy
+
+### Allowed Syscalls (~100)
+
+```
+Basic I/O:     read, write, close, lseek, pread64, pwrite64
+File ops:      openat, stat, fstat, access, readlink
+Memory:        mmap, mprotect, munmap, brk, mremap
+Process:       fork, vfork, execve, exit, exit_group, wait4
+Signals:       rt_sigaction, rt_sigprocmask, rt_sigreturn, kill, tgkill
+Time:          clock_gettime, nanosleep, gettimeofday
+Sockets:       socket*, connect, bind, listen, accept, send*, recv*
+Events:        epoll_*, poll, select
+```
+
+Note: `kill` and `tgkill` are allowed because Landlock ABI 5 provides signal scoping — signals can only reach processes within the sandbox.
+
+### Blocked Syscalls (examples)
+
+```
+Dangerous:     ptrace, mount, reboot, kexec_load, init_module
+Namespaces:    clone3, unshare, setns (blocked or filtered)
+Privilege:     setuid, setgid, setresuid, setresgid
+Kernel:        bpf, perf_event_open, keyctl
+Fileless:      memfd_create, execveat (together enable fileless exec)
+```
+
+### Special Handling
+
+| Syscall | Handling |
+|---------|----------|
+| `clone` | Allowed, but `CLONE_NEW*` flags blocked |
+| `clone3` | Returns `ENOSYS` (glibc falls back to `clone`) |
+| `socket` | Allowed, but `AF_NETLINK` and `SOCK_RAW` blocked |
+| `ioctl` | Allowed, but `TIOCSTI`, `TIOCSETD`, `TIOCLINUX` blocked |
+
+---
+
+## Threat Model
+
+### In Scope (Protected Against)
+
+| Threat | Mitigation |
+|--------|------------|
+| **Arbitrary code execution** | Sandboxed environment |
+| **Filesystem escape** | Landlock v5 path rules |
+| **Network access** | Landlock TCP bind/connect control (ABI 4+) + seccomp socket filtering; UDP is not yet blocked (see [Roadmap](ROADMAP.md)) |
+| **Process injection** | ptrace blocked by seccomp |
+| **Privilege escalation** | NO_NEW_PRIVS + seccomp + capability drop |
+| **Resource exhaustion** | rlimits + timeouts |
+| **Fork bombs** | RLIMIT_NPROC |
+| **Terminal injection** | TIOCSTI/TIOCLINUX blocked by seccomp |
+| **Fileless malware** | memfd_create + execveat blocked by seccomp |
+| **Cross-sandbox signals** | Landlock signal scoping (ABI 5) |
+| **Abstract unix socket abuse** | Landlock IPC scoping (ABI 5) |
+
+### Out of Scope
+
+| Threat | Reason |
+|--------|--------|
+| **Kernel exploits** | Requires kernel hardening (grsecurity, etc.) |
+| **Side-channel attacks** | Spectre/Meltdown require CPU mitigations |
+| **Container breakout via 0-day** | Defense in depth limits impact |
+| **Covert channels** | Timing-based data exfiltration possible |
+
+### CVE Protection
+
+evalbox's seccomp policy blocks attack vectors for many kernel CVEs:
+
+| CVE | Attack Vector | Blocked By |
+|-----|---------------|------------|
+| CVE-2024-1086 | AF_NETLINK + nf_tables | Socket filtering |
+| CVE-2022-0185 | fsconfig + user namespace | CLONE_NEWUSER blocked |
+| CVE-2022-0492 | cgroups + user namespace | CLONE_NEWUSER blocked |
+| CVE-2017-5226 | TIOCSTI terminal injection | ioctl filtering |
+| CVE-2019-13272 | ptrace PTRACE_TRACEME | ptrace blocked |
+| CVE-2021-3490 | eBPF verifier bypass | bpf blocked |
+
+---
+
+## Filesystem Access
+
+### Accessible Paths (via Landlock)
+
+| Path | Access | Purpose |
+|------|--------|---------|
+| `workspace/work` | Read-Write | User files |
+| `workspace/tmp` | Read-Write | Temporary files |
+| `workspace/home` | Read-Write | Home directory |
+| `/usr` | Read-Only + Execute | Binaries, libraries |
+| `/lib` | Read-Only + Execute | Shared libraries |
+| `/lib64` | Read-Only + Execute | 64-bit libraries |
+| `/bin` | Read-Only + Execute | Binaries |
+| `/etc` | Read-Only | System config |
+| `/proc` | Read-Only | Process info (no execute) |
+| `/dev` | Read + Write | null, zero, urandom |
+| `/nix/store` | Read-Only + Execute | NixOS paths (if present) |
+
+### Not Accessible
+
+| Path | Contains | Risk if Accessible |
+|------|----------|-------------------|
+| `/home` (host) | User data | Data theft |
+| `/root` | Root home | Credential theft |
+| `/sys` | Kernel interfaces | Kernel manipulation |
+| `/var` | System state | Log manipulation |
+| `/run` | Runtime data | Socket access |
+
+---
+
+## Verification
+
+### Security Tests
+
+Run the security test suite to verify isolation:
+
+```bash
+# Run all security tests
+cargo test -p evalbox-sandbox --test security_tests -- --ignored
+
+# Run specific category
+cargo test -p evalbox-sandbox --test security_tests seccomp -- --ignored
+cargo test -p evalbox-sandbox --test security_tests filesystem -- --ignored
+cargo test -p evalbox-sandbox --test security_tests network -- --ignored
+cargo test -p evalbox-sandbox --test security_tests cve -- --ignored
+```
+
+Or via Nix:
+
+```bash
+nix run .#test-all
+```
+
+### Test Coverage
+
+| Category | Tests | Coverage |
+|----------|-------|----------|
+| Seccomp | 9 | ptrace, mount, reboot, clone, socket, keyctl, bpf |
+| Filesystem | 8 | /etc/shadow, /root, path traversal, symlinks |
+| Network | 5 | External, localhost, loopback, DNS |
+| Resources | 7 | Timeout, memory, PIDs, output limit |
+| CVE | 10 | Real-world exploits blocked |
+
+---
+
+## Production Requirements
+
+| Requirement | How to Verify |
+|-------------|---------------|
+| Kernel 6.12+ | `uname -r` |
+| Landlock ABI 5 | `cat /sys/kernel/security/lsm` should include `landlock` (shows Landlock is enabled; use `evalbox check` to confirm the ABI level) |
+| Seccomp enabled | `grep SECCOMP /boot/config-$(uname -r)` |
+| Unprivileged BPF disabled | `sysctl kernel.unprivileged_bpf_disabled` should print `1` or `2` (recommended) |
+
+Run `evalbox check` to verify all requirements automatically.
+
+---
+
+## References
+
+- [Architecture](ARCHITECTURE.md)
+- [Security Policy](../SECURITY.md)
+- [Landlock Documentation](https://docs.kernel.org/userspace-api/landlock.html)
+- [Seccomp BPF](https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html)
diff --git a/flake.lock b/flake.lock
index 82db2d7..90964ff 100644
--- a/flake.lock
+++ b/flake.lock
@@ -1,12 +1,60 @@
 {
   "nodes": {
+    "crane": {
+      "locked": {
+        "lastModified": 1771121070,
+        "narHash": "sha256-aIlv7FRXF9q70DNJPI237dEDAznSKaXmL5lfK/Id/bI=",
+        "owner": "ipetkov",
+        "repo": "crane",
+        "rev": "a2812c19f1ed2e5ed5ce2ef7109798b575c180e1",
+        "type": "github"
+      },
+      "original": {
+        "owner": "ipetkov",
+        "repo": "crane",
+        "type": "github"
+      }
+    },
+    "flake-parts": {
+      "inputs": {
+        "nixpkgs-lib": "nixpkgs-lib"
+      },
+      "locked": {
+        "lastModified": 1769996383,
+        "narHash": "sha256-AnYjnFWgS49RlqX7LrC4uA+sCCDBj0Ry/WOJ5XWAsa0=",
+        "owner": "hercules-ci",
+        "repo": "flake-parts",
+        "rev": "57928607ea566b5db3ad13af0e57e921e6b12381",
+        "type": "github"
+      },
+      "original": {
+        "owner": "hercules-ci",
+        "repo": "flake-parts",
+        "type": "github"
+      }
+    },
+    "import-tree": {
+      "locked": {
+        "lastModified": 1771045967,
+        "narHash": "sha256-oYO4poyw0Sb/db2PigqugMlDwsvwLg6CSpFrMUWxA3Q=",
+        "owner": "vic",
+        "repo": "import-tree",
+        "rev": "c968d3b54d12cf5d9c13f16f7c545a06c9d1fde6",
+        "type": "github"
+      },
+      "original": {
+        "owner": "vic",
+        "repo": "import-tree",
+        "type": "github"
+      }
+    },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1770562336,
-        "narHash": "sha256-ub1gpAONMFsT/GU2hV6ZWJjur8rJ6kKxdm9IlCT0j84=",
+        "lastModified": 1771008912,
+        "narHash": "sha256-gf2AmWVTs8lEq7z/3ZAsgnZDhWIckkb+ZnAo5RzSxJg=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "d6c71932130818840fc8fe9509cf50be8c64634f",
+        "rev": "a82ccc39b39b621151d6732718e3e250109076fa",
         "type": "github"
       },
       "original": {
@@ -16,8 +64,26 @@
         "type": "github"
       }
     },
+    "nixpkgs-lib": {
+      "locked": {
+        "lastModified": 1769909678,
+        "narHash": "sha256-cBEymOf4/o3FD5AZnzC3J9hLbiZ+QDT/KDuyHXVJOpM=",
+        "owner": "nix-community",
+        "repo": "nixpkgs.lib",
+        "rev": "72716169fe93074c333e8d0173151350670b824c",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-community",
+        "repo": "nixpkgs.lib",
+        "type": "github"
+      }
+    },
     "root": {
       "inputs": {
+        "crane": "crane",
+        "flake-parts": "flake-parts",
+        "import-tree": "import-tree",
         "nixpkgs": "nixpkgs",
         "rust-overlay": "rust-overlay"
       }
@@ -29,11 +95,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1770865833,
-        "narHash": "sha256-oiARqnlvaW6pVGheVi4ye6voqCwhg5hCcGish2ZvQzI=",
+        "lastModified": 1771297684,
+        "narHash": "sha256-wieWskQxZLPlNXX06JEB0bMoS/ZYQ89xBzF0RL9lyLs=",
         "owner": "oxalica",
         "repo": "rust-overlay",
-        "rev": "c8cfbe26238638e2f3a2c0ae7e8d240f5e4ded85",
+        "rev": "755d3669699a7c62aef35af187d75dc2728cfd85",
         "type": "github"
       },
       "original": {
diff --git a/flake.nix b/flake.nix
index e708a37..ccc5bef 100644
--- a/flake.nix
+++ b/flake.nix
@@ -3,47 +3,18 @@
 
   inputs = {
     nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
-    rust-overlay.url = "github:oxalica/rust-overlay";
-    rust-overlay.inputs.nixpkgs.follows = "nixpkgs";
+    flake-parts.url = "github:hercules-ci/flake-parts"; # provides lib.mkFlake, called in outputs
+    import-tree.url = "github:vic/import-tree"; # applied to ./nix in outputs
+    rust-overlay = {
+      url = "github:oxalica/rust-overlay";
+      inputs.nixpkgs.follows = "nixpkgs"; # reuse this flake's nixpkgs input
+    };
+    crane.url = "github:ipetkov/crane"; # provides mkLib, used in nix/toolchain.nix
   };
 
-  outputs = { self, nixpkgs, rust-overlay }:
-    let
-      systems = [ "x86_64-linux" "aarch64-linux" ];
-      forAllSystems = nixpkgs.lib.genAttrs systems;
-    in
-    {
-      devShells = forAllSystems (system:
-        let
-          pkgs = import nixpkgs {
-            inherit system;
-            overlays = [ rust-overlay.overlays.default ];
-          };
-
-          rust = pkgs.rust-bin.stable.latest.default.override {
-            extensions = [ "rust-src" "rust-analyzer" ];
-          };
-        in
-        {
-          default = pkgs.mkShell {
-            buildInputs = with pkgs; [
-              rust
-              pkg-config
-
-              mold
-              clang
-
-              python3
-              go
-            ];
-
-            RUST_BACKTRACE = "1";
-
-            shellHook = ''
-              echo "evalbox dev environment"
-              echo "Rust: $(rustc --version)"
-            '';
-          };
-        });
-    };
+  outputs = inputs@{ flake-parts, import-tree, ... }:
+    flake-parts.lib.mkFlake { inherit inputs; } {
+      imports = [ (import-tree ./nix) ]; # modules loaded from ./nix
+      systems = [ "x86_64-linux" ]; # only system this flake declares
+    };
 }
diff --git a/nix/devshell.nix b/nix/devshell.nix
new file mode 100644
index 0000000..001172d
--- /dev/null
+++ b/nix/devshell.nix
@@ -0,0 +1,30 @@
+# flake-parts module providing the default development shell.
+#
+# `toolchainWithExtensions` is a per-system module argument (set through
+# `_module.args` in nix/toolchain.nix).
+{ ... }:
+{
+  perSystem = { pkgs, toolchainWithExtensions, ... }:
+    let
+      # Path exported to the shell as RUST_SRC_PATH.
+      rustLibrarySrc =
+        "${toolchainWithExtensions}/lib/rustlib/src/rust/library";
+
+      # Tools made available inside the shell.
+      shellInputs = [
+        toolchainWithExtensions
+        pkgs.pkg-config
+        pkgs.gcc
+        pkgs.python3
+        pkgs.go
+      ];
+    in
+    {
+      devShells.default = pkgs.mkShell {
+        name = "evalbox-dev";
+        buildInputs = shellInputs;
+        RUST_SRC_PATH = rustLibrarySrc;
+        RUST_BACKTRACE = "1";
+      };
+    };
+}
diff --git a/nix/packages.nix b/nix/packages.nix
new file mode 100644
index 0000000..54b4f4c
--- /dev/null
+++ b/nix/packages.nix
@@ -0,0 +1,75 @@
+# flake-parts module defining the flake's packages and checks.
+#
+# `craneLib`, `src`, `commonArgs` and `cargoArtifacts` are per-system module
+# arguments (set through `_module.args` in nix/toolchain.nix).
+{ ... }:
+{
+  perSystem = { pkgs, craneLib, src, commonArgs, cargoArtifacts, ... }:
+    let
+      # Cargo sources plus every `*.c` file of the repository; used as the
+      # source of the security-test build below.
+      srcWithPayloads = pkgs.lib.cleanSourceWith {
+        src = ./..;
+        filter = path: type:
+          (craneLib.filterCargoSources path type)
+          || (builtins.match ".*\\.c$" path != null);
+      };
+    in {
+    packages = {
+      # Main package build, reusing the prebuilt dependency artifacts.
+      default = craneLib.buildPackage (commonArgs // {
+        inherit cargoArtifacts;
+      });
+
+      # Compiles (without running) the `security_tests` test target of
+      # `evalbox-sandbox` and installs the resulting executable(s) plus the
+      # files from the crate's `out/payloads` build directory.
+      security-test-bin = craneLib.mkCargoDerivation (commonArgs // {
+        inherit cargoArtifacts;
+        src = srcWithPayloads;
+        pnameSuffix = "-security-tests";
+        doCheck = false;
+        nativeBuildInputs = (commonArgs.nativeBuildInputs or []) ++ [ pkgs.jq pkgs.gcc ];
+        # The executable list lives under $TMPDIR (the build's own temporary
+        # directory) instead of a fixed /tmp path, so builds that share /tmp
+        # cannot overwrite each other's list. Cargo's stderr stays in the
+        # build log so compile failures can be diagnosed.
+        buildPhaseCargoCommand = ''
+          cargo test -p evalbox-sandbox --test security_tests \
+            --no-run --release --message-format=json \
+            | jq -r 'select(.executable != null) | .executable' \
+            > "$TMPDIR/test-bins.txt"
+        '';
+        installPhaseCommand = ''
+          mkdir -p "$out/bin/payloads"
+          while IFS= read -r bin; do
+            [ -f "$bin" ] && cp "$bin" "$out/bin/"
+          done < "$TMPDIR/test-bins.txt"
+          for dir in target/release/build/evalbox-sandbox-*/out/payloads; do
+            [ -d "$dir" ] && cp "$dir"/* "$out/bin/payloads/"
+          done
+        '';
+      });
+    };
+
+    checks = {
+      # Clippy on all targets; warnings are denied.
+      clippy = craneLib.cargoClippy (commonArgs // {
+        inherit cargoArtifacts;
+        cargoClippyExtraArgs = "--all-targets -- -D warnings";
+      });
+      # Formatting check over the cleaned Cargo sources.
+      fmt = craneLib.cargoFmt { inherit src; };
+      # Library tests only (`--lib`).
+      test = craneLib.cargoTest (commonArgs // {
+        inherit cargoArtifacts;
+        cargoTestExtraArgs = "--lib";
+      });
+      # Documentation build; rustdoc warnings are denied.
+      doc = craneLib.cargoDoc (commonArgs // {
+        inherit cargoArtifacts;
+        RUSTDOCFLAGS = "-D warnings";
+      });
+    };
+  };
+}
diff --git a/nix/toolchain.nix b/nix/toolchain.nix
new file mode 100644
index 0000000..3b90771
--- /dev/null
+++ b/nix/toolchain.nix
@@ -0,0 +1,48 @@
+# flake-parts module that instantiates nixpkgs with the rust-overlay and
+# hands the Rust toolchain and crane helpers to the other per-system modules
+# through `_module.args`.
+{ inputs, ... }:
+{
+  perSystem = { system, ... }:
+    let
+      # nixpkgs with the rust-overlay applied (provides `rust-bin`).
+      pkgs = import inputs.nixpkgs {
+        inherit system;
+        overlays = [ inputs.rust-overlay.overlays.default ];
+      };
+
+      # Latest stable toolchain; the one crane builds with.
+      stableToolchain = pkgs.rust-bin.stable.latest.default;
+
+      # Same toolchain with additional components requested.
+      toolchainWithExtensions = stableToolchain.override {
+        extensions = [ "rust-src" "rust-analyzer" "clippy" "rustfmt" ];
+      };
+
+      craneLib = (inputs.crane.mkLib pkgs).overrideToolchain stableToolchain;
+
+      # Repository sources filtered by crane's Cargo source cleaner.
+      src = craneLib.cleanCargoSource ./..;
+
+      # Name and version taken from the top-level Cargo.toml.
+      crateMeta = craneLib.crateNameFromCargoToml {
+        cargoToml = ./../Cargo.toml;
+      };
+
+      # Arguments shared by the crane derivations.
+      commonArgs = {
+        inherit src;
+        pname = crateMeta.pname;
+        version = crateMeta.version;
+        nativeBuildInputs = [ pkgs.pkg-config ];
+      };
+
+      # Dependency-only build whose artifacts the packages and checks reuse.
+      cargoArtifacts = craneLib.buildDepsOnly commonArgs;
+    in
+    {
+      _module.args = {
+        inherit pkgs craneLib toolchainWithExtensions src commonArgs cargoArtifacts;
+      };
+    };
+}