diff --git a/crates/exec-harness/preload/codspeed_preload.c b/crates/exec-harness/preload/codspeed_preload.c
index 0b908e04..418af143 100644
--- a/crates/exec-harness/preload/codspeed_preload.c
+++ b/crates/exec-harness/preload/codspeed_preload.c
@@ -7,8 +7,6 @@
 //
 // Environment variables:
 //   CODSPEED_BENCH_URI - The benchmark URI to report (required)
-//   CODSPEED_PRELOAD_LOCK - Set by the first process to prevent child processes
-//                          from re-initializing instrumentation
 
 #include <stdlib.h>
 #include <unistd.h>
@@ -22,8 +20,6 @@
 #define RUNNING_ON_VALGRIND 0
 #endif
 
-static const char *LOCK_ENV = "CODSPEED_PRELOAD_LOCK";
-
 // These constants are defined by the build script (build.rs) via -D flags
 #ifndef CODSPEED_URI_ENV
 #error "CODSPEED_URI_ENV must be defined by the build system"
@@ -54,14 +50,6 @@ __attribute__((constructor)) static void codspeed_preload_init(void) {
     return;
   }
 
-  // Check if another process already owns the instrumentation
-  if (getenv(LOCK_ENV)) {
-    return;
-  }
-
-  // Set the lock to prevent child processes from initializing
-  setenv(LOCK_ENV, "1", 1);
-
   g_bench_uri = getenv(URI_ENV);
   if (!g_bench_uri) {
     return;
diff --git a/crates/exec-harness/src/analysis/mod.rs b/crates/exec-harness/src/analysis/mod.rs
index 65e220d7..bc9db591 100644
--- a/crates/exec-harness/src/analysis/mod.rs
+++ b/crates/exec-harness/src/analysis/mod.rs
@@ -6,6 +6,7 @@ use crate::BenchmarkCommand;
 use crate::constants;
 use crate::uri;
 use instrument_hooks_bindings::InstrumentHooks;
+use std::path::PathBuf;
 use std::process::Command;
 
 mod ld_preload_check;
@@ -54,9 +55,14 @@ pub fn perform_with_valgrind(commands: Vec<BenchmarkCommand>) -> Result<()> {
         cmd.args(&benchmark_cmd.command[1..]);
         // Use LD_PRELOAD to inject instrumentation into the child process
         cmd.env("LD_PRELOAD", preload_lib_path);
+        cmd.env("PYTHONPERFSUPPORT", "1");
         cmd.env(constants::URI_ENV, &name_and_uri.uri);
 
-        let status = cmd.status().context("Failed to execute command")?;
+        let mut child = cmd.spawn().context("Failed to spawn command")?;
+
+        let status = child.wait().context("Failed to execute command")?;
+
+        bail_if_command_spawned_subprocesses_under_valgrind(child.id())?;
 
         if !status.success() {
             bail!("Command exited with non-zero status: {status}");
@@ -65,3 +71,35 @@ pub fn perform_with_valgrind(commands: Vec<BenchmarkCommand>) -> Result<()> {
 
     Ok(())
 }
+
+fn bail_if_command_spawned_subprocesses_under_valgrind(pid: u32) -> Result<()> {
+    let Some(profile_folder) = std::env::var_os("CODSPEED_PROFILE_FOLDER") else {
+        debug!("CODSPEED_PROFILE_FOLDER is not set, skipping subprocess detection");
+        return Ok(());
+    };
+
+    let profile_folder = PathBuf::from(profile_folder);
+
+    // Bail if any <pid>.out where <pid> > pid of the benchmark process exists in the profile
+    // folder, which indicates that the benchmark process spawned subprocesses.
+    for entry in std::fs::read_dir(profile_folder)? {
+        let entry = entry?;
+        let file_name = entry.file_name();
+        let file_name = file_name.to_string_lossy();
+
+        if let Some(stripped) = file_name.strip_suffix(".out") {
+            if let Ok(subprocess_pid) = stripped.parse::<u32>() {
+                if subprocess_pid > pid {
+                    bail!(
+                        "The codspeed CLI in CPU Simulation mode does not support measuring processes that spawn other processes yet.\n\n\
+                         Please either:\n\
+                         - Use the walltime measurement mode, or\n\
+                         - Benchmark a process that does not create subprocesses"
+                    )
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/src/executor/helpers/harvest_perf_maps_for_pids.rs b/src/executor/helpers/harvest_perf_maps_for_pids.rs
index 67270359..fc7b6ac3 100644
--- a/src/executor/helpers/harvest_perf_maps_for_pids.rs
+++ b/src/executor/helpers/harvest_perf_maps_for_pids.rs
@@ -1,14 +1,13 @@
 use crate::prelude::*;
-use std::collections::HashSet;
 use std::fs;
 use std::path::{Path, PathBuf};
 
-pub async fn harvest_perf_maps_for_pids(
-    profile_folder: &Path,
-    pids: &HashSet<libc::pid_t>,
-) -> Result<()> {
+pub async fn harvest_perf_maps_for_pids<I>(profile_folder: &Path, pids: I) -> Result<()>
+where
+    I: IntoIterator<Item = libc::pid_t>,
+{
     let perf_maps = pids
-        .iter()
+        .into_iter()
         .map(|pid| format!("perf-{pid}.map"))
         .map(|file_name| {
             (
diff --git a/src/executor/shared/fifo.rs b/src/executor/shared/fifo.rs
index 40f2cc81..723f921c 100644
--- a/src/executor/shared/fifo.rs
+++ b/src/executor/shared/fifo.rs
@@ -69,7 +69,6 @@ impl GenericFifo {
 pub struct FifoBenchmarkData {
     /// Name and version of the integration
     pub integration: Option<(String, String)>,
-    pub bench_pids: HashSet<pid_t>,
 }
 
 pub struct RunnerFifo {
@@ -255,10 +254,7 @@ impl RunnerFifo {
                     );
                     let marker_result =
                         ExecutionTimestamps::new(&bench_order_by_timestamp, &markers);
-                    let fifo_data = FifoBenchmarkData {
-                        integration,
-                        bench_pids,
-                    };
+                    let fifo_data = FifoBenchmarkData { integration };
                     return Ok((marker_result, fifo_data, exit_status));
                 }
                 Err(e) => return Err(anyhow::Error::from(e)),
diff --git a/src/executor/valgrind/helpers/perf_maps.rs b/src/executor/valgrind/helpers/perf_maps.rs
index c017a885..b9955cc2 100644
--- a/src/executor/valgrind/helpers/perf_maps.rs
+++ b/src/executor/valgrind/helpers/perf_maps.rs
@@ -19,5 +19,5 @@ pub async fn harvest_perf_maps(profile_folder: &Path) -> Result<()> {
         .filter_map(|pid| pid.parse().ok())
         .collect::<HashSet<_>>();
 
-    harvest_perf_maps_for_pids(profile_folder, &pids).await
+    harvest_perf_maps_for_pids(profile_folder, pids.iter().copied()).await
 }
diff --git a/src/executor/wall_time/perf/jit_dump.rs b/src/executor/wall_time/perf/jit_dump.rs
index a337f263..1948b236 100644
--- a/src/executor/wall_time/perf/jit_dump.rs
+++ b/src/executor/wall_time/perf/jit_dump.rs
@@ -4,10 +4,7 @@ use crate::{
 };
 use linux_perf_data::jitdump::{JitDumpReader, JitDumpRecord};
 use runner_shared::unwind_data::UnwindData;
-use std::{
-    collections::HashSet,
-    path::{Path, PathBuf},
-};
+use std::path::{Path, PathBuf};
 
 struct JitDump {
     path: PathBuf,
@@ -102,7 +99,10 @@ impl JitDump {
 }
 
 /// Converts all the `jit-<pid>.dump` into unwind data and copies it to the profile folder.
-pub async fn harvest_perf_jit_for_pids(profile_folder: &Path, pids: &HashSet<i32>) -> Result<()> {
+pub async fn harvest_perf_jit_for_pids<I>(profile_folder: &Path, pids: I) -> Result<()>
+where
+    I: IntoIterator<Item = libc::pid_t>,
+{
     for pid in pids {
         let name = format!("jit-{pid}.dump");
         let path = PathBuf::from("/tmp").join(&name);
@@ -131,7 +131,7 @@ pub async fn harvest_perf_jit_for_pids(profile_folder: &Path, pids: &HashSet<i32
         };
 
         for module in unwind_data {
-            module.save_to(profile_folder, *pid as _)?;
+            module.save_to(profile_folder, pid)?;
         }
     }
 
diff --git a/src/executor/wall_time/perf/mod.rs b/src/executor/wall_time/perf/mod.rs
index 7d861e61..c00b5cd7 100644
--- a/src/executor/wall_time/perf/mod.rs
+++ b/src/executor/wall_time/perf/mod.rs
@@ -105,9 +105,9 @@ impl PerfRunner {
             || config.command.contains("uv")
             || config.command.contains("python")
         {
-            // Max supported stack size is 64KiB, but this will increase the file size by a lot. In
-            // order to allow uploads and maintain accuracy, we limit this to 8KiB.
-            (UnwindingMode::Dwarf, Some(8 * 1024))
+            // Note that the higher the stack size, the larger the file, although it is mitigated
+            // by zstd compression
+            (UnwindingMode::Dwarf, Some(32 * 1024))
         } else {
             // Default to dwarf unwinding since it works well with most binaries.
             debug!("No call graph mode detected, defaulting to dwarf");
@@ -195,15 +195,10 @@ impl PerfRunner {
             .get()
             .expect("Benchmark order is not available");
 
-        // Harvest the perf maps generated by python. This will copy the perf
-        // maps from /tmp to the profile folder. We have to write our own perf
-        // maps to these files AFTERWARDS, otherwise it'll be overwritten!
-        harvest_perf_maps_for_pids(profile_folder, &bench_data.fifo_data.bench_pids).await?;
-        harvest_perf_jit_for_pids(profile_folder, &bench_data.fifo_data.bench_pids).await?;
-
         // Append perf maps, unwind info and other metadata
-        if let Err(BenchmarkDataSaveError::MissingIntegration) =
-            bench_data.save_to(profile_folder, &self.get_perf_file_path(profile_folder))
+        if let Err(BenchmarkDataSaveError::MissingIntegration) = bench_data
+            .save_to(profile_folder, &self.get_perf_file_path(profile_folder))
+            .await
         {
             warn!(
                 "Perf is enabled, but failed to detect benchmarks. If you wish to disable this warning, set CODSPEED_PERF_ENABLED=false"
@@ -273,16 +268,20 @@ pub struct BenchmarkData {
 pub enum BenchmarkDataSaveError {
     MissingIntegration,
     FailedToParsePerfFile,
+    FailedToHarvestPerfMaps,
+    FailedToHarvestJitDumps,
 }
 
 impl BenchmarkData {
-    pub fn save_to<P: AsRef<std::path::Path>>(
+    pub async fn save_to<P: AsRef<std::path::Path>>(
         &self,
         path: P,
         perf_file_path: P,
     ) -> Result<(), BenchmarkDataSaveError> {
         self.marker_result.save_to(&path).unwrap();
 
+        let path_ref = path.as_ref();
+
         debug!("Reading perf data from file for mmap extraction");
         let MemmapRecordsOutput {
             symbols_by_pid,
@@ -294,7 +293,22 @@ impl BenchmarkData {
             })?
         };
 
-        let path_ref = path.as_ref();
+        // Harvest the perf maps generated by python. This will copy the perf
+        // maps from /tmp to the profile folder. We have to write our own perf
+        // maps to these files AFTERWARDS, otherwise it'll be overwritten!
+        harvest_perf_maps_for_pids(path_ref, symbols_by_pid.keys().copied())
+            .await
+            .map_err(|e| {
+                error!("Failed to harvest perf maps: {e}");
+                BenchmarkDataSaveError::FailedToHarvestPerfMaps
+            })?;
+        harvest_perf_jit_for_pids(path_ref, symbols_by_pid.keys().copied())
+            .await
+            .map_err(|e| {
+                error!("Failed to harvest jit dumps: {e}");
+                BenchmarkDataSaveError::FailedToHarvestJitDumps
+            })?;
+
         debug!("Saving symbols addresses");
         symbols_by_pid.par_iter().for_each(|(_, proc_sym)| {
             proc_sym.save_to(path_ref).unwrap();