diff --git a/api/builder.go b/api/builder.go index d6ef0e5..2a8d647 100644 --- a/api/builder.go +++ b/api/builder.go @@ -3,16 +3,28 @@ package api import "github.com/sarchlab/akita/v4/sim" type defaultPortFactory struct { + incomingBufCap int + outgoingBufCap int } func (f defaultPortFactory) make(c sim.Component, name string) sim.Port { - return sim.NewPort(c, 1, 1, name) + incoming := f.incomingBufCap + if incoming <= 0 { + incoming = 1 + } + outgoing := f.outgoingBufCap + if outgoing <= 0 { + outgoing = 1 + } + return sim.NewPort(c, incoming, outgoing, name) } // DriverBuilder creates a new instance of Driver. type DriverBuilder struct { - engine sim.Engine - freq sim.Freq + engine sim.Engine + freq sim.Freq + portIncomingBufferCap int + portOutgoingBufferCap int } // WithEngine sets the engine. @@ -27,10 +39,20 @@ func (b DriverBuilder) WithFreq(freq sim.Freq) DriverBuilder { return b } +// WithPortBufferDepth configures driver boundary-port incoming/outgoing capacity. +func (b DriverBuilder) WithPortBufferDepth(incoming, outgoing int) DriverBuilder { + b.portIncomingBufferCap = incoming + b.portOutgoingBufferCap = outgoing + return b +} + // Build create a driver. func (b DriverBuilder) Build(name string) Driver { d := &driverImpl{ - portFactory: defaultPortFactory{}, + portFactory: defaultPortFactory{ + incomingBufCap: b.portIncomingBufferCap, + outgoingBufCap: b.portOutgoingBufferCap, + }, } d.TickingComponent = sim.NewTickingComponent(name, b.engine, b.freq, d) diff --git a/api/builder_microarch_test.go b/api/builder_microarch_test.go new file mode 100644 index 0000000..77fbcf8 --- /dev/null +++ b/api/builder_microarch_test.go @@ -0,0 +1,29 @@ +package api + +import ( + "testing" + + "github.com/sarchlab/akita/v4/sim" +) + +func TestDriverBuilderWithPortBufferDepth(t *testing.T) { + engine := sim.NewSerialEngine() + driver := DriverBuilder{}. + WithEngine(engine). + WithFreq(1*sim.GHz). + WithPortBufferDepth(3, 5). 
+ Build("Driver") + + impl, ok := driver.(*driverImpl) + if !ok { + t.Fatalf("expected *driverImpl, got %T", driver) + } + + factory, ok := impl.portFactory.(defaultPortFactory) + if !ok { + t.Fatalf("expected defaultPortFactory, got %T", impl.portFactory) + } + if factory.incomingBufCap != 3 || factory.outgoingBufCap != 5 { + t.Fatalf("unexpected driver port caps: in=%d out=%d", factory.incomingBufCap, factory.outgoingBufCap) + } +} diff --git a/api/driver.go b/api/driver.go index 41fa379..4a0e3c6 100644 --- a/api/driver.go +++ b/api/driver.go @@ -142,17 +142,23 @@ func (d *driverImpl) doOneFeedInTask(task *feedInTask) bool { err := port.Send(msg) //fmt.Println(msg) if err != nil { - panic("CGRA cannot handle the data rate") + // Keep task pending when downstream is temporarily back-pressured. + continue } - core.Trace("DataFlow", - "Behavior", "FeedIn", - slog.Float64("Time", float64(d.Engine.CurrentTime()*1e9)), - "Data", task.data[dataIndex], - "Color", task.color, - "From", port.Name(), - "To", task.remotePorts[i], - ) + timeValue := float64(d.Engine.CurrentTime() * 1e9) + if core.TraceEnabled() { + core.Trace("DataFlow", + "Behavior", "FeedIn", + slog.Float64("Time", timeValue), + "Data", task.data[dataIndex], + "Color", task.color, + "From", port.Name(), + "To", task.remotePorts[i], + ) + } else { + core.ObserveDataFlow("FeedIn", timeValue, port.Name(), string(task.remotePorts[i]), "", "") + } task.portRounds[i]++ madeProgress = true } @@ -202,15 +208,20 @@ func (d *driverImpl) doOneCollectTask(task *collectTask) bool { } task.data[dataIndex] = msg.Data.First() - core.Trace("DataFlow", - "Behavior", "Collect", - slog.Float64("Time", float64(d.Engine.CurrentTime()*1e9)), - "Data", msg.Data.First(), - "Pred", msg.Data.Pred, - "Color", task.color, - "From", task.ports[i].Name(), - "To", "None", - ) + timeValue := float64(d.Engine.CurrentTime() * 1e9) + if core.TraceEnabled() { + core.Trace("DataFlow", + "Behavior", "Collect", + slog.Float64("Time", timeValue), 
+ "Data", msg.Data.First(), + "Pred", msg.Data.Pred, + "Color", task.color, + "From", task.ports[i].Name(), + "To", "None", + ) + } else { + core.ObserveDataFlow("Collect", timeValue, task.ports[i].Name(), "None", "", "") + } task.portRounds[i]++ madeProgress = true diff --git a/api/feedin_backpressure_test.go b/api/feedin_backpressure_test.go new file mode 100644 index 0000000..3767429 --- /dev/null +++ b/api/feedin_backpressure_test.go @@ -0,0 +1,48 @@ +package api + +import ( + "testing" + + gomock "github.com/golang/mock/gomock" + "github.com/sarchlab/akita/v4/sim" +) + +func TestDoOneFeedInTaskBackpressureDoesNotPanic(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + engine := sim.NewSerialEngine() + d := &driverImpl{} + d.TickingComponent = sim.NewTickingComponent("Driver", engine, 1*sim.GHz, d) + + port := NewMockPort(ctrl) + port.EXPECT().CanSend().Return(true).Times(2) + port.EXPECT().Name().Return("mock-port").AnyTimes() + port.EXPECT().AsRemote().Return(sim.RemotePort("driver-local")).AnyTimes() + port.EXPECT().Send(gomock.Any()).Return(sim.NewSendError()).Times(1) + port.EXPECT().Send(gomock.Any()).Return(nil).Times(1) + + task := &feedInTask{ + data: []uint32{7}, + localPorts: []sim.Port{port}, + remotePorts: []sim.RemotePort{sim.RemotePort("device-remote")}, + stride: 1, + color: 0, + rounds: 1, + portRounds: []int{0}, + } + + if progressed := d.doOneFeedInTask(task); progressed { + t.Fatal("expected no progress when Send returns backpressure error") + } + if task.portRounds[0] != 0 { + t.Fatalf("expected round to stay 0 after backpressure, got %d", task.portRounds[0]) + } + + if progressed := d.doOneFeedInTask(task); !progressed { + t.Fatal("expected progress once backpressure clears") + } + if task.portRounds[0] != 1 { + t.Fatalf("expected round to advance to 1, got %d", task.portRounds[0]) + } +} diff --git a/config/config.go b/config/config.go index 875f499..9825c56 100644 --- a/config/config.go +++ b/config/config.go @@ 
-20,9 +20,19 @@ type DeviceBuilder struct { freq sim.Freq monitor *monitoring.Monitor //portFactory portFactory - width, height int - memoryMode string // simple or shared or local - memoryShare map[[2]int]int //map[[x, y]]GroupID + width, height int + memoryMode string // simple or shared or local + memoryShare map[[2]int]int //map[[x, y]]GroupID + executionPolicy string + strictMaxSlip int64 + strictFailOnViolation bool + corePortIncomingCap int + corePortOutgoingCap int + enableFIFOModel bool + enableQueueWatches bool + queueWatches []core.QueueWatchSpec + numRegisters int + localMemoryWords int } // type portFactory interface { @@ -74,6 +84,60 @@ func (d DeviceBuilder) WithMemoryShare(share map[[2]int]int) DeviceBuilder { return d } +// WithExecutionPolicy sets core execution policy. +func (d DeviceBuilder) WithExecutionPolicy(policy string) DeviceBuilder { + d.executionPolicy = policy + return d +} + +// WithStrictTimingConfig sets strict timing replay controls. +func (d DeviceBuilder) WithStrictTimingConfig(maxSlip int64, failOnViolation bool) DeviceBuilder { + d.strictMaxSlip = maxSlip + d.strictFailOnViolation = failOnViolation + return d +} + +// WithCorePortBufferDepth sets core port incoming/outgoing capacities. +func (d DeviceBuilder) WithCorePortBufferDepth(incoming, outgoing int) DeviceBuilder { + d.corePortIncomingCap = incoming + d.corePortOutgoingCap = outgoing + return d +} + +// WithEnableFIFOModel toggles FIFO-based core execution model. +func (d DeviceBuilder) WithEnableFIFOModel(enabled bool) DeviceBuilder { + d.enableFIFOModel = enabled + return d +} + +// WithEnableQueueWatches toggles optional queue-occupancy instrumentation. +func (d DeviceBuilder) WithEnableQueueWatches(enabled bool) DeviceBuilder { + d.enableQueueWatches = enabled + return d +} + +// WithQueueWatches sets optional queue watch definitions for all cores. 
+func (d DeviceBuilder) WithQueueWatches(queueWatches []core.QueueWatchSpec) DeviceBuilder { + if len(queueWatches) == 0 { + d.queueWatches = nil + return d + } + d.queueWatches = append([]core.QueueWatchSpec(nil), queueWatches...) + return d +} + +// WithRegisterCount sets register-file size per core. +func (d DeviceBuilder) WithRegisterCount(num int) DeviceBuilder { + d.numRegisters = num + return d +} + +// WithLocalMemoryWords sets local memory size (in words) per core. +func (d DeviceBuilder) WithLocalMemoryWords(words int) DeviceBuilder { + d.localMemoryWords = words + return d +} + // Build creates a CGRA device. func (d DeviceBuilder) Build(name string) cgra.Device { dev := &device{ @@ -188,6 +252,14 @@ func (d DeviceBuilder) createTiles( WithExitAddr(&exit). WithRetValAddr(&retVal). WithExitReqAddr(&exitReqTimestamp). + WithExecutionPolicy(d.executionPolicy). + WithStrictTimingConfig(d.strictMaxSlip, d.strictFailOnViolation). + WithPortBufferDepth(d.corePortIncomingCap, d.corePortOutgoingCap). + WithEnableFIFOModel(d.enableFIFOModel). + WithEnableQueueWatches(d.enableQueueWatches). + WithQueueWatches(d.queueWatches). + WithRegisterCount(d.numRegisters). + WithLocalMemoryWords(d.localMemoryWords). Build(coreName) if d.monitor != nil { diff --git a/config/config_microarch_test.go b/config/config_microarch_test.go new file mode 100644 index 0000000..eee7606 --- /dev/null +++ b/config/config_microarch_test.go @@ -0,0 +1,35 @@ +package config + +import ( + "testing" + + "github.com/sarchlab/akita/v4/sim" +) + +func TestDeviceBuilderLocalMemoryWordsPropagatesToTile(t *testing.T) { + engine := sim.NewSerialEngine() + dev := DeviceBuilder{}. + WithEngine(engine). + WithFreq(1 * sim.GHz). + WithWidth(1). + WithHeight(1). + WithMemoryMode("simple"). + WithLocalMemoryWords(32). 
+ Build("Device") + + tile := dev.GetTile(0, 0) + _ = tile.GetMemory(0, 0, 31) + + didPanic := false + func() { + defer func() { + if recover() != nil { + didPanic = true + } + }() + _ = tile.GetMemory(0, 0, 32) + }() + if !didPanic { + t.Fatal("expected out-of-range panic at address 32 with local_memory_words=32") + } +} diff --git a/core/builder.go b/core/builder.go index e6c3a9d..a1b5e38 100644 --- a/core/builder.go +++ b/core/builder.go @@ -1,17 +1,30 @@ package core import ( + "os" + "strings" + "github.com/sarchlab/akita/v4/sim" "github.com/sarchlab/zeonica/cgra" ) // Builder can create new cores. type Builder struct { - engine sim.Engine - freq sim.Freq - exitAddr *bool - retValAddr *uint32 - exitReqAddr *float64 + engine sim.Engine + freq sim.Freq + exitAddr *bool + retValAddr *uint32 + exitReqAddr *float64 + executionPolicy string + strictMaxSlip int64 + strictFailOnViolation bool + portIncomingBufferCap int + portOutgoingBufferCap int + enableFIFOModel bool + enableQueueWatches bool + queueWatches []QueueWatchSpec + numRegisters int + localMemoryWords int } // WithEngine sets the engine. @@ -43,15 +56,98 @@ func (b Builder) WithExitReqAddr(exitReqAddr *float64) Builder { return b } +// WithExecutionPolicy sets the execution policy for issue-time gating. +func (b Builder) WithExecutionPolicy(policy string) Builder { + b.executionPolicy = policy + return b +} + +// WithStrictTimingConfig sets strict timing replay controls. +func (b Builder) WithStrictTimingConfig(maxSlip int64, failOnViolation bool) Builder { + b.strictMaxSlip = maxSlip + b.strictFailOnViolation = failOnViolation + return b +} + +// WithPortBufferDepth configures each core port incoming/outgoing capacity. +func (b Builder) WithPortBufferDepth(incoming, outgoing int) Builder { + b.portIncomingBufferCap = incoming + b.portOutgoingBufferCap = outgoing + return b +} + +// WithEnableFIFOModel toggles FIFO-based execution behavior. 
+func (b Builder) WithEnableFIFOModel(enabled bool) Builder { + b.enableFIFOModel = enabled + return b +} + +// WithEnableQueueWatches toggles optional queue-occupancy instrumentation. +func (b Builder) WithEnableQueueWatches(enabled bool) Builder { + b.enableQueueWatches = enabled + return b +} + +// WithQueueWatches sets optional queue watch definitions for occupancy instrumentation. +func (b Builder) WithQueueWatches(queueWatches []QueueWatchSpec) Builder { + if len(queueWatches) == 0 { + b.queueWatches = nil + return b + } + b.queueWatches = append([]QueueWatchSpec(nil), queueWatches...) + return b +} + +// WithRegisterCount configures register-file size per core. +func (b Builder) WithRegisterCount(num int) Builder { + b.numRegisters = num + return b +} + +// WithLocalMemoryWords configures local memory size (in words) per core. +func (b Builder) WithLocalMemoryWords(words int) Builder { + b.localMemoryWords = words + return b +} + +func readyHeldTraceEnabledFromEnv() bool { + value := strings.ToLower(strings.TrimSpace(os.Getenv("ZEONICA_TRACE_READY_HELD"))) + return value == "1" || value == "true" || value == "yes" || value == "on" +} + // Build creates a core. 
// //nolint:funlen func (b Builder) Build(name string) *Core { c := &Core{} + incomingBufCap := b.portIncomingBufferCap + if incomingBufCap <= 0 { + incomingBufCap = 1 + } + outgoingBufCap := b.portOutgoingBufferCap + if outgoingBufCap <= 0 { + outgoingBufCap = 1 + } + registerCount := b.numRegisters + if registerCount <= 0 { + registerCount = 64 + } + localMemoryWords := b.localMemoryWords + if localMemoryWords <= 0 { + localMemoryWords = 1024 + } + resolvedQueueWatches, err := resolveQueueWatchSpecs(b.queueWatches) + if err != nil { + panic(err) + } + c.TickingComponent = sim.NewTickingComponent(name, b.engine, b.freq, c) c.emu = instEmulator{ - CareFlags: true, + CareFlags: true, + ExecutionPolicy: normalizeExecutionPolicyString(b.executionPolicy), + StrictMaxSlip: b.strictMaxSlip, + StrictFailOnViolation: b.strictFailOnViolation, } c.state = coreState{ exit: b.exitAddr, @@ -70,16 +166,35 @@ func (b Builder) Build(name string) *Core { "NorthWest": true, "Router": true, }, - Registers: make([]cgra.Data, 64), - Memory: make([]uint32, 1024), - RecvBufHead: make([][]cgra.Data, 4), - RecvBufHeadReady: make([][]bool, 4), - SendBufHead: make([][]cgra.Data, 4), - SendBufHeadBusy: make([][]bool, 4), - AddrBuf: 0, - IsToWriteMemory: false, - States: make(map[string]interface{}), - Mode: SyncOp, + Registers: make([]cgra.Data, registerCount), + Memory: make([]uint32, localMemoryWords), + RecvBufHead: make([][]cgra.Data, 4), + RecvBufHeadReady: make([][]bool, 4), + SendBufHead: make([][]cgra.Data, 4), + SendBufHeadBusy: make([][]bool, 4), + RecvBufQueue: make([][][]cgra.Data, 4), + SendBufQueue: make([][][]cgra.Data, 4), + RecvQueueCapacity: incomingBufCap, + SendQueueCapacity: outgoingBufCap, + EnableFIFOModel: b.enableFIFOModel, + EnableQueueWatches: b.enableQueueWatches, + ConfiguredQueueWatches: cloneQueueWatches(resolvedQueueWatches), + OpInputReadCache: make(map[string]cgra.Data), + AddrBuf: 0, + IsToWriteMemory: false, + States: make(map[string]interface{}), + Mode: 
SyncOp, + CurrentCycle: 0, + OpTimingCursor: make(map[int]int), + OpTimingLate: make(map[int]bool), + OpTimingRollCycle: make(map[int]int64), + OpIssueCount: make(map[int]int), + ReadyHeldTraceEnabled: readyHeldTraceEnabledFromEnv(), + ReadyHeldRunMode: strings.TrimSpace(os.Getenv("ZEONICA_READY_HELD_RUN_MODE")), + TimingWaitBlocked: false, + StallReason: "", + StallOpID: 0, + StallOpCode: "", CurrReservationState: ReservationState{ ReservationMap: make(map[int]bool), OpToExec: 0, @@ -92,28 +207,34 @@ func (b Builder) Build(name string) *Core { c.state.RecvBufHeadReady[i] = make([]bool, 12) c.state.SendBufHead[i] = make([]cgra.Data, 12) c.state.SendBufHeadBusy[i] = make([]bool, 12) + c.state.RecvBufQueue[i] = make([][]cgra.Data, 12) + c.state.SendBufQueue[i] = make([][]cgra.Data, 12) + for direction := 0; direction < 12; direction++ { + c.state.RecvBufQueue[i][direction] = make([]cgra.Data, 0, incomingBufCap) + c.state.SendBufQueue[i][direction] = make([]cgra.Data, 0, outgoingBufCap) + } } c.ports = make(map[cgra.Side]*portPair) - b.makePort(c, cgra.North) - b.makePort(c, cgra.West) - b.makePort(c, cgra.South) - b.makePort(c, cgra.East) - b.makePort(c, cgra.NorthEast) - b.makePort(c, cgra.SouthEast) - b.makePort(c, cgra.SouthWest) - b.makePort(c, cgra.NorthWest) - b.makePort(c, cgra.Router) - b.makePort(c, cgra.Dummy1) - b.makePort(c, cgra.Dummy2) - b.makePort(c, cgra.Dummy3) + b.makePort(c, cgra.North, incomingBufCap, outgoingBufCap) + b.makePort(c, cgra.West, incomingBufCap, outgoingBufCap) + b.makePort(c, cgra.South, incomingBufCap, outgoingBufCap) + b.makePort(c, cgra.East, incomingBufCap, outgoingBufCap) + b.makePort(c, cgra.NorthEast, incomingBufCap, outgoingBufCap) + b.makePort(c, cgra.SouthEast, incomingBufCap, outgoingBufCap) + b.makePort(c, cgra.SouthWest, incomingBufCap, outgoingBufCap) + b.makePort(c, cgra.NorthWest, incomingBufCap, outgoingBufCap) + b.makePort(c, cgra.Router, incomingBufCap, outgoingBufCap) + b.makePort(c, cgra.Dummy1, incomingBufCap, 
outgoingBufCap) + b.makePort(c, cgra.Dummy2, incomingBufCap, outgoingBufCap) + b.makePort(c, cgra.Dummy3, incomingBufCap, outgoingBufCap) return c } -func (b *Builder) makePort(c *Core, side cgra.Side) { - localPort := sim.NewPort(c, 1, 1, c.Name()+"."+side.Name()) +func (b *Builder) makePort(c *Core, side cgra.Side, incomingBufCap, outgoingBufCap int) { + localPort := sim.NewPort(c, incomingBufCap, outgoingBufCap, c.Name()+"."+side.Name()) c.ports[side] = &portPair{ local: localPort, } diff --git a/core/builder_microarch_test.go b/core/builder_microarch_test.go new file mode 100644 index 0000000..79a2362 --- /dev/null +++ b/core/builder_microarch_test.go @@ -0,0 +1,32 @@ +package core + +import ( + "testing" + + "github.com/sarchlab/akita/v4/sim" +) + +func TestCoreBuilderResourceSizing(t *testing.T) { + engine := sim.NewSerialEngine() + c := Builder{}. + WithEngine(engine). + WithFreq(1*sim.GHz). + WithEnableFIFOModel(true). + WithRegisterCount(96). + WithLocalMemoryWords(2048). + WithPortBufferDepth(4, 6). 
+ Build("Core") + + if got := len(c.state.Registers); got != 96 { + t.Fatalf("unexpected register count: got %d want 96", got) + } + if got := len(c.state.Memory); got != 2048 { + t.Fatalf("unexpected local memory words: got %d want 2048", got) + } + if c.GetPortByName("North") == nil { + t.Fatal("expected North port to be initialized") + } + if !c.state.EnableFIFOModel { + t.Fatal("expected EnableFIFOModel to propagate to core state") + } +} diff --git a/core/core.go b/core/core.go index 8b32db5..64a81f3 100644 --- a/core/core.go +++ b/core/core.go @@ -59,14 +59,19 @@ func (c *Core) WriteMemory(x int, y int, data uint32, baseAddr uint32) { if x == int(c.state.TileX) && y == int(c.state.TileY) { c.state.Memory[baseAddr] = data //fmt.Printf("Core [%d][%d] write memory[%d] = %d\n", c.state.TileX, c.state.TileY, baseAddr, c.state.Memory[baseAddr]) - Trace("Memory", - "Behavior", "WriteMemory", - "Time", float64(c.Engine.CurrentTime()*1e9), - "Data", data, - "X", x, - "Y", y, - "Addr", baseAddr, - ) + timeValue := float64(c.Engine.CurrentTime() * 1e9) + if TraceEnabled() { + Trace("Memory", + "Behavior", "WriteMemory", + "Time", timeValue, + "Data", data, + "X", x, + "Y", y, + "Addr", baseAddr, + ) + } else { + ObserveMemory("WriteMemory", timeValue, x, y, "", "") + } } else { panic(fmt.Sprintf("Invalid Tile: Expect (%d, %d),but get (%d, %d)", c.state.TileX, c.state.TileY, x, y)) } @@ -85,8 +90,20 @@ func (c *Core) MapProgram(program interface{}, x int, y int) { panic("MapProgram expects core.Program type") } c.state.PCInBlock = -1 + c.state.CurrentCycle = 0 + c.state.OpTimingCursor = make(map[int]int) + c.state.OpTimingLate = make(map[int]bool) + c.state.OpTimingRollCycle = make(map[int]int64) + c.state.PendingSyncGroup = nil + c.state.TimingWaitBlocked = false + c.state.StallReason = "" + c.state.StallOpID = 0 + c.state.StallOpCode = "" + c.state.OpInputReadCache = make(map[string]cgra.Data) + c.state.resetPortQueues() c.state.TileX = uint32(x) c.state.TileY = 
uint32(y) + c.state.WatchedQueues = matchingQueueWatchesForTile(c.state.EnableQueueWatches, c.state.ConfiguredQueueWatches, x, y) } // Tick runs the program for one cycle. @@ -96,6 +113,8 @@ func (c *Core) Tick() (madeProgress bool) { // madeProgress = c.emu.runRoutingRules(&c.state) || madeProgress madeProgress = c.runProgram() || madeProgress madeProgress = c.doSend() || madeProgress + c.state.observeWatchedQueues(float64(c.Engine.CurrentTime() * 1e9)) + c.state.CurrentCycle++ return madeProgress } @@ -103,12 +122,16 @@ func makeBytesFromUint32(data uint32) []byte { return []byte{byte(data >> 24), byte(data >> 16), byte(data >> 8), byte(data)} } +//nolint:gocyclo func (c *Core) doSend() bool { madeProgress := false for i := 0; i < 8; i++ { // only 8 directions for color := 0; color < 4; color++ { - - if !c.state.SendBufHeadBusy[color][i] { + if !c.state.sendQueueHasData(color, i) { + continue + } + head, ok := c.state.sendQueuePeek(color, i) + if !ok { continue } @@ -117,7 +140,7 @@ func (c *Core) doSend() bool { msg := cgra.MoveMsgBuilder{}. WithDst(c.ports[cgra.Side(i)].remote). WithSrc(c.ports[cgra.Side(i)].local.AsRemote()). - WithData(c.state.SendBufHead[color][i]). + WithData(head). WithSendTime(c.Engine.CurrentTime()). WithColor(color). 
Build() @@ -127,26 +150,37 @@ func (c *Core) doSend() bool { continue } - Trace("DataFlow", - "Behavior", "Send", - slog.Float64("Time", float64(c.Engine.CurrentTime()*1e9)), - "Data", msg.Data.First(), - "Pred", c.state.SendBufHead[color][i].Pred, - "Color", color, - "Src", msg.Src, - "Dst", msg.Dst, - ) - c.state.SendBufHeadBusy[color][i] = false + timeValue := float64(c.Engine.CurrentTime() * 1e9) + if TraceEnabled() { + Trace("DataFlow", + "Behavior", "Send", + slog.Float64("Time", timeValue), + "Data", msg.Data.First(), + "Pred", head.Pred, + "Color", color, + "Src", msg.Src, + "Dst", msg.Dst, + ) + } else { + ObserveDataFlow("Send", timeValue, "", "", string(msg.Src), string(msg.Dst)) + } + c.state.sendQueueConsume(color, i) + madeProgress = true } } // handle the memory request - if c.state.SendBufHeadBusy[c.emu.getColorIndex("R")][cgra.Router] { // only one port, must be Router-red + routerColor := c.emu.getColorIndex("R") + if c.state.sendQueueHasData(routerColor, int(cgra.Router)) { // only one port, must be Router-red + head, ok := c.state.sendQueuePeek(routerColor, int(cgra.Router)) + if !ok { + return madeProgress + } if c.state.IsToWriteMemory { msg := mem.WriteReqBuilder{}. WithAddress(uint64(c.state.AddrBuf)). - WithData(makeBytesFromUint32(c.state.SendBufHead[c.emu.getColorIndex("R")][cgra.Router].First())). + WithData(makeBytesFromUint32(head.First())). WithSrc(c.ports[cgra.Router].local.AsRemote()). WithDst(c.ports[cgra.Router].remote). 
Build() @@ -156,16 +190,22 @@ func (c *Core) doSend() bool { return madeProgress } - Trace("Memory", - "Behavior", "Send", - slog.Float64("Time", float64(c.Engine.CurrentTime()*1e9)), - "Data", c.state.SendBufHead[c.emu.getColorIndex("R")][cgra.Router].First(), - "Pred", c.state.SendBufHead[c.emu.getColorIndex("R")][cgra.Router].Pred, - "Color", "R", - "Src", msg.Src, - "Dst", msg.Dst, - ) - c.state.SendBufHeadBusy[c.emu.getColorIndex("R")][cgra.Router] = false + timeValue := float64(c.Engine.CurrentTime() * 1e9) + if TraceEnabled() { + Trace("Memory", + "Behavior", "Send", + slog.Float64("Time", timeValue), + "Data", head.First(), + "Pred", head.Pred, + "Color", "R", + "Src", msg.Src, + "Dst", msg.Dst, + ) + } else { + ObserveMemory("Send", timeValue, int(c.state.TileX), int(c.state.TileY), string(msg.Src), string(msg.Dst)) + } + c.state.sendQueueConsume(routerColor, int(cgra.Router)) + madeProgress = true } else { msg := mem.ReadReqBuilder{}. WithAddress(uint64(c.state.AddrBuf)). @@ -179,15 +219,21 @@ func (c *Core) doSend() bool { return madeProgress } - Trace("Memory", - "Behavior", "Send", - slog.Float64("Time", float64(c.Engine.CurrentTime()*1e9)), - "Data", c.state.AddrBuf, - "Color", "R", - "Src", msg.Src, - "Dst", msg.Dst, - ) - c.state.SendBufHeadBusy[c.emu.getColorIndex("R")][cgra.Router] = false + timeValue := float64(c.Engine.CurrentTime() * 1e9) + if TraceEnabled() { + Trace("Memory", + "Behavior", "Send", + slog.Float64("Time", timeValue), + "Data", c.state.AddrBuf, + "Color", "R", + "Src", msg.Src, + "Dst", msg.Dst, + ) + } else { + ObserveMemory("Send", timeValue, int(c.state.TileX), int(c.state.TileY), string(msg.Src), string(msg.Dst)) + } + c.state.sendQueueConsume(routerColor, int(cgra.Router)) + madeProgress = true } } @@ -198,6 +244,7 @@ func convert4BytesToUint32(data []byte) uint32 { return uint32(data[0])<<24 | uint32(data[1])<<16 | uint32(data[2])<<8 | uint32(data[3]) } +//nolint:gocyclo func (c *Core) doRecv() bool { madeProgress := false 
for i := 0; i < 8; i++ { //direction @@ -214,7 +261,7 @@ func (c *Core) doRecv() bool { for color := 0; color < 4; color++ { //fmt.Printf("%s Receiving Data with color %d. Recv buffer head: %+v\n", // c.Name(), color, c.state.RecvBufHeadReady[color][i]) - if c.state.RecvBufHeadReady[color][i] { + if c.state.recvQueueIsFull(color, i) { continue } @@ -223,18 +270,24 @@ func (c *Core) doRecv() bool { continue } - c.state.RecvBufHeadReady[color][i] = true - c.state.RecvBufHead[color][i] = msg.Data + if !c.state.recvQueuePush(color, i, msg.Data) { + continue + } - Trace("DataFlow", - "Behavior", "Recv", - "Time", float64(c.Engine.CurrentTime()*1e9), - "Data", msg.Data.First(), - "Pred", c.state.RecvBufHead[color][i].Pred, - "Src", msg.Src, - "Dst", msg.Dst, - "Color", color, - ) + timeValue := float64(c.Engine.CurrentTime() * 1e9) + if TraceEnabled() { + Trace("DataFlow", + "Behavior", "Recv", + "Time", timeValue, + "Data", msg.Data.First(), + "Pred", msg.Data.Pred, + "Src", msg.Src, + "Dst", msg.Dst, + "Color", color, + ) + } else { + ObserveDataFlow("Recv", timeValue, "", "", string(msg.Src), string(msg.Dst)) + } c.ports[cgra.Side(i)].local.RetrieveIncoming() madeProgress = true @@ -245,39 +298,55 @@ func (c *Core) doRecv() bool { if item == nil { return madeProgress } - if c.state.RecvBufHeadReady[c.emu.getColorIndex("R")][cgra.Router] { + routerColor := c.emu.getColorIndex("R") + routerDir := int(cgra.Router) + if c.state.recvQueueIsFull(routerColor, routerDir) { return madeProgress } // if msg is DataReadyRsp, then the data is ready if msg, ok := item.(*mem.DataReadyRsp); ok { - c.state.RecvBufHeadReady[c.emu.getColorIndex("R")][cgra.Router] = true - c.state.RecvBufHead[c.emu.getColorIndex("R")][cgra.Router] = cgra.NewScalar(convert4BytesToUint32(msg.Data)) - - Trace("Memory", - "Behavior", "Recv", - "Time", float64(c.Engine.CurrentTime()*1e9), - "Data", msg.Data, - "Src", msg.Src, - "Dst", msg.Dst, - "Pred", 
c.state.RecvBufHead[c.emu.getColorIndex("R")][cgra.Router].Pred, - "Color", "R", - ) + value := cgra.NewScalar(convert4BytesToUint32(msg.Data)) + if !c.state.recvQueuePush(routerColor, routerDir, value) { + return madeProgress + } + + timeValue := float64(c.Engine.CurrentTime() * 1e9) + if TraceEnabled() { + Trace("Memory", + "Behavior", "Recv", + "Time", timeValue, + "Data", msg.Data, + "Src", msg.Src, + "Dst", msg.Dst, + "Pred", value.Pred, + "Color", "R", + ) + } else { + ObserveMemory("Recv", timeValue, int(c.state.TileX), int(c.state.TileY), string(msg.Src), string(msg.Dst)) + } c.ports[cgra.Router].local.RetrieveIncoming() madeProgress = true } else if msg, ok := item.(*mem.WriteDoneRsp); ok { - c.state.RecvBufHeadReady[c.emu.getColorIndex("R")][cgra.Router] = true - c.state.RecvBufHead[c.emu.getColorIndex("R")][cgra.Router] = cgra.NewScalar(0) - - Trace("Memory", - "Behavior", "Recv", - "Time", float64(c.Engine.CurrentTime()*1e9), - "Src", msg.Src, - "Dst", msg.Dst, - "Pred", c.state.RecvBufHead[c.emu.getColorIndex("R")][cgra.Router].Pred, - "Color", "R", - ) + value := cgra.NewScalar(0) + if !c.state.recvQueuePush(routerColor, routerDir, value) { + return madeProgress + } + + timeValue := float64(c.Engine.CurrentTime() * 1e9) + if TraceEnabled() { + Trace("Memory", + "Behavior", "Recv", + "Time", timeValue, + "Src", msg.Src, + "Dst", msg.Dst, + "Pred", value.Pred, + "Color", "R", + ) + } else { + ObserveMemory("Recv", timeValue, int(c.state.TileX), int(c.state.TileY), string(msg.Src), string(msg.Dst)) + } c.ports[cgra.Router].local.RetrieveIncoming() madeProgress = true diff --git a/core/derived_timing.go b/core/derived_timing.go new file mode 100644 index 0000000..21c13b2 --- /dev/null +++ b/core/derived_timing.go @@ -0,0 +1,68 @@ +package core + +import ( + "encoding/json" + "fmt" + "os" + "strings" +) + +const timingSidecarEnv = "ZEONICA_TIMING_SIDECAR" + +type timingSidecar struct { + SourceLog string `json:"source_log"` + DerivedAt string 
`json:"derived_at"` + Ops []timingOpSchedule `json:"ops"` +} + +type timingOpSchedule struct { + X int `json:"x"` + Y int `json:"y"` + OpID int `json:"op_id"` + Cycles []int64 `json:"cycles"` +} + +func loadDerivedTimingFromEnv() (map[string]map[int][]int64, error) { + path := strings.TrimSpace(os.Getenv(timingSidecarEnv)) + if path == "" { + return nil, nil + } + + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read %s (%s): %w", timingSidecarEnv, path, err) + } + + var sidecar timingSidecar + if err := json.Unmarshal(data, &sidecar); err != nil { + return nil, fmt.Errorf("parse timing sidecar %s: %w", path, err) + } + + result := make(map[string]map[int][]int64) + for _, op := range sidecar.Ops { + if len(op.Cycles) == 0 { + continue + } + coordKey := fmt.Sprintf("(%d,%d)", op.X, op.Y) + if _, exists := result[coordKey]; !exists { + result[coordKey] = make(map[int][]int64) + } + result[coordKey][op.OpID] = append(result[coordKey][op.OpID], op.Cycles...) + } + + return result, nil +} + +func cloneDerivedTimingMap(src map[int][]int64) map[int][]int64 { + if len(src) == 0 { + return nil + } + + cloned := make(map[int][]int64, len(src)) + for opID, cycles := range src { + copied := make([]int64, len(cycles)) + copy(copied, cycles) + cloned[opID] = copied + } + return cloned +} diff --git a/core/emu.go b/core/emu.go index dfda1e9..354ac1b 100644 --- a/core/emu.go +++ b/core/emu.go @@ -24,6 +24,18 @@ const ( AsyncOp ) +const ( + ExecutionPolicyStrictTimed = "strict_timed" + ExecutionPolicyElasticScheduled = "elastic_scheduled" + ExecutionPolicyInOrderDataflow = "in_order_dataflow" +) + +const ( + StallReasonScheduleBubble = "schedule_bubble" + StallReasonOperandWait = "operand_wait" + StallReasonOutputBlocked = "output_blocked" +) + type routingRule struct { src cgra.Side dst cgra.Side @@ -80,7 +92,7 @@ func (r *ReservationState) SetReservationMap(ig InstructionGroup, state *coreSta r.ReservationMap[i] = true } r.OpToExec = 
len(ig.Operations) - print("SetReservationMap: ", r.OpToExec, "\n") + // print("SetReservationMap: ", r.OpToExec, "\n") } type coreState struct { @@ -102,20 +114,942 @@ type coreState struct { Mode OpMode - RecvBufHead [][]cgra.Data //[Color][Direction] - RecvBufHeadReady [][]bool - SendBufHead [][]cgra.Data - SendBufHeadBusy [][]bool - AddrBuf uint32 // buffer for the address of the memory - IsToWriteMemory bool - - routingRules []*routingRule - triggers []*Trigger - CurrentTime float64 // current simulation time for logging + RecvBufHead [][]cgra.Data //[Color][Direction] + RecvBufHeadReady [][]bool + SendBufHead [][]cgra.Data + SendBufHeadBusy [][]bool + RecvBufQueue [][][]cgra.Data // [Color][Direction]FIFO + SendBufQueue [][][]cgra.Data // [Color][Direction]FIFO + RecvQueueCapacity int + SendQueueCapacity int + EnableFIFOModel bool + EnableQueueWatches bool + ConfiguredQueueWatches []resolvedQueueWatch + WatchedQueues []resolvedQueueWatch + OpInputReadCache map[string]cgra.Data + AddrBuf uint32 // buffer for the address of the memory + IsToWriteMemory bool + + routingRules []*routingRule + triggers []*Trigger + CurrentTime float64 // current simulation time for logging + CurrentCycle int64 + OpTimingCursor map[int]int + OpTimingLate map[int]bool + OpTimingRollCycle map[int]int64 + OpIssueCount map[int]int + PendingSyncGroup *pendingSyncGroup + ReadyHeldTraceEnabled bool + ReadyHeldRunMode string + TimingWaitBlocked bool + StallReason string + StallOpID int + StallOpCode string } type instEmulator struct { - CareFlags bool + CareFlags bool + ExecutionPolicy string + StrictMaxSlip int64 + StrictFailOnViolation bool +} + +type issueReadiness struct { + OperandsReady bool + PredicateReadyOrTrue bool + ResourcesAvailable bool + Ready bool + WaitReason string +} + +type issueDecision struct { + AnnotatedTimeT *int64 + OperandsReady bool + PredicateReadyOrTrue bool + ResourcesAvailable bool + TimingGateSatisfied bool + FireableExceptTime bool + BlockedByLowerBound 
bool + CanIssue bool + WaitReason string + TimingWaitBlocked bool +} + +type readyHeldObservation struct { + RunMode string + Cycle int64 + X int + Y int + OpID int + OccurrenceIndex int + OpCode string + AnnotatedTimeT *int64 + OperandsReady bool + PredicateReadyOrTrue bool + ResourcesAvailable bool + TimingGateSatisfied bool + FireableExceptTime bool + BlockedByLowerBound bool + IssuedThisCycle bool +} + +type pendingSyncGroup struct { + RemainingCycles int + BufferedResults map[Operand]cgra.Data + InvalidDecrements []int + RepresentativeID int + RepresentativeOp string +} + +func (s *coreState) recvFIFOEnabled() bool { + return s.EnableFIFOModel && + len(s.RecvBufQueue) == 4 && + len(s.RecvBufQueue[0]) > int(cgra.Router) +} + +func (s *coreState) sendFIFOEnabled() bool { + return s.EnableFIFOModel && + len(s.SendBufQueue) == 4 && + len(s.SendBufQueue[0]) > int(cgra.Router) +} + +func (s *coreState) recvQueueCap() int { + if s.RecvQueueCapacity > 0 { + return s.RecvQueueCapacity + } + return 1 +} + +func (s *coreState) sendQueueCap(color, direction int) int { + // Keep router-red as single outstanding request to preserve existing + // address/req-state coupling semantics. 
+ if color == 0 && direction == int(cgra.Router) { + return 1 + } + if s.SendQueueCapacity > 0 { + return s.SendQueueCapacity + } + return 1 +} + +func (s *coreState) syncRecvHead(color, direction int) { + if len(s.RecvBufHead) <= color || len(s.RecvBufHeadReady) <= color { + return + } + if len(s.RecvBufHead[color]) <= direction || len(s.RecvBufHeadReady[color]) <= direction { + return + } + if s.recvFIFOEnabled() && len(s.RecvBufQueue[color]) > direction && len(s.RecvBufQueue[color][direction]) > 0 { + s.RecvBufHead[color][direction] = s.RecvBufQueue[color][direction][0] + s.RecvBufHeadReady[color][direction] = true + return + } + s.RecvBufHeadReady[color][direction] = false +} + +func (s *coreState) syncSendHead(color, direction int) { + if len(s.SendBufHead) <= color || len(s.SendBufHeadBusy) <= color { + return + } + if len(s.SendBufHead[color]) <= direction || len(s.SendBufHeadBusy[color]) <= direction { + return + } + if s.sendFIFOEnabled() && len(s.SendBufQueue[color]) > direction && len(s.SendBufQueue[color][direction]) > 0 { + s.SendBufHead[color][direction] = s.SendBufQueue[color][direction][0] + s.SendBufHeadBusy[color][direction] = true + return + } + s.SendBufHeadBusy[color][direction] = false +} + +func (s *coreState) recvQueueLen(color, direction int) int { + if s.recvFIFOEnabled() && len(s.RecvBufQueue[color]) > direction { + return len(s.RecvBufQueue[color][direction]) + } + if s.RecvBufHeadReady[color][direction] { + return 1 + } + return 0 +} + +func (s *coreState) sendQueueLen(color, direction int) int { + if s.sendFIFOEnabled() && len(s.SendBufQueue[color]) > direction { + return len(s.SendBufQueue[color][direction]) + } + if s.SendBufHeadBusy[color][direction] { + return 1 + } + return 0 +} + +func (s *coreState) recvQueueIsFull(color, direction int) bool { + if s.recvFIFOEnabled() && len(s.RecvBufQueue[color]) > direction { + return len(s.RecvBufQueue[color][direction]) >= s.recvQueueCap() + } + return s.RecvBufHeadReady[color][direction] +} 
+ +func (s *coreState) recvQueuePush(color, direction int, data cgra.Data) bool { + if s.recvFIFOEnabled() && len(s.RecvBufQueue[color]) > direction { + if len(s.RecvBufQueue[color][direction]) >= s.recvQueueCap() { + return false + } + s.RecvBufQueue[color][direction] = append(s.RecvBufQueue[color][direction], data) + s.syncRecvHead(color, direction) + return true + } + if s.RecvBufHeadReady[color][direction] { + return false + } + s.RecvBufHead[color][direction] = data + s.RecvBufHeadReady[color][direction] = true + return true +} + +func (s *coreState) recvQueuePeek(color, direction int) (cgra.Data, bool) { + if s.recvFIFOEnabled() && len(s.RecvBufQueue[color]) > direction { + if len(s.RecvBufQueue[color][direction]) == 0 { + return cgra.Data{}, false + } + return s.RecvBufQueue[color][direction][0], true + } + if !s.RecvBufHeadReady[color][direction] { + return cgra.Data{}, false + } + return s.RecvBufHead[color][direction], true +} + +func (s *coreState) recvQueueConsume(color, direction int) (cgra.Data, bool) { + if s.recvFIFOEnabled() && len(s.RecvBufQueue[color]) > direction { + if len(s.RecvBufQueue[color][direction]) == 0 { + return cgra.Data{}, false + } + value := s.RecvBufQueue[color][direction][0] + s.RecvBufQueue[color][direction] = s.RecvBufQueue[color][direction][1:] + s.syncRecvHead(color, direction) + return value, true + } + if !s.RecvBufHeadReady[color][direction] { + return cgra.Data{}, false + } + value := s.RecvBufHead[color][direction] + s.RecvBufHeadReady[color][direction] = false + return value, true +} + +func (s *coreState) sendQueueHasData(color, direction int) bool { + if s.sendFIFOEnabled() && len(s.SendBufQueue[color]) > direction { + return len(s.SendBufQueue[color][direction]) > 0 + } + return s.SendBufHeadBusy[color][direction] +} + +func (s *coreState) sendQueueIsFull(color, direction int) bool { + if s.sendFIFOEnabled() && len(s.SendBufQueue[color]) > direction { + return len(s.SendBufQueue[color][direction]) >= 
s.sendQueueCap(color, direction) + } + return s.SendBufHeadBusy[color][direction] +} + +func (s *coreState) sendQueuePush(color, direction int, data cgra.Data) bool { + if s.sendFIFOEnabled() && len(s.SendBufQueue[color]) > direction { + if len(s.SendBufQueue[color][direction]) >= s.sendQueueCap(color, direction) { + return false + } + s.SendBufQueue[color][direction] = append(s.SendBufQueue[color][direction], data) + s.syncSendHead(color, direction) + return true + } + if s.SendBufHeadBusy[color][direction] { + return false + } + s.SendBufHeadBusy[color][direction] = true + s.SendBufHead[color][direction] = data + return true +} + +func (s *coreState) sendQueuePeek(color, direction int) (cgra.Data, bool) { + if s.sendFIFOEnabled() && len(s.SendBufQueue[color]) > direction { + if len(s.SendBufQueue[color][direction]) == 0 { + return cgra.Data{}, false + } + return s.SendBufQueue[color][direction][0], true + } + if !s.SendBufHeadBusy[color][direction] { + return cgra.Data{}, false + } + return s.SendBufHead[color][direction], true +} + +func (s *coreState) sendQueueConsume(color, direction int) (cgra.Data, bool) { + if s.sendFIFOEnabled() && len(s.SendBufQueue[color]) > direction { + if len(s.SendBufQueue[color][direction]) == 0 { + return cgra.Data{}, false + } + value := s.SendBufQueue[color][direction][0] + s.SendBufQueue[color][direction] = s.SendBufQueue[color][direction][1:] + s.syncSendHead(color, direction) + return value, true + } + if !s.SendBufHeadBusy[color][direction] { + return cgra.Data{}, false + } + value := s.SendBufHead[color][direction] + s.SendBufHeadBusy[color][direction] = false + return value, true +} + +func (s *coreState) resetPortQueues() { + for color := range s.RecvBufHeadReady { + for direction := range s.RecvBufHeadReady[color] { + s.RecvBufHeadReady[color][direction] = false + } + } + for color := range s.SendBufHeadBusy { + for direction := range s.SendBufHeadBusy[color] { + s.SendBufHeadBusy[color][direction] = false + } + } + if 
s.recvFIFOEnabled() { + for color := range s.RecvBufQueue { + for direction := range s.RecvBufQueue[color] { + s.RecvBufQueue[color][direction] = s.RecvBufQueue[color][direction][:0] + s.syncRecvHead(color, direction) + } + } + } + if s.sendFIFOEnabled() { + for color := range s.SendBufQueue { + for direction := range s.SendBufQueue[color] { + s.SendBufQueue[color][direction] = s.SendBufQueue[color][direction][:0] + s.syncSendHead(color, direction) + } + } + } +} + +func clone2DData(input [][]cgra.Data) [][]cgra.Data { + if input == nil { + return nil + } + out := make([][]cgra.Data, len(input)) + for i := range input { + if input[i] == nil { + continue + } + out[i] = append([]cgra.Data(nil), input[i]...) + } + return out +} + +func clone2DBool(input [][]bool) [][]bool { + if input == nil { + return nil + } + out := make([][]bool, len(input)) + for i := range input { + if input[i] == nil { + continue + } + out[i] = append([]bool(nil), input[i]...) + } + return out +} + +func clone3DData(input [][][]cgra.Data) [][][]cgra.Data { + if input == nil { + return nil + } + out := make([][][]cgra.Data, len(input)) + for i := range input { + if input[i] == nil { + continue + } + out[i] = make([][]cgra.Data, len(input[i])) + for j := range input[i] { + if input[i][j] == nil { + continue + } + out[i][j] = append([]cgra.Data(nil), input[i][j]...) 
+ } + } + return out +} + +func cloneStringBoolMap(input map[string]bool) map[string]bool { + if input == nil { + return nil + } + out := make(map[string]bool, len(input)) + for k, v := range input { + out[k] = v + } + return out +} + +func cloneStringIntMap(input map[string]int) map[string]int { + if input == nil { + return nil + } + out := make(map[string]int, len(input)) + for k, v := range input { + out[k] = v + } + return out +} + +func cloneIntBoolMap(input map[int]bool) map[int]bool { + if input == nil { + return nil + } + out := make(map[int]bool, len(input)) + for k, v := range input { + out[k] = v + } + return out +} + +func cloneIntIntMap(input map[int]int) map[int]int { + if input == nil { + return nil + } + out := make(map[int]int, len(input)) + for k, v := range input { + out[k] = v + } + return out +} + +func cloneIntInt64Map(input map[int]int64) map[int]int64 { + if input == nil { + return nil + } + out := make(map[int]int64, len(input)) + for k, v := range input { + out[k] = v + } + return out +} + +func cloneOperandDataMap(input map[Operand]cgra.Data) map[Operand]cgra.Data { + if input == nil { + return nil + } + out := make(map[Operand]cgra.Data, len(input)) + for operand, value := range input { + out[operand] = value + } + return out +} + +func cloneIntSlice(input []int) []int { + if input == nil { + return nil + } + return append([]int(nil), input...) 
+} + +func clonePendingSyncGroup(input *pendingSyncGroup) *pendingSyncGroup { + if input == nil { + return nil + } + return &pendingSyncGroup{ + RemainingCycles: input.RemainingCycles, + BufferedResults: cloneOperandDataMap(input.BufferedResults), + InvalidDecrements: cloneIntSlice(input.InvalidDecrements), + RepresentativeID: input.RepresentativeID, + RepresentativeOp: input.RepresentativeOp, + } +} + +func cloneIntAnyMap(input map[string]interface{}) map[string]interface{} { + if input == nil { + return nil + } + out := make(map[string]interface{}, len(input)) + for k, v := range input { + out[k] = v + } + return out +} + +func (s *coreState) cloneForEval() *coreState { + clone := *s + clone.Registers = append([]cgra.Data(nil), s.Registers...) + clone.Memory = append([]uint32(nil), s.Memory...) + clone.States = cloneIntAnyMap(s.States) + clone.Directions = cloneStringBoolMap(s.Directions) + clone.RecvBufHead = clone2DData(s.RecvBufHead) + clone.RecvBufHeadReady = clone2DBool(s.RecvBufHeadReady) + clone.SendBufHead = clone2DData(s.SendBufHead) + clone.SendBufHeadBusy = clone2DBool(s.SendBufHeadBusy) + clone.RecvBufQueue = clone3DData(s.RecvBufQueue) + clone.SendBufQueue = clone3DData(s.SendBufQueue) + clone.ConfiguredQueueWatches = cloneQueueWatches(s.ConfiguredQueueWatches) + clone.WatchedQueues = cloneQueueWatches(s.WatchedQueues) + clone.OpInputReadCache = make(map[string]cgra.Data) + clone.OpTimingCursor = cloneIntIntMap(s.OpTimingCursor) + clone.OpTimingLate = cloneIntBoolMap(s.OpTimingLate) + clone.OpTimingRollCycle = cloneIntInt64Map(s.OpTimingRollCycle) + clone.OpIssueCount = cloneIntIntMap(s.OpIssueCount) + clone.PendingSyncGroup = clonePendingSyncGroup(s.PendingSyncGroup) + clone.CurrReservationState = ReservationState{ + ReservationMap: cloneIntBoolMap(s.CurrReservationState.ReservationMap), + OpToExec: s.CurrReservationState.OpToExec, + RefCountRuntime: cloneStringIntMap(s.CurrReservationState.RefCountRuntime), + } + return &clone +} + +func (s 
*coreState) observeWatchedQueues(timeValue float64) { + if s == nil || len(s.WatchedQueues) == 0 { + return + } + + for _, watch := range s.WatchedQueues { + occupancy := 0 + var capacity int + switch watch.Kind { + case "recv": + occupancy = s.recvQueueLen(watch.ColorIdx, watch.DirectionIdx) + capacity = s.recvQueueCap() + case "send": + occupancy = s.sendQueueLen(watch.ColorIdx, watch.DirectionIdx) + capacity = s.sendQueueCap(watch.ColorIdx, watch.DirectionIdx) + default: + continue + } + + ObserveQueue( + watch.Label, + watch.Kind, + timeValue, + int(s.TileX), + int(s.TileY), + watch.Direction, + watch.Color, + occupancy, + capacity, + ) + } +} + +func normalizeExecutionPolicyString(policy string) string { + text := strings.ToLower(strings.TrimSpace(policy)) + switch text { + case ExecutionPolicyStrictTimed, "strict-timed", "static": + return ExecutionPolicyStrictTimed + case ExecutionPolicyElasticScheduled, "elastic-scheduled", "hybrid": + return ExecutionPolicyElasticScheduled + case "", ExecutionPolicyInOrderDataflow, "in-order-dataflow", "dynamic": + return ExecutionPolicyInOrderDataflow + default: + // Fall back to in-order dataflow for backward compatibility. 
+ return ExecutionPolicyInOrderDataflow + } +} + +func isStrictControlSensitiveOp(opCode string) bool { + normalized := strings.ToUpper(strings.TrimSpace(opCode)) + switch { + case normalized == "SEL", + normalized == "JMP", + normalized == "RET", + normalized == "CTRL_MOV", + normalized == "CMP_EXPORT", + normalized == "LT_EX": + return true + case strings.HasPrefix(normalized, "PHI"), + strings.HasPrefix(normalized, "GRANT"), + strings.HasPrefix(normalized, "ICMP"), + strings.HasPrefix(normalized, "RETURN"), + strings.HasPrefix(normalized, "B"): + return true + default: + return false + } +} + +func (i instEmulator) panicSynchronizationViolation(operation Operation, state *coreState, reason string) { + currentStep, targetStep, ii := i.resolveScheduleStep(operation, state) + panic(fmt.Sprintf( + "synchronization violation under %s: op=%s id=%d cycle=%d schedule_step=%d target_step=%d ii=%d raw_timestep=%d tile=(%d,%d): %s", + normalizeExecutionPolicyString(i.ExecutionPolicy), + operation.OpCode, + operation.ID, + state.CurrentCycle, + currentStep, + targetStep, + ii, + operation.TimeStep, + state.TileX, + state.TileY, + reason, + )) +} + +func (i instEmulator) resolveScheduleStep(operation Operation, state *coreState) (currentStep int64, targetStep int64, ii int64) { + ii = int64(state.Code.CompiledII) + if ii <= 0 { + return state.CurrentCycle, int64(operation.TimeStep), 0 + } + + currentStep = state.CurrentCycle % ii + if currentStep < 0 { + currentStep += ii + } + + targetStep = int64(operation.TimeStep) + if targetStep < 0 { + panic(fmt.Sprintf( + "invalid time_step=%d for compiled_ii=%d at op=%s id=%d tile=(%d,%d)", + operation.TimeStep, + state.Code.CompiledII, + operation.OpCode, + operation.ID, + state.TileX, + state.TileY, + )) + } + // Normalize to phase within II: compiler may emit time_step >= ii (e.g. 4 when ii=4 → step 0). 
+ if targetStep >= ii { + targetStep = targetStep % ii + } + + return currentStep, targetStep, ii +} + +func (i instEmulator) resolveDerivedSchedule(operation Operation, state *coreState) ([]int64, int, bool) { + if state == nil || state.Code.DerivedTiming == nil { + return nil, 0, false + } + + schedule, exists := state.Code.DerivedTiming[operation.ID] + if !exists || len(schedule) == 0 { + return nil, 0, false + } + + cursor := state.OpTimingCursor[operation.ID] + return schedule, cursor, true +} + +func (i instEmulator) advanceDerivedTimingCursor(operation Operation, state *coreState) { + if state == nil || state.Code.DerivedTiming == nil { + return + } + if _, exists := state.Code.DerivedTiming[operation.ID]; !exists { + return + } + state.OpTimingCursor[operation.ID] = state.OpTimingCursor[operation.ID] + 1 + delete(state.OpTimingLate, operation.ID) + delete(state.OpTimingRollCycle, operation.ID) +} + +func (i instEmulator) setStallReason(state *coreState, operation Operation, reason string) { + if state == nil || reason == "" { + return + } + state.StallReason = reason + state.StallOpID = operation.ID + state.StallOpCode = operation.OpCode +} + +func (i instEmulator) rollStrictExpectedCycle(expectedCycle, currentCycle int64, compiledII int) int64 { + ii := int64(compiledII) + if ii <= 0 { + ii = 1 + } + // Move to the next window start strictly after the current cycle. 
+ nextExpected := expectedCycle + ii + if nextExpected > currentCycle { + return nextExpected + } + delta := currentCycle - expectedCycle + steps := delta/ii + 1 + return expectedCycle + steps*ii +} + +func (s *coreState) readyHeldTraceActive() bool { + return s != nil && s.ReadyHeldTraceEnabled && strings.TrimSpace(s.ReadyHeldRunMode) != "" +} + +func (s *coreState) nextOpOccurrenceIndex(opID int) int { + if s == nil || s.OpIssueCount == nil { + return 0 + } + return s.OpIssueCount[opID] +} + +func (s *coreState) advanceOpOccurrenceIndex(opID int) { + if s == nil { + return + } + if s.OpIssueCount == nil { + s.OpIssueCount = make(map[int]int) + } + s.OpIssueCount[opID] = s.OpIssueCount[opID] + 1 +} + +func (i instEmulator) applyIssueDecision(operation Operation, state *coreState, decision issueDecision) { + if state == nil { + return + } + if decision.TimingWaitBlocked { + state.TimingWaitBlocked = true + } + if decision.WaitReason != "" { + i.setStallReason(state, operation, decision.WaitReason) + } +} + +func (i instEmulator) readyHeldObservationFor( + operation Operation, + state *coreState, + decision issueDecision, + occurrenceIndex int, + issuedThisCycle bool, +) (readyHeldObservation, bool) { + if state == nil || !state.readyHeldTraceActive() { + return readyHeldObservation{}, false + } + if !decision.FireableExceptTime && !decision.BlockedByLowerBound && !issuedThisCycle { + return readyHeldObservation{}, false + } + return readyHeldObservation{ + RunMode: state.ReadyHeldRunMode, + Cycle: state.CurrentCycle, + X: int(state.TileX), + Y: int(state.TileY), + OpID: operation.ID, + OccurrenceIndex: occurrenceIndex, + OpCode: operation.OpCode, + AnnotatedTimeT: decision.AnnotatedTimeT, + OperandsReady: decision.OperandsReady, + PredicateReadyOrTrue: decision.PredicateReadyOrTrue, + ResourcesAvailable: decision.ResourcesAvailable, + TimingGateSatisfied: decision.TimingGateSatisfied, + FireableExceptTime: decision.FireableExceptTime, + BlockedByLowerBound: 
decision.BlockedByLowerBound, + IssuedThisCycle: issuedThisCycle, + }, true +} + +func (i instEmulator) emitReadyHeldObservation(observation readyHeldObservation) { + var annotated any + if observation.AnnotatedTimeT != nil { + annotated = *observation.AnnotatedTimeT + } + Trace( + "ReadyHeld", + "run_mode", observation.RunMode, + "cycle", observation.Cycle, + "X", observation.X, + "Y", observation.Y, + "ID", observation.OpID, + "occurrence_index", observation.OccurrenceIndex, + "OpCode", observation.OpCode, + "annotated_time_t", annotated, + "operands_ready", observation.OperandsReady, + "predicate_ready_or_true", observation.PredicateReadyOrTrue, + "resources_available", observation.ResourcesAvailable, + "timing_gate_satisfied", observation.TimingGateSatisfied, + "fireable_except_time", observation.FireableExceptTime, + "blocked_by_lower_bound", observation.BlockedByLowerBound, + "issued_this_cycle", observation.IssuedThisCycle, + ) +} + +func (i instEmulator) issueDecision(operation Operation, state *coreState) issueDecision { + decision := issueDecision{ + OperandsReady: true, + PredicateReadyOrTrue: true, + ResourcesAvailable: true, + TimingGateSatisfied: true, + CanIssue: true, + } + + if !i.CareFlags || operation.InvalidIterations > 0 { + decision.FireableExceptTime = true + return decision + } + + readiness := i.checkIssueReadinessDetails(operation, state) + decision.OperandsReady = readiness.OperandsReady + decision.PredicateReadyOrTrue = readiness.PredicateReadyOrTrue + decision.ResourcesAvailable = readiness.ResourcesAvailable + decision.FireableExceptTime = readiness.Ready + + policy := normalizeExecutionPolicyString(i.ExecutionPolicy) + if schedule, cursor, hasDerived := i.resolveDerivedSchedule(operation, state); hasDerived && + (policy == ExecutionPolicyStrictTimed || policy == ExecutionPolicyElasticScheduled) { + if cursor >= len(schedule) { + decision.TimingGateSatisfied = false + decision.FireableExceptTime = false + decision.CanIssue = false + 
return decision + } + + annotatedTime := schedule[cursor] + decision.AnnotatedTimeT = int64Ptr(annotatedTime) + expectedCycle := annotatedTime + + switch policy { + case ExecutionPolicyStrictTimed: + if rolledCycle, exists := state.OpTimingRollCycle[operation.ID]; exists { + expectedCycle = rolledCycle + } + decision.TimingGateSatisfied = state.CurrentCycle >= expectedCycle + + if isStrictControlSensitiveOp(operation.OpCode) { + if state.CurrentCycle < expectedCycle { + decision.CanIssue = false + decision.WaitReason = StallReasonScheduleBubble + decision.TimingWaitBlocked = true + return decision + } + if readiness.Ready { + if state.CurrentCycle > annotatedTime { + state.OpTimingLate[operation.ID] = true + } + decision.CanIssue = true + return decision + } + if state.CurrentCycle > annotatedTime { + state.OpTimingLate[operation.ID] = true + } + decision.CanIssue = false + decision.WaitReason = readiness.WaitReason + return decision + } + + if state.CurrentCycle < expectedCycle { + decision.CanIssue = false + decision.WaitReason = StallReasonScheduleBubble + decision.TimingWaitBlocked = true + return decision + } + + lateness := state.CurrentCycle - expectedCycle + if lateness > 0 && i.StrictMaxSlip >= 0 && lateness > i.StrictMaxSlip { + reason := fmt.Sprintf( + "strict slip window violation: lateness=%d exceeds max_slip=%d (expected=%d current=%d)", + lateness, + i.StrictMaxSlip, + expectedCycle, + state.CurrentCycle, + ) + if i.StrictFailOnViolation { + i.panicSynchronizationViolation(operation, state, reason) + } + + nextExpected := i.rollStrictExpectedCycle(expectedCycle, state.CurrentCycle, state.Code.CompiledII) + state.OpTimingRollCycle[operation.ID] = nextExpected + Trace( + "TimingViolation", + "Policy", policy, + "OpCode", operation.OpCode, + "ID", operation.ID, + "X", state.TileX, + "Y", state.TileY, + "ExpectedCycle", expectedCycle, + "NextExpectedCycle", nextExpected, + "CurrentCycle", state.CurrentCycle, + "Lateness", lateness, + "MaxSlip", 
i.StrictMaxSlip, + ) + decision.CanIssue = false + decision.WaitReason = StallReasonScheduleBubble + decision.TimingWaitBlocked = true + return decision + } + + if !readiness.Ready { + if state.CurrentCycle > annotatedTime { + state.OpTimingLate[operation.ID] = true + } + decision.CanIssue = false + decision.WaitReason = readiness.WaitReason + return decision + } + + if state.CurrentCycle > annotatedTime { + state.OpTimingLate[operation.ID] = true + } + decision.CanIssue = true + return decision + case ExecutionPolicyElasticScheduled: + decision.TimingGateSatisfied = state.CurrentCycle >= expectedCycle + decision.BlockedByLowerBound = readiness.Ready && !decision.TimingGateSatisfied + if !decision.TimingGateSatisfied { + decision.CanIssue = false + decision.WaitReason = StallReasonScheduleBubble + decision.TimingWaitBlocked = true + return decision + } + if readiness.Ready { + decision.CanIssue = true + return decision + } + decision.CanIssue = false + decision.WaitReason = readiness.WaitReason + return decision + } + } + + currentStep, targetStep, ii := i.resolveScheduleStep(operation, state) + + // No schedule (compiled_ii missing or 0): ignore time gating so existing workloads + // (e.g. histogram) that do not use II-based scheduling still run like in-order. 
+ if ii <= 0 { + decision.CanIssue = readiness.Ready + decision.WaitReason = readiness.WaitReason + return decision + } + + switch policy { + case ExecutionPolicyStrictTimed: + decision.TimingGateSatisfied = currentStep >= targetStep + if currentStep < targetStep { + decision.CanIssue = false + decision.WaitReason = StallReasonScheduleBubble + decision.TimingWaitBlocked = true + return decision + } + if currentStep == targetStep { + if readiness.Ready { + decision.CanIssue = true + return decision + } + i.panicSynchronizationViolation(operation, state, "operand/credit not ready at scheduled step") + } + i.panicSynchronizationViolation(operation, state, "operation missed its exact scheduled step") + return decision + case ExecutionPolicyElasticScheduled: + decision.TimingGateSatisfied = currentStep >= targetStep + if currentStep < targetStep { + decision.CanIssue = false + decision.WaitReason = StallReasonScheduleBubble + decision.TimingWaitBlocked = true + return decision + } + decision.CanIssue = readiness.Ready + decision.WaitReason = readiness.WaitReason + return decision + case ExecutionPolicyInOrderDataflow: + decision.CanIssue = readiness.Ready + decision.WaitReason = readiness.WaitReason + return decision + default: + decision.CanIssue = readiness.Ready + decision.WaitReason = readiness.WaitReason + return decision + } +} + +func (i instEmulator) canIssue(operation Operation, state *coreState) bool { + decision := i.issueDecision(operation, state) + i.applyIssueDecision(operation, state, decision) + return decision.CanIssue } // set up the necessary state for the instruction group @@ -131,10 +1065,181 @@ func (i instEmulator) SetUpInstructionGroup(index int32, state *coreState) { state.CurrReservationState.SetRefCount(iGroup, state) } +func supportsDeferredLatency(opCode string) bool { + switch normalizeLatencyOpcode(opCode) { + case "LOAD", "STORE", "LDD", "STD", "LD", "LDW", "ST", "STW", + "TRIGGER", "JMP", "BEQ", "BNE", "BLT", + "RETURN_VALUE", 
"RETURN_VOID", "RET", + "PHI", "PHI_CONST", "PHI_START", "GRANT_PREDICATE", "GRANT_ONCE": + return false + default: + return true + } +} + +func (i instEmulator) deferredSyncGroupLatency(cinst InstructionGroup, state *coreState) (int, int, string, bool) { + if state == nil { + return 1, 0, "", false + } + + type sendKey struct { + color int + direction int + } + + maxLatency := 1 + representativeID := 0 + representativeOp := "" + requiredSends := make(map[sendKey]int) + executedOps := 0 + + for _, operation := range cinst.Operations { + if operation.InvalidIterations > 0 { + continue + } + executedOps++ + if !supportsDeferredLatency(operation.OpCode) { + return 1, 0, "", false + } + + latency := state.Code.OperationLatency(operation.OpCode) + if latency > maxLatency { + maxLatency = latency + representativeID = operation.ID + representativeOp = operation.OpCode + } else if representativeOp == "" { + representativeID = operation.ID + representativeOp = operation.OpCode + } + + for _, dst := range operation.DstOperands.Operands { + normalized := i.normalizeDirection(dst.Impl) + if !state.Directions[normalized] { + continue + } + key := sendKey{ + color: i.getColorIndex(dst.Color), + direction: i.getDirecIndex(normalized), + } + requiredSends[key]++ + if requiredSends[key] > state.sendQueueCap(key.color, key.direction) { + return 1, 0, "", false + } + } + } + + if executedOps == 0 || maxLatency <= 1 { + return 1, 0, "", false + } + + return maxLatency, representativeID, representativeOp, true +} + +func (i instEmulator) canCommitPendingSyncGroup(state *coreState, pending *pendingSyncGroup) bool { + if state == nil || pending == nil { + return true + } + + type sendKey struct { + color int + direction int + } + + requiredSends := make(map[sendKey]int) + for operand := range pending.BufferedResults { + normalized := i.normalizeDirection(operand.Impl) + if !state.Directions[normalized] { + continue + } + key := sendKey{ + color: i.getColorIndex(operand.Color), + 
direction: i.getDirecIndex(normalized), + } + requiredSends[key]++ + } + + for key, required := range requiredSends { + free := state.sendQueueCap(key.color, key.direction) - state.sendQueueLen(key.color, key.direction) + if free < required { + return false + } + } + return true +} + +func (i instEmulator) advancePendingSyncGroup(state *coreState) bool { + if state == nil || state.PendingSyncGroup == nil { + return false + } + + pending := state.PendingSyncGroup + if pending.RemainingCycles > 1 { + pending.RemainingCycles-- + state.TimingWaitBlocked = true + return true + } + + if pending.RemainingCycles == 1 { + pending.RemainingCycles = 0 + } + + if !i.canCommitPendingSyncGroup(state, pending) { + state.TimingWaitBlocked = true + state.StallReason = StallReasonOutputBlocked + state.StallOpID = pending.RepresentativeID + state.StallOpCode = pending.RepresentativeOp + return true + } + + for operand, value := range pending.BufferedResults { + i.writeOperand(operand, value, state) + } + i.applyInvalidIterationDecrements(state, pending.InvalidDecrements) + state.PendingSyncGroup = nil + return true +} + +func (i instEmulator) bufferDeferredResult( + operand Operand, + value cgra.Data, + workState *coreState, + bufferedResults map[Operand]cgra.Data, +) { + bufferedResults[operand] = value + if workState == nil || !strings.HasPrefix(operand.Impl, "$") { + return + } + registerIndex, err := strconv.Atoi(strings.TrimPrefix(operand.Impl, "$")) + if err != nil { + panic(fmt.Sprintf("invalid register index in deferred result buffering: %v", operand)) + } + if registerIndex < 0 || registerIndex >= len(workState.Registers) { + panic(fmt.Sprintf("register index %d out of range in deferred result buffering", registerIndex)) + } + workState.Registers[registerIndex] = value +} + +func (i instEmulator) applyDeferredSyncIssueState(state *coreState, workState *coreState) { + if state == nil || workState == nil { + return + } + + state.RecvBufHead = clone2DData(workState.RecvBufHead) 
+ state.RecvBufHeadReady = clone2DBool(workState.RecvBufHeadReady) + state.RecvBufQueue = clone3DData(workState.RecvBufQueue) + state.OpTimingCursor = cloneIntIntMap(workState.OpTimingCursor) + state.OpTimingLate = cloneIntBoolMap(workState.OpTimingLate) + state.OpTimingRollCycle = cloneIntInt64Map(workState.OpTimingRollCycle) + state.OpIssueCount = cloneIntIntMap(workState.OpIssueCount) + state.CurrentTime = workState.CurrentTime +} + func (i instEmulator) RunInstructionGroup(cinst InstructionGroup, state *coreState, time float64) bool { // check the Return signal if *state.exit && time > *state.requestExitTimestamp { - fmt.Println("Exit signal ( requested at", *state.requestExitTimestamp, ") received at time", time) + if DebugEnabled() { + slog.Debug("ExitSignal", "requestedAt", *state.requestExitTimestamp, "time", time) + } return false } prevPC := state.PCInBlock @@ -172,19 +1277,23 @@ func (i instEmulator) RunInstructionGroup(cinst InstructionGroup, state *coreSta } // else, this group is not finished, PC stays the same } else if state.Mode == SyncOp { if progressSync { - if state.NextPCInBlock == -1 { - print("PC+4 for PC=", state.PCInBlock, " X:", state.TileX, " Y:", state.TileY, "\n") - print("Instruction at PC=", state.PCInBlock, " is ", state.SelectedBlock.InstructionGroups[state.PCInBlock].Operations[0].OpCode, "\n") - state.PCInBlock++ - } else { - print("PC+Jump to ", state.NextPCInBlock, " X:", state.TileX, " Y:", state.TileY, "\n") - state.PCInBlock = state.NextPCInBlock + // Timing wait means "advance cycle but keep the same instruction group", + // otherwise later groups may observe stale local registers. 
+ if !state.TimingWaitBlocked { + if state.NextPCInBlock == -1 { + // print("PC+4 for PC=", state.PCInBlock, " X:", state.TileX, " Y:", state.TileY, "\n") + // print("Instruction at PC=", state.PCInBlock, " is ", state.SelectedBlock.InstructionGroups[state.PCInBlock].Operations[0].OpCode, "\n") + state.PCInBlock++ + } else { + // print("PC+Jump to ", state.NextPCInBlock, " X:", state.TileX, " Y:", state.TileY, "\n") + state.PCInBlock = state.NextPCInBlock + } } } if state.SelectedBlock != nil && state.PCInBlock >= int32(len(state.SelectedBlock.InstructionGroups)) { state.PCInBlock = -1 state.SelectedBlock = nil - print("PCInBlock = -1 at (", state.TileX, ",", state.TileY, ")\n") + // print("PCInBlock = -1 at (", state.TileX, ",", state.TileY, ")\n") slog.Info("Flow", "PCInBlock", "-1", "X", state.TileX, "Y", state.TileY) } state.NextPCInBlock = -1 @@ -208,43 +1317,128 @@ func (i instEmulator) RunInstructionGroup(cinst InstructionGroup, state *coreSta } func (i instEmulator) RunInstructionGroupWithSyncOps(cinst InstructionGroup, state *coreState, time float64) bool { + state.TimingWaitBlocked = false + state.StallReason = "" + state.StallOpID = 0 + state.StallOpCode = "" + state.OpInputReadCache = make(map[string]cgra.Data) + if state.PendingSyncGroup != nil { + return i.advancePendingSyncGroup(state) + } + if state.EnableFIFOModel { + return i.runInstructionGroupWithSyncOpsTwoPhase(cinst, state, time) + } + return i.runInstructionGroupWithSyncOpsLegacy(cinst, state, time) +} + +func (i instEmulator) runInstructionGroupWithSyncOpsLegacy(cinst InstructionGroup, state *coreState, time float64) bool { run := true + type evaluatedDecision struct { + operation Operation + decision issueDecision + occurrenceIndex int + } + evaluated := make([]evaluatedDecision, 0, len(cinst.Operations)) for _, operation := range cinst.Operations { - if (!i.CareFlags) || operation.InvalidIterations > 0 || i.CheckFlags(operation, state) { + decision := i.issueDecision(operation, state) + 
i.applyIssueDecision(operation, state, decision) + evaluated = append(evaluated, evaluatedDecision{ + operation: operation, + decision: decision, + occurrenceIndex: state.nextOpOccurrenceIndex(operation.ID), + }) + if decision.CanIssue { continue - } else { - run = false - break } + run = false + break } if run { - // Collect all results first + deferredLatency, representativeID, representativeOp, deferGroup := i.deferredSyncGroupLatency(cinst, state) allResults := make(map[Operand]cgra.Data) + invalidDecrements := make([]int, 0) for index := range cinst.Operations { - // Get reference to the original operation in state.SelectedBlock operation := &state.SelectedBlock.InstructionGroups[state.PCInBlock].Operations[index] - // Decrement InvalidIterations before running if needed if operation.InvalidIterations > 0 { - print("Invalid iteration for ", operation.OpCode, "@(", state.TileX, ",", state.TileY, ")\n") - operation.InvalidIterations-- + if deferGroup { + invalidDecrements = append(invalidDecrements, index) + } else { + operation.InvalidIterations-- + } continue } + occurrenceIndex := state.nextOpOccurrenceIndex(operation.ID) + decision := evaluated[index].decision + if observation, ok := i.readyHeldObservationFor(*operation, state, decision, occurrenceIndex, true); ok { + i.emitReadyHeldObservation(observation) + } results := i.RunOperation(*operation, state, time) - // Merge results into allResults + state.advanceOpOccurrenceIndex(operation.ID) + i.advanceDerivedTimingCursor(*operation, state) for operand, value := range results { allResults[operand] = value } - //print("RunOperation", operation.OpCode, "@(", state.TileX, ",", state.TileY, ")", time, ":", "YES", "\n") } - // Write all results at once + if deferGroup { + state.PendingSyncGroup = &pendingSyncGroup{ + RemainingCycles: deferredLatency - 1, + BufferedResults: allResults, + InvalidDecrements: invalidDecrements, + RepresentativeID: representativeID, + RepresentativeOp: representativeOp, + } + 
state.TimingWaitBlocked = true + return true + } for operand, value := range allResults { i.writeOperand(operand, value, state) } + } else { + for _, eval := range evaluated { + if observation, ok := i.readyHeldObservationFor(eval.operation, state, eval.decision, eval.occurrenceIndex, false); ok { + i.emitReadyHeldObservation(observation) + } + } + } + if state.TimingWaitBlocked { + if !run && state.StallReason != "" { + Trace( + "Stall", + "Behavior", state.StallReason, + "Policy", normalizeExecutionPolicyString(i.ExecutionPolicy), + "Time", float64(state.CurrentCycle), + "X", state.TileX, + "Y", state.TileY, + "ID", state.StallOpID, + "OpCode", state.StallOpCode, + ) + } + return true + } + if !run && state.StallReason != "" { + Trace( + "Stall", + "Behavior", state.StallReason, + "Policy", normalizeExecutionPolicyString(i.ExecutionPolicy), + "Time", float64(state.CurrentCycle), + "X", state.TileX, + "Y", state.TileY, + "ID", state.StallOpID, + "OpCode", state.StallOpCode, + ) } return run } func (i instEmulator) RunInstructionGroupWithAsyncOps(cinst InstructionGroup, state *coreState, time float64) { + if state.EnableFIFOModel { + i.runInstructionGroupWithAsyncOpsTwoPhase(cinst, state, time) + return + } + i.runInstructionGroupWithAsyncOpsLegacy(cinst, state, time) +} + +func (i instEmulator) runInstructionGroupWithAsyncOpsLegacy(cinst InstructionGroup, state *coreState, time float64) { // Collect all results first allResults := make(map[Operand]cgra.Data) for index := range cinst.Operations { @@ -254,16 +1448,18 @@ func (i instEmulator) RunInstructionGroupWithAsyncOps(cinst InstructionGroup, st } // Get reference to the original operation in state.SelectedBlock operation := &state.SelectedBlock.InstructionGroups[state.PCInBlock].Operations[index] - if (!i.CareFlags) || operation.InvalidIterations > 0 || i.CheckFlags(*operation, state) { // can also only choose one (another pattern) + if i.canIssue(*operation, state) { // can also only choose one (another 
pattern) state.CurrReservationState.ReservationMap[index] = false state.CurrReservationState.OpToExec-- // Decrement InvalidIterations before running if needed if operation.InvalidIterations > 0 { - print("Invalid iteration for ", operation.OpCode, "@(", state.TileX, ",", state.TileY, ")\n") + // print("Invalid iteration for ", operation.OpCode, "@(", state.TileX, ",", state.TileY, ")\n") operation.InvalidIterations-- continue } results := i.RunOperation(*operation, state, time) + state.advanceOpOccurrenceIndex(operation.ID) + i.advanceDerivedTimingCursor(*operation, state) // Merge results into allResults for operand, value := range results { allResults[operand] = value @@ -279,6 +1475,166 @@ func (i instEmulator) RunInstructionGroupWithAsyncOps(cinst InstructionGroup, st } } +func (i instEmulator) runInstructionGroupWithSyncOpsTwoPhase(cinst InstructionGroup, state *coreState, time float64) bool { + workState := state.cloneForEval() + run := true + type evaluatedDecision struct { + operation Operation + decision issueDecision + occurrenceIndex int + } + evaluated := make([]evaluatedDecision, 0, len(cinst.Operations)) + for _, operation := range cinst.Operations { + decision := i.issueDecision(operation, workState) + i.applyIssueDecision(operation, workState, decision) + evaluated = append(evaluated, evaluatedDecision{ + operation: operation, + decision: decision, + occurrenceIndex: workState.nextOpOccurrenceIndex(operation.ID), + }) + if decision.CanIssue { + continue + } + run = false + break + } + + if !run { + for _, eval := range evaluated { + if observation, ok := i.readyHeldObservationFor(eval.operation, workState, eval.decision, eval.occurrenceIndex, false); ok { + i.emitReadyHeldObservation(observation) + } + } + state.TimingWaitBlocked = workState.TimingWaitBlocked + state.StallReason = workState.StallReason + state.StallOpID = workState.StallOpID + state.StallOpCode = workState.StallOpCode + if state.TimingWaitBlocked { + if state.StallReason != "" { + 
Trace( + "Stall", + "Behavior", state.StallReason, + "Policy", normalizeExecutionPolicyString(i.ExecutionPolicy), + "Time", float64(state.CurrentCycle), + "X", state.TileX, + "Y", state.TileY, + "ID", state.StallOpID, + "OpCode", state.StallOpCode, + ) + } + return true + } + if state.StallReason != "" { + Trace( + "Stall", + "Behavior", state.StallReason, + "Policy", normalizeExecutionPolicyString(i.ExecutionPolicy), + "Time", float64(state.CurrentCycle), + "X", state.TileX, + "Y", state.TileY, + "ID", state.StallOpID, + "OpCode", state.StallOpCode, + ) + } + return false + } + + deferredLatency, representativeID, representativeOp, deferGroup := i.deferredSyncGroupLatency(cinst, state) + invalidDecrements := make([]int, 0) + issuedObservations := make([]readyHeldObservation, 0, len(cinst.Operations)) + bufferedResults := make(map[Operand]cgra.Data) + for index, operation := range cinst.Operations { + if operation.InvalidIterations > 0 { + invalidDecrements = append(invalidDecrements, index) + continue + } + occurrenceIndex := workState.nextOpOccurrenceIndex(operation.ID) + decision := evaluated[index].decision + if observation, ok := i.readyHeldObservationFor(operation, workState, decision, occurrenceIndex, true); ok { + issuedObservations = append(issuedObservations, observation) + } + results := i.RunOperation(operation, workState, time) + workState.advanceOpOccurrenceIndex(operation.ID) + i.advanceDerivedTimingCursor(operation, workState) + for operand, value := range results { + if deferGroup { + i.bufferDeferredResult(operand, value, workState, bufferedResults) + continue + } + i.writeOperand(operand, value, workState) + } + } + if deferGroup { + i.applyDeferredSyncIssueState(state, workState) + for _, observation := range issuedObservations { + i.emitReadyHeldObservation(observation) + } + state.PendingSyncGroup = &pendingSyncGroup{ + RemainingCycles: deferredLatency - 1, + BufferedResults: bufferedResults, + InvalidDecrements: invalidDecrements, + 
RepresentativeID: representativeID, + RepresentativeOp: representativeOp, + } + state.TimingWaitBlocked = true + return true + } + *state = *workState + for _, observation := range issuedObservations { + i.emitReadyHeldObservation(observation) + } + i.applyInvalidIterationDecrements(state, invalidDecrements) + return true +} + +func (i instEmulator) runInstructionGroupWithAsyncOpsTwoPhase(cinst InstructionGroup, state *coreState, time float64) { + workState := state.cloneForEval() + allResults := make(map[Operand]cgra.Data) + invalidDecrements := make([]int, 0) + for index, operation := range cinst.Operations { + if !workState.CurrReservationState.ReservationMap[index] { + continue + } + if i.canIssue(operation, workState) { + workState.CurrReservationState.ReservationMap[index] = false + workState.CurrReservationState.OpToExec-- + if operation.InvalidIterations > 0 { + invalidDecrements = append(invalidDecrements, index) + continue + } + results := i.RunOperation(operation, workState, time) + workState.advanceOpOccurrenceIndex(operation.ID) + i.advanceDerivedTimingCursor(operation, workState) + for operand, value := range results { + allResults[operand] = value + } + } + } + for operand, value := range allResults { + i.writeOperand(operand, value, workState) + } + *state = *workState + i.applyInvalidIterationDecrements(state, invalidDecrements) +} + +func (i instEmulator) applyInvalidIterationDecrements(state *coreState, indices []int) { + if len(indices) == 0 || state == nil || state.SelectedBlock == nil { + return + } + if state.PCInBlock < 0 || int(state.PCInBlock) >= len(state.SelectedBlock.InstructionGroups) { + return + } + operations := state.SelectedBlock.InstructionGroups[state.PCInBlock].Operations + for _, idx := range indices { + if idx < 0 || idx >= len(operations) { + continue + } + if operations[idx].InvalidIterations > 0 { + operations[idx].InvalidIterations-- + } + } +} + func (i instEmulator) normalizeDirection(s string) string { u := 
strings.ToUpper(s) switch u { @@ -305,34 +1661,38 @@ func (i instEmulator) normalizeDirection(s string) string { } } -func (i instEmulator) CheckFlags(inst Operation, state *coreState) bool { - //PrintState(state) - flag := true +func (i instEmulator) checkIssueReadinessDetails(inst Operation, state *coreState) issueReadiness { + readiness := issueReadiness{ + OperandsReady: true, + PredicateReadyOrTrue: true, + ResourcesAvailable: true, + Ready: true, + } + for index, src := range inst.SrcOperands.Operands { if index == 1 { if inst.OpCode == "PHI_CONST" || inst.OpCode == "PHI_START" { - // Track PHI_CONST per instruction to avoid cross-interference. var stateKey string if inst.OpCode == "PHI_CONST" { stateKey = fmt.Sprintf("PhiConst_%d", inst.ID) } else if inst.OpCode == "PHI_START" { stateKey = fmt.Sprintf("PhiStart_%d", inst.ID) } - if state.States[stateKey] == nil || state.States[stateKey] == false { // first execution + if state.States[stateKey] == nil || state.States[stateKey] == false { if len(inst.SrcOperands.Operands) > 1 { - fmt.Println("ID", inst.ID, "bypass check") continue - } else { - panic("PHI_CONST or PHI_START must have two sources") } + panic("PHI_CONST or PHI_START must have two sources") } } } srcImpl := i.normalizeDirection(src.Impl) if state.Directions[srcImpl] { - if !state.RecvBufHeadReady[i.getColorIndex(src.Color)][i.getDirecIndex(srcImpl)] { - flag = false - break + if state.recvQueueLen(i.getColorIndex(src.Color), i.getDirecIndex(srcImpl)) == 0 { + readiness.OperandsReady = false + readiness.Ready = false + readiness.WaitReason = StallReasonOperandWait + return readiness } } } @@ -340,15 +1700,38 @@ func (i instEmulator) CheckFlags(inst Operation, state *coreState) bool { for _, dst := range inst.DstOperands.Operands { dstImpl := i.normalizeDirection(dst.Impl) if state.Directions[dstImpl] { - if state.SendBufHeadBusy[i.getColorIndex(dst.Color)][i.getDirecIndex(dstImpl)] { - flag = false - break + if 
state.sendQueueIsFull(i.getColorIndex(dst.Color), i.getDirecIndex(dstImpl)) { + Trace( + "Backpressure", + "Time", float64(state.CurrentCycle), + "X", state.TileX, + "Y", state.TileY, + "OpCode", inst.OpCode, + "ID", inst.ID, + "Reason", "SendBufBusy", + "DstDir", dstImpl, + "Color", dst.Color, + "Policy", normalizeExecutionPolicyString(i.ExecutionPolicy), + ) + readiness.ResourcesAvailable = false + readiness.Ready = false + readiness.WaitReason = StallReasonOutputBlocked + return readiness } } } - //fmt.Println("[CheckFlags] checking flags for inst", inst.OpCode, "@(", state.TileX, ",", state.TileY, "):", flag) - fmt.Println("Check", inst.OpCode, "ID", inst.ID, "@(", state.TileX, ",", state.TileY, "):", flag) - return flag + + return readiness +} + +func (i instEmulator) checkIssueReadiness(inst Operation, state *coreState) (bool, string) { + readiness := i.checkIssueReadinessDetails(inst, state) + return readiness.Ready, readiness.WaitReason +} + +func (i instEmulator) CheckFlags(inst Operation, state *coreState) bool { + ready, _ := i.checkIssueReadiness(inst, state) + return ready } func (i instEmulator) RunOperation(inst Operation, state *coreState, time float64) map[Operand]cgra.Data { @@ -467,13 +1850,39 @@ func (i instEmulator) readOperand(operand Operand, state *coreState) (value cgra //fmt.Println("operand.Impl", operand.Impl) // must first check it is ready color, direction := i.getColorIndex(operand.Color), i.getDirecIndex(normalizedImpl) - value = state.RecvBufHead[color][direction] - // set the ready flag to false + cacheKey := fmt.Sprintf("%d:%d", color, direction) + if state.Mode == SyncOp { + if cached, ok := state.OpInputReadCache[cacheKey]; ok { + return cached + } + } + peek, ok := state.recvQueuePeek(color, direction) + if !ok { + if state.Mode == SyncOp { + // In sync mode, all ops in the same instruction group share one + // snapshot of input heads. 
If a previous op consumed this queue + // head earlier in the same tick, keep returning the snapshot. + fallback := state.RecvBufHead[color][direction] + state.OpInputReadCache[cacheKey] = fallback + return fallback + } + panic(fmt.Sprintf("operand queue unexpectedly empty in async mode: %v", operand)) + } + value = peek + // consume queue head according to existing sync/async rules if state.Mode == SyncOp { - state.RecvBufHeadReady[color][direction] = false + consumed, ok := state.recvQueueConsume(color, direction) + if !ok { + panic(fmt.Sprintf("operand queue consume failed in sync mode: %v", operand)) + } + value = consumed + state.OpInputReadCache[cacheKey] = value } else { if !state.CurrReservationState.DecrementRefCount(operand, state) { - state.RecvBufHeadReady[color][direction] = false // no longer used, closed + // no longer used, pop queue head + if _, ok := state.recvQueueConsume(color, direction); !ok { + panic(fmt.Sprintf("operand queue consume failed in async mode: %v", operand)) + } //fmt.Println("Reduce {", operand.Impl, "} to zero") } else { //fmt.Println("Reduce {", operand.Impl, "} to ", state.CurrReservationState.RefCountRuntime[operand.Impl], "@(", state.TileX, ",", state.TileY, ")") @@ -523,12 +1932,13 @@ func (i instEmulator) writeOperand(operand Operand, value cgra.Data, state *core } else { normalizedImpl := i.normalizeDirection(operand.Impl) if state.Directions[normalizedImpl] { - if state.SendBufHeadBusy[i.getColorIndex(operand.Color)][i.getDirecIndex(normalizedImpl)] { + color := i.getColorIndex(operand.Color) + direction := i.getDirecIndex(normalizedImpl) + if state.sendQueueIsFull(color, direction) { //fmt.Printf("sendbufhead busy\n") return } - state.SendBufHeadBusy[i.getColorIndex(operand.Color)][i.getDirecIndex(normalizedImpl)] = true - state.SendBufHead[i.getColorIndex(operand.Color)][i.getDirecIndex(normalizedImpl)] = value + state.sendQueuePush(color, direction, value) } else { panic(fmt.Sprintf("Invalid operand %v in 
writeOperand; expected register", operand)) } @@ -682,9 +2092,19 @@ func (i instEmulator) runLoadDirect(inst Operation, state *coreState) map[Operan src1 := inst.SrcOperands.Operands[0] addrStruct := i.readOperand(src1, state) addr := addrStruct.First() + finalPred := addrStruct.Pred + results := make(map[Operand]cgra.Data) + + // Predicated-off load should not touch memory or trigger bounds checks. + if !finalPred { + for _, dst := range inst.DstOperands.Operands { + results[dst] = cgra.NewScalarWithPred(0, false) + } + return results + } if addr >= uint32(len(state.Memory)) { - panic("memory address out of bounds") + panic("memory address out of bounds, addr: " + strconv.Itoa(int(addr)) + ", len(state.Memory): " + strconv.Itoa(len(state.Memory))) } value := state.Memory[addr] slog.Warn("Memory", @@ -695,8 +2115,6 @@ func (i instEmulator) runLoadDirect(inst Operation, state *coreState) map[Operan "X", state.TileX, "Y", state.TileY, ) - finalPred := addrStruct.Pred - results := make(map[Operand]cgra.Data) for _, dst := range inst.DstOperands.Operands { results[dst] = cgra.NewScalarWithPred(value, finalPred) } @@ -754,6 +2172,11 @@ func (i instEmulator) runStoreDirect(inst Operation, state *coreState) map[Opera src2 := inst.SrcOperands.Operands[1] addrStruct := i.readOperand(src2, state) addr := addrStruct.First() + finalPred := addrStruct.Pred && valueStruct.Pred + if !finalPred { + Trace("Inst", "Time", state.CurrentTime, "OpCode", inst.OpCode, "ID", inst.ID, "X", state.TileX, "Y", state.TileY, "Pred", finalPred) + return make(map[Operand]cgra.Data) + } if addr >= uint32(len(state.Memory)) { panic("memory address out of bounds, addr: " + strconv.Itoa(int(addr)) + ", len(state.Memory): " + strconv.Itoa(len(state.Memory))) } @@ -765,7 +2188,6 @@ func (i instEmulator) runStoreDirect(inst Operation, state *coreState) map[Opera "Y", state.TileY, ) state.Memory[addr] = value - finalPred := addrStruct.Pred && valueStruct.Pred Trace("Inst", "Time", state.CurrentTime, 
"OpCode", inst.OpCode, "ID", inst.ID, "X", state.TileX, "Y", state.TileY, "Pred", finalPred) // elect no next PC return make(map[Operand]cgra.Data) @@ -921,7 +2343,7 @@ func (i instEmulator) runSub(inst Operation, state *coreState) map[Operand]cgra. dstValSigned := src1Signed - src2Signed dstVal := uint32(dstValSigned) - fmt.Printf("ISUB: Subtracting %d (src1) - %d (src2) = %d\n", src1Signed, src2Signed, dstValSigned) + // fmt.Printf("ISUB: Subtracting %d (src1) - %d (src2) = %d\n", src1Signed, src2Signed, dstValSigned) finalPred := src1Struct.Pred && src2Struct.Pred results := make(map[Operand]cgra.Data) @@ -973,8 +2395,9 @@ func (i instEmulator) runMulAdd(inst Operation, state *coreState) map[Operand]cg s2Val := int32(s2.First()) dstValSigned := s0Val*s1Val + s2Val dstVal := uint32(dstValSigned) - finalPred := s0.Pred && s1.Pred && s2.Pred - + //finalPred := s0.Pred && s1.Pred && s2.Pred + //Only for systolic array currently. if need for other cases, please modify the finalPred calculation. 
+ finalPred := s0.Pred && s1.Pred results := make(map[Operand]cgra.Data) for _, dst := range inst.DstOperands.Operands { results[dst] = cgra.NewScalarWithPred(dstVal, finalPred) @@ -1201,9 +2624,9 @@ func (i instEmulator) runRetImm(inst Operation, state *coreState, time float64) *state.retVal = srcVal *state.exit = true *state.requestExitTimestamp = time - fmt.Println("++++++++++++ RETURN executed", srcVal, "T=", time) - } else { - fmt.Println("++++++++++++ RETURN bypassed") + // fmt.Println("++++++++++++ RETURN executed", srcVal, "T=", time) + // } else { + // fmt.Println("++++++++++++ RETURN bypassed") } } else { panic("RETURN_VALUE requires a source operand") @@ -1229,9 +2652,9 @@ func (i instEmulator) runRetDelay(inst Operation, state *coreState, time float64 *state.retVal = 0 *state.exit = true *state.requestExitTimestamp = time + ExitDelay - fmt.Println("++++++++++++ RETURN executed", srcVal, "T=", time) - } else { - fmt.Println("++++++++++++ RETURN bypassed") + // fmt.Println("++++++++++++ RETURN executed", srcVal, "T=", time) + // } else { + // fmt.Println("++++++++++++ RETURN bypassed") } } else { panic("RETURN_VOID requires a source operand") @@ -1338,14 +2761,14 @@ func (i instEmulator) runCmpExport(inst Operation, state *coreState) map[Operand for _, dst := range inst.DstOperands.Operands { results[dst] = cgra.NewScalarWithPred(1, finalPred) } - fmt.Println(">>>>>>>>>>>>>>> ICMP_EQ: ", src1Val.First(), src2Val.First(), "Yes") + // fmt.Println(">>>>>>>>>>>>>>> ICMP_EQ: ", src1Val.First(), src2Val.First(), "Yes") } else { finalPred = src1Val.Pred resultVal = 0 for _, dst := range inst.DstOperands.Operands { results[dst] = cgra.NewScalarWithPred(0, finalPred) } - fmt.Println(">>>>>>>>>>>>>>> ICMP_EQ: ", src1Val.First(), src2Val.First(), "No") + // fmt.Println(">>>>>>>>>>>>>>> ICMP_EQ: ", src1Val.First(), src2Val.First(), "No") } Trace("Inst", "Time", state.CurrentTime, "OpCode", inst.OpCode, "ID", inst.ID, "X", state.TileX, "Y", state.TileY, "Src1", 
fmt.Sprintf("%d(%t)", src1Val.First(), src1Val.Pred), "Src2", fmt.Sprintf("%d(%t)", src2Val.First(), src2Val.Pred), "Result", fmt.Sprintf("%d(%t)", resultVal, finalPred)) return results @@ -1555,13 +2978,13 @@ func (i instEmulator) runPhiStart(inst Operation, state *coreState) map[Operand] results := make(map[Operand]cgra.Data) if state.States[stateKey] == nil || state.States[stateKey] == false { // first execution - if !src1Pred { - panic("Predicate of first time PHI_START must be true at (" + strconv.Itoa(int(state.TileX)) + "," + strconv.Itoa(int(state.TileY)) + ") instruction " + strconv.Itoa(inst.ID)) - } + // if !src1Pred { + // panic("Predicate of first time PHI_START must be true at (" + strconv.Itoa(int(state.TileX)) + "," + strconv.Itoa(int(state.TileY)) + ") instruction " + strconv.Itoa(inst.ID)) + // } result = src1Val finalPred = src1Pred state.States[stateKey] = true - fmt.Println("set state.States[", stateKey, "] to true") + // fmt.Println("set state.States[", stateKey, "] to true") for _, dst := range inst.DstOperands.Operands { results[dst] = cgra.NewScalarWithPred(result, finalPred) } @@ -1616,7 +3039,7 @@ func (i instEmulator) runGrantPred(inst Operation, state *coreState) map[Operand results[dst] = cgra.NewScalarWithPred(srcVal, finalPred) } - fmt.Println("<<<<<<<<<<<<<< GRANTPRED: ", srcVal, predVal, finalPred) + // fmt.Println("<<<<<<<<<<<<<< GRANTPRED: ", srcVal, predVal, finalPred) Trace("Inst", "Time", state.CurrentTime, "OpCode", inst.OpCode, "ID", inst.ID, "X", state.TileX, "Y", state.TileY, "SrcOperand", fmt.Sprintf("%d(%t)", srcVal, srcStruct.Pred), "PredOperand", fmt.Sprintf("%d(%t)", predVal, predStruct.Pred), "Pred", finalPred, "Result", fmt.Sprintf("%d(%t)", srcVal, finalPred)) // elect no next PC diff --git a/core/execution_policy_test.go b/core/execution_policy_test.go new file mode 100644 index 0000000..5304441 --- /dev/null +++ b/core/execution_policy_test.go @@ -0,0 +1,812 @@ +package core + +import ( + "bytes" + 
"encoding/json" + "log/slog" + "os" + "strings" + "testing" +) + +type readyHeldLog struct { + RunMode string `json:"run_mode"` + Cycle int64 `json:"cycle"` + X int `json:"X"` + Y int `json:"Y"` + ID int `json:"ID"` + OccurrenceIndex int `json:"occurrence_index"` + OpCode string `json:"OpCode"` + AnnotatedTimeT *int64 `json:"annotated_time_t"` + OperandsReady bool `json:"operands_ready"` + PredicateReadyOrTrue bool `json:"predicate_ready_or_true"` + ResourcesAvailable bool `json:"resources_available"` + TimingGateSatisfied bool `json:"timing_gate_satisfied"` + FireableExceptTime bool `json:"fireable_except_time"` + BlockedByLowerBound bool `json:"blocked_by_lower_bound"` + IssuedThisCycle bool `json:"issued_this_cycle"` +} + +func newPolicyTestState() coreState { + state := coreState{ + Directions: map[string]bool{ + "North": true, + "East": true, + "South": true, + "West": true, + "NorthEast": true, + "SouthEast": true, + "SouthWest": true, + "NorthWest": true, + "Router": true, + }, + RecvBufHeadReady: make([][]bool, 4), + SendBufHeadBusy: make([][]bool, 4), + OpTimingCursor: make(map[int]int), + OpTimingLate: make(map[int]bool), + OpTimingRollCycle: make(map[int]int64), + OpIssueCount: make(map[int]int), + TimingWaitBlocked: false, + StallReason: "", + StallOpID: 0, + StallOpCode: "", + } + + for i := 0; i < 4; i++ { + state.RecvBufHeadReady[i] = make([]bool, 12) + state.SendBufHeadBusy[i] = make([]bool, 12) + } + + return state +} + +func TestCanIssueInOrderIgnoresTimeStep(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyInOrderDataflow, + } + state := newPolicyTestState() + state.CurrentCycle = 0 + op := Operation{ + OpCode: "NOP", + TimeStep: 10, + } + + if !emu.canIssue(op, &state) { + t.Fatalf("in_order_dataflow should ignore timestep and allow ready op") + } +} + +func TestCanIssueElasticScheduled(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyElasticScheduled, + } + 
state := newPolicyTestState() + state.Code.CompiledII = 10 // must have schedule so elastic time-gating applies + op := Operation{ + OpCode: "NOP", + TimeStep: 5, + } + + state.CurrentCycle = 4 + if emu.canIssue(op, &state) { + t.Fatalf("elastic_scheduled should block before timestep") + } + + state.CurrentCycle = 5 + if !emu.canIssue(op, &state) { + t.Fatalf("elastic_scheduled should allow at timestep when ready") + } +} + +func TestCanIssueElasticScheduledWithCompiledIIConversion(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyElasticScheduled, + } + state := newPolicyTestState() + state.Code.CompiledII = 4 + op := Operation{ + OpCode: "NOP", + TimeStep: 1, + } + + state.CurrentCycle = 0 // step 0 + if emu.canIssue(op, &state) { + t.Fatalf("elastic_scheduled should block before converted step") + } + + state.CurrentCycle = 2 // step 2 + if !emu.canIssue(op, &state) { + t.Fatalf("elastic_scheduled should allow when converted step >= time_step") + } + + state.CurrentCycle = 5 // step 1 (5 %% 4) + if !emu.canIssue(op, &state) { + t.Fatalf("elastic_scheduled should allow on converted matching step") + } +} + +func TestCanIssueStrictTimedViolation(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + } + state := newPolicyTestState() + state.Code.CompiledII = 4 // must have schedule so strict time check runs + state.CurrentCycle = 6 // step 2 (6%4); op was for step 1 → missed step, violation + state.RecvBufHeadReady[0][0] = true // North dir slot 0 ready so CheckFlags passes + op := Operation{ + OpCode: "DATA_MOV", + TimeStep: 1, // scheduled step 1; current step 2 > 1 → panic + SrcOperands: OperandList{ + Operands: []Operand{ + {Impl: "North", Color: "R"}, + }, + }, + } + + defer func() { + recovered := recover() + if recovered == nil { + t.Fatalf("expected strict_timed synchronization violation panic") + } + if !strings.Contains(recovered.(string), "synchronization 
violation") { + t.Fatalf("unexpected panic: %v", recovered) + } + }() + + _ = emu.canIssue(op, &state) +} + +func TestCanIssueStrictTimedWithCompiledIIConversion(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + } + state := newPolicyTestState() + state.Code.CompiledII = 4 + op := Operation{ + OpCode: "NOP", + TimeStep: 1, + } + + state.CurrentCycle = 5 // step 1 + if !emu.canIssue(op, &state) { + t.Fatalf("strict_timed should allow when converted step equals time_step") + } + + state.CurrentCycle = 6 // step 2: missed exact step + defer func() { + recovered := recover() + if recovered == nil { + t.Fatalf("expected strict_timed missed-step synchronization violation") + } + if !strings.Contains(recovered.(string), "missed its exact scheduled step") { + t.Fatalf("unexpected panic: %v", recovered) + } + }() + _ = emu.canIssue(op, &state) +} + +func TestCanIssueStrictTimedWithDerivedTiming(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + StrictMaxSlip: 4, + StrictFailOnViolation: false, + } + state := newPolicyTestState() + state.Code.DerivedTiming = map[int][]int64{ + 7: []int64{5}, + } + op := Operation{ + OpCode: "NOP", + ID: 7, + } + + state.CurrentCycle = 4 + if emu.canIssue(op, &state) { + t.Fatalf("strict_timed should block before derived cycle") + } + + state.CurrentCycle = 5 + if !emu.canIssue(op, &state) { + t.Fatalf("strict_timed should allow exactly on derived cycle when ready") + } + + state.CurrentCycle = 6 + if !emu.canIssue(op, &state) { + t.Fatalf("strict_timed should allow late issue after derived cycle when ready") + } +} + +func TestCanIssueElasticScheduledWithDerivedTiming(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyElasticScheduled, + } + state := newPolicyTestState() + state.Code.DerivedTiming = map[int][]int64{ + 9: []int64{5}, + } + op := Operation{ + OpCode: "NOP", + ID: 9, + } + + 
state.CurrentCycle = 4 + if emu.canIssue(op, &state) { + t.Fatalf("elastic_scheduled should block before derived cycle") + } + + state.CurrentCycle = 6 + if !emu.canIssue(op, &state) { + t.Fatalf("elastic_scheduled should allow after derived cycle when ready") + } +} + +func TestRunInstructionGroupWithSyncOpsKeepsAliveOnDerivedTimingWait(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + } + state := newPolicyTestState() + state.Code.DerivedTiming = map[int][]int64{ + 13: []int64{5}, + } + state.CurrentCycle = 4 + + group := InstructionGroup{ + Operations: []Operation{ + { + OpCode: "NOP", + ID: 13, + }, + }, + } + + progress := emu.RunInstructionGroupWithSyncOps(group, &state, 0) + if !progress { + t.Fatalf("timing wait should keep core ticking until derived cycle is reached") + } + if !state.TimingWaitBlocked { + t.Fatalf("expected timing wait marker to be set") + } +} + +func TestCanIssueStrictTimedDerivedTimingNotReady(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + StrictMaxSlip: 4, + StrictFailOnViolation: false, + } + state := newPolicyTestState() + state.Code.DerivedTiming = map[int][]int64{ + 11: []int64{5}, + } + op := Operation{ + OpCode: "DATA_MOV", + ID: 11, + SrcOperands: OperandList{ + Operands: []Operand{ + {Impl: "North", Color: "R"}, + }, + }, + } + + state.CurrentCycle = 5 + if emu.canIssue(op, &state) { + t.Fatalf("strict_timed should stall when operand is not ready on derived cycle") + } + + state.RecvBufHeadReady[0][0] = true // North-R becomes ready + state.CurrentCycle = 6 + if !emu.canIssue(op, &state) { + t.Fatalf("strict_timed should allow late issue after derived-cycle stall") + } +} + +func TestCanIssueStrictTimedDerivedTimingWindowViolationSoft(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + StrictMaxSlip: 1, + StrictFailOnViolation: false, + } + state := 
newPolicyTestState() + state.Code.CompiledII = 2 + state.Code.DerivedTiming = map[int][]int64{ + 17: []int64{5}, + } + op := Operation{ + OpCode: "NOP", + ID: 17, + } + + state.CurrentCycle = 7 // lateness=2 > max slip=1, should roll to next II window + if emu.canIssue(op, &state) { + t.Fatalf("strict_timed soft mode should stall after window violation") + } + if !state.TimingWaitBlocked { + t.Fatalf("expected timing wait after strict window violation") + } + if state.OpTimingRollCycle[op.ID] != 9 { + t.Fatalf("expected rolled cycle 9, got %d", state.OpTimingRollCycle[op.ID]) + } + + state.CurrentCycle = 8 + if emu.canIssue(op, &state) { + t.Fatalf("strict_timed should keep waiting before rolled cycle") + } + + state.CurrentCycle = 9 + if !emu.canIssue(op, &state) { + t.Fatalf("strict_timed should issue at rolled cycle when ready") + } +} + +func TestCanIssueStrictTimedDerivedTimingSetsScheduleBubbleReason(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + StrictMaxSlip: 1, + StrictFailOnViolation: false, + } + state := newPolicyTestState() + state.Code.CompiledII = 2 + state.Code.DerivedTiming = map[int][]int64{ + 21: []int64{5}, + } + op := Operation{ + OpCode: "NOP", + ID: 21, + } + + state.CurrentCycle = 7 + if emu.canIssue(op, &state) { + t.Fatalf("expected strict_timed violation to block issue") + } + if state.StallReason != StallReasonScheduleBubble { + t.Fatalf("expected schedule bubble stall reason, got %q", state.StallReason) + } +} + +func TestCanIssueStrictTimedDerivedTimingWindowViolationOverridesReadinessReason(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + StrictMaxSlip: 1, + StrictFailOnViolation: false, + } + state := newPolicyTestState() + state.Code.CompiledII = 2 + state.Code.DerivedTiming = map[int][]int64{ + 31: []int64{5}, + } + op := Operation{ + OpCode: "DATA_MOV", + ID: 31, + SrcOperands: OperandList{ + Operands: []Operand{ + 
{Impl: "North", Color: "R"}, + }, + }, + } + + state.CurrentCycle = 7 // lateness=2 > max slip, and operand not ready + if emu.canIssue(op, &state) { + t.Fatalf("expected strict_timed to block on window violation") + } + if state.StallReason != StallReasonScheduleBubble { + t.Fatalf("expected schedule bubble to override readiness reason, got %q", state.StallReason) + } + if !state.TimingWaitBlocked { + t.Fatalf("expected timing wait after strict window violation") + } + if state.OpTimingRollCycle[op.ID] != 9 { + t.Fatalf("expected rolled cycle 9, got %d", state.OpTimingRollCycle[op.ID]) + } +} + +func TestCanIssueStrictTimedDerivedTimingRollDoesNotDependOnReadiness(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + StrictMaxSlip: 1, + StrictFailOnViolation: false, + } + state := newPolicyTestState() + state.Code.CompiledII = 2 + state.Code.DerivedTiming = map[int][]int64{ + 32: []int64{5}, + } + op := Operation{ + OpCode: "DATA_MOV", + ID: 32, + SrcOperands: OperandList{ + Operands: []Operand{ + {Impl: "North", Color: "R"}, + }, + }, + } + + state.CurrentCycle = 7 // first observe violation while not ready + if emu.canIssue(op, &state) { + t.Fatalf("expected violation cycle to block") + } + if state.OpTimingRollCycle[op.ID] != 9 { + t.Fatalf("expected rolled cycle 9, got %d", state.OpTimingRollCycle[op.ID]) + } + + state.CurrentCycle = 8 + if emu.canIssue(op, &state) { + t.Fatalf("expected strict_timed to wait before rolled cycle") + } + + state.RecvBufHeadReady[0][0] = true // ready at rolled cycle + state.CurrentCycle = 9 + if !emu.canIssue(op, &state) { + t.Fatalf("expected strict_timed to issue at first legal rolled cycle when ready") + } +} + +func TestIsStrictControlSensitiveOpCoversAliasesAndFamilies(t *testing.T) { + cases := []struct { + opCode string + want bool + }{ + {opCode: "PHI_START", want: true}, + {opCode: "grant_once", want: true}, + {opCode: "ICMP_SGE", want: true}, + {opCode: "CMP_EXPORT", 
want: true}, + {opCode: "lt_ex", want: true}, + {opCode: "RETURN_VALUE", want: true}, + {opCode: "BNE", want: true}, + {opCode: "CTRL_MOV", want: true}, + {opCode: "ADD", want: false}, + {opCode: "DATA_MOV", want: false}, + {opCode: "NOP", want: false}, + } + + for _, tc := range cases { + got := isStrictControlSensitiveOp(tc.opCode) + if got != tc.want { + t.Fatalf("isStrictControlSensitiveOp(%q) = %t, want %t", tc.opCode, got, tc.want) + } + } +} + +func TestCanIssueStrictTimedDerivedTimingControlAliasSkipsWindowPenalty(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + StrictMaxSlip: 0, + StrictFailOnViolation: false, + } + state := newPolicyTestState() + state.Code.CompiledII = 2 + state.Code.DerivedTiming = map[int][]int64{ + 33: []int64{5}, + } + op := Operation{ + OpCode: "CMP_EXPORT", + ID: 33, + } + + state.CurrentCycle = 7 // lateness=2, but control-sensitive alias should skip penalty + if !emu.canIssue(op, &state) { + t.Fatalf("expected control-sensitive alias to skip finite-W replay penalty") + } + if _, exists := state.OpTimingRollCycle[op.ID]; exists { + t.Fatalf("did not expect roll cycle for control-sensitive op") + } +} + +func TestCanIssueGuidedDerivedTimingSetsOutputBlockedReason(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyElasticScheduled, + } + state := newPolicyTestState() + state.Code.DerivedTiming = map[int][]int64{ + 23: []int64{5}, + } + op := Operation{ + OpCode: "DATA_MOV", + ID: 23, + DstOperands: OperandList{ + Operands: []Operand{ + {Impl: "East", Color: "R"}, + }, + }, + } + state.CurrentCycle = 5 + state.SendBufHeadBusy[0][emu.getDirecIndex("East")] = true // East-R blocked + if emu.canIssue(op, &state) { + t.Fatalf("expected guided mode to block when output is busy") + } + if state.StallReason != StallReasonOutputBlocked { + t.Fatalf("expected output blocked stall reason, got %q", state.StallReason) + } +} + +func 
TestCanIssueStrictTimedDerivedTimingWindowViolationHard(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + StrictMaxSlip: 0, + StrictFailOnViolation: true, + } + state := newPolicyTestState() + state.Code.CompiledII = 2 + state.Code.DerivedTiming = map[int][]int64{ + 19: []int64{5}, + } + op := Operation{ + OpCode: "NOP", + ID: 19, + } + + defer func() { + recovered := recover() + if recovered == nil { + t.Fatalf("expected strict_timed hard mode panic on window violation") + } + if !strings.Contains(recovered.(string), "strict slip window violation") { + t.Fatalf("unexpected panic: %v", recovered) + } + }() + + state.CurrentCycle = 6 // lateness=1 > max slip=0 + _ = emu.canIssue(op, &state) +} + +func captureReadyHeldLogs(t *testing.T, fn func()) []readyHeldLog { + t.Helper() + + var buffer bytes.Buffer + oldLogger := slog.Default() + oldTraceEnabled := TraceEnabled() + oldObserver := traceObserver + + handler := slog.NewJSONHandler(&buffer, &slog.HandlerOptions{Level: LevelTrace}) + slog.SetDefault(slog.New(handler)) + SetTraceEnabled(true) + traceObserver = nil + defer func() { + traceObserver = oldObserver + SetTraceEnabled(oldTraceEnabled) + slog.SetDefault(oldLogger) + }() + + fn() + + logs := make([]readyHeldLog, 0) + for _, line := range strings.Split(strings.TrimSpace(buffer.String()), "\n") { + if strings.TrimSpace(line) == "" { + continue + } + var entry struct { + Msg string `json:"msg"` + readyHeldLog + } + if err := json.Unmarshal([]byte(line), &entry); err != nil { + t.Fatalf("unmarshal trace line: %v", err) + } + if entry.Msg != "ReadyHeld" { + continue + } + logs = append(logs, entry.readyHeldLog) + } + return logs +} + +func newSyncTraceState(group InstructionGroup) coreState { + state := newPolicyTestState() + state.SelectedBlock = &EntryBlock{InstructionGroups: []InstructionGroup{group}} + state.PCInBlock = 0 + state.ReadyHeldTraceEnabled = true + state.ReadyHeldRunMode = "lower_bound" + 
return state +} + +func TestIssueDecisionElasticScheduledDerivedTimingReadyButHeld(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyElasticScheduled, + } + state := newPolicyTestState() + state.Code.DerivedTiming = map[int][]int64{9: {5}} + state.CurrentCycle = 4 + operation := Operation{OpCode: "NOP", ID: 9} + + decision := emu.issueDecision(operation, &state) + if decision.AnnotatedTimeT == nil || *decision.AnnotatedTimeT != 5 { + t.Fatalf("annotated_time_t = %v, want 5", decision.AnnotatedTimeT) + } + if !decision.OperandsReady || !decision.PredicateReadyOrTrue || !decision.ResourcesAvailable { + t.Fatalf("expected all non-timing readiness gates to pass: %+v", decision) + } + if decision.TimingGateSatisfied { + t.Fatalf("expected timing gate to be unsatisfied before annotated cycle") + } + if !decision.FireableExceptTime { + t.Fatalf("expected fireable_except_time=true when only lower-bound timing blocks issue") + } + if !decision.BlockedByLowerBound { + t.Fatalf("expected blocked_by_lower_bound=true") + } + if decision.CanIssue { + t.Fatalf("expected can_issue=false before annotated cycle") + } + + emu.applyIssueDecision(operation, &state, decision) + if !state.TimingWaitBlocked { + t.Fatalf("expected timing wait marker after applying decision") + } + if state.StallReason != StallReasonScheduleBubble { + t.Fatalf("stall reason = %q, want %q", state.StallReason, StallReasonScheduleBubble) + } +} + +func TestIssueDecisionElasticScheduledDerivedTimingOutputBlocked(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyElasticScheduled, + } + state := newPolicyTestState() + state.Code.DerivedTiming = map[int][]int64{23: {5}} + state.CurrentCycle = 5 + state.SendBufHeadBusy[0][emu.getDirecIndex("East")] = true + operation := Operation{ + OpCode: "DATA_MOV", + ID: 23, + DstOperands: OperandList{Operands: []Operand{{Impl: "East", Color: "R"}}}, + } + + decision := 
emu.issueDecision(operation, &state) + if decision.ResourcesAvailable { + t.Fatalf("expected resources_available=false") + } + if decision.FireableExceptTime { + t.Fatalf("expected fireable_except_time=false when output credit is missing") + } + if decision.BlockedByLowerBound { + t.Fatalf("expected blocked_by_lower_bound=false when non-timing checks already fail") + } + if decision.CanIssue { + t.Fatalf("expected can_issue=false when output is blocked") + } +} + +func TestIssueDecisionElasticScheduledDerivedTimingOperandWait(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyElasticScheduled, + } + state := newPolicyTestState() + state.Code.DerivedTiming = map[int][]int64{11: {5}} + state.CurrentCycle = 5 + operation := Operation{ + OpCode: "DATA_MOV", + ID: 11, + SrcOperands: OperandList{Operands: []Operand{{Impl: "North", Color: "R"}}}, + } + + decision := emu.issueDecision(operation, &state) + if decision.OperandsReady { + t.Fatalf("expected operands_ready=false") + } + if decision.FireableExceptTime { + t.Fatalf("expected fireable_except_time=false when operands are not ready") + } + if decision.BlockedByLowerBound { + t.Fatalf("expected blocked_by_lower_bound=false when operand wait is the real blocker") + } +} + +func TestRunInstructionGroupWithSyncOpsEmitsBlockedThenIssuedReadyHeld(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyElasticScheduled, + } + group := InstructionGroup{Operations: []Operation{{OpCode: "NOP", ID: 41}}} + state := newSyncTraceState(group) + state.Code.DerivedTiming = map[int][]int64{41: {5}} + + logs := captureReadyHeldLogs(t, func() { + state.CurrentCycle = 4 + if !emu.RunInstructionGroupWithSyncOps(group, &state, 4) { + t.Fatalf("expected timing wait to keep sync core alive") + } + state.CurrentCycle = 5 + if !emu.RunInstructionGroupWithSyncOps(group, &state, 5) { + t.Fatalf("expected issued cycle to report progress") + } + }) + + if len(logs) != 2 
{ + t.Fatalf("expected 2 ReadyHeld logs, got %d: %+v", len(logs), logs) + } + if logs[0].OccurrenceIndex != 0 || logs[1].OccurrenceIndex != 0 { + t.Fatalf("expected same occurrence index for blocked/issued pair, got %+v", logs) + } + if !logs[0].BlockedByLowerBound || logs[0].IssuedThisCycle { + t.Fatalf("unexpected blocked log: %+v", logs[0]) + } + if logs[1].BlockedByLowerBound || !logs[1].IssuedThisCycle { + t.Fatalf("unexpected issued log: %+v", logs[1]) + } + if state.OpIssueCount[41] != 1 { + t.Fatalf("expected issued occurrence count to advance to 1, got %d", state.OpIssueCount[41]) + } +} + +func TestRunInstructionGroupWithSyncOpsReadyHeldOccurrenceIndexIncrements(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyElasticScheduled, + } + group := InstructionGroup{Operations: []Operation{{OpCode: "NOP", ID: 42}}} + state := newSyncTraceState(group) + state.Code.DerivedTiming = map[int][]int64{42: {5, 6}} + + logs := captureReadyHeldLogs(t, func() { + state.CurrentCycle = 5 + if !emu.RunInstructionGroupWithSyncOps(group, &state, 5) { + t.Fatalf("expected first issue to make progress") + } + state.CurrentCycle = 6 + if !emu.RunInstructionGroupWithSyncOps(group, &state, 6) { + t.Fatalf("expected second issue to make progress") + } + }) + + if len(logs) != 2 { + t.Fatalf("expected 2 issued ReadyHeld logs, got %d: %+v", len(logs), logs) + } + if logs[0].OccurrenceIndex != 0 || logs[1].OccurrenceIndex != 1 { + t.Fatalf("expected occurrence indexes [0 1], got [%d %d]", logs[0].OccurrenceIndex, logs[1].OccurrenceIndex) + } + if !logs[0].IssuedThisCycle || !logs[1].IssuedThisCycle { + t.Fatalf("expected issued_this_cycle=true for both logs: %+v", logs) + } + if state.OpIssueCount[42] != 2 { + t.Fatalf("expected occurrence count to reach 2, got %d", state.OpIssueCount[42]) + } +} + +func TestLoadProgramFileFromYAMLPreservesTimeStep(t *testing.T) { + filePath := "../test/testbench/stonneGEMM8x8/gemm.yaml" + if _, err := 
os.Stat(filePath); os.IsNotExist(err) { + t.Skipf("test file does not exist: %s", filePath) + } + + programMap := LoadProgramFileFromYAML(filePath) + program, ok := programMap["(0,0)"] + if !ok { + t.Fatalf("core (0,0) not found in parsed program") + } + if len(program.EntryBlocks) == 0 || len(program.EntryBlocks[0].InstructionGroups) < 2 { + t.Fatalf("unexpected program structure for core (0,0)") + } + + group0 := program.EntryBlocks[0].InstructionGroups[0] + if len(group0.Operations) == 0 { + t.Fatalf("group0 has no operations") + } + if group0.Operations[0].TimeStep != 0 { + t.Fatalf("unexpected timestep for first op: got %d want 0", group0.Operations[0].TimeStep) + } + + group1 := program.EntryBlocks[0].InstructionGroups[1] + if len(group1.Operations) == 0 { + t.Fatalf("group1 has no operations") + } + if group1.Operations[0].TimeStep != 1 { + t.Fatalf("unexpected timestep for second group first op: got %d want 1", group1.Operations[0].TimeStep) + } +} diff --git a/core/fifo_buffer_test.go b/core/fifo_buffer_test.go new file mode 100644 index 0000000..32a7559 --- /dev/null +++ b/core/fifo_buffer_test.go @@ -0,0 +1,166 @@ +package core + +import ( + "testing" + + "github.com/sarchlab/zeonica/cgra" +) + +func newFIFOTestState(recvCap, sendCap int) coreState { + state := coreState{ + Directions: map[string]bool{ + "North": true, + "East": true, + "South": true, + "West": true, + "NorthEast": true, + "SouthEast": true, + "SouthWest": true, + "NorthWest": true, + "Router": true, + }, + Mode: SyncOp, + EnableFIFOModel: true, + RecvQueueCapacity: recvCap, + SendQueueCapacity: sendCap, + RecvBufHead: make([][]cgra.Data, 4), + RecvBufHeadReady: make([][]bool, 4), + SendBufHead: make([][]cgra.Data, 4), + SendBufHeadBusy: make([][]bool, 4), + RecvBufQueue: make([][][]cgra.Data, 4), + SendBufQueue: make([][][]cgra.Data, 4), + OpInputReadCache: make(map[string]cgra.Data), + } + for c := 0; c < 4; c++ { + state.RecvBufHead[c] = make([]cgra.Data, 12) + 
state.RecvBufHeadReady[c] = make([]bool, 12) + state.SendBufHead[c] = make([]cgra.Data, 12) + state.SendBufHeadBusy[c] = make([]bool, 12) + state.RecvBufQueue[c] = make([][]cgra.Data, 12) + state.SendBufQueue[c] = make([][]cgra.Data, 12) + for d := 0; d < 12; d++ { + state.RecvBufQueue[c][d] = make([]cgra.Data, 0, recvCap) + state.SendBufQueue[c][d] = make([]cgra.Data, 0, sendCap) + } + } + return state +} + +func TestRecvFIFOOrderAndCapacity(t *testing.T) { + state := newFIFOTestState(2, 2) + emu := instEmulator{} + north := emu.getDirecIndex("North") + + if !state.recvQueuePush(0, north, cgra.NewScalar(11)) { + t.Fatal("expected first recv enqueue to succeed") + } + if !state.recvQueuePush(0, north, cgra.NewScalar(22)) { + t.Fatal("expected second recv enqueue to succeed") + } + if state.recvQueuePush(0, north, cgra.NewScalar(33)) { + t.Fatal("expected recv queue to report full at capacity") + } + + state.OpInputReadCache = make(map[string]cgra.Data) + v1 := emu.readOperand(Operand{Impl: "North", Color: "R"}, &state) + state.OpInputReadCache = make(map[string]cgra.Data) + v2 := emu.readOperand(Operand{Impl: "North", Color: "R"}, &state) + if v1.First() != 11 || v2.First() != 22 { + t.Fatalf("unexpected FIFO order: got (%d,%d), want (11,22)", v1.First(), v2.First()) + } + if state.recvQueueLen(0, north) != 0 { + t.Fatalf("expected recv queue empty after two consumes, got %d", state.recvQueueLen(0, north)) + } +} + +func TestSyncModeDuplicatePortReadConsumesOnce(t *testing.T) { + state := newFIFOTestState(4, 2) + emu := instEmulator{} + north := emu.getDirecIndex("North") + + if !state.recvQueuePush(0, north, cgra.NewScalar(101)) { + t.Fatal("expected first recv enqueue to succeed") + } + if !state.recvQueuePush(0, north, cgra.NewScalar(202)) { + t.Fatal("expected second recv enqueue to succeed") + } + + state.OpInputReadCache = make(map[string]cgra.Data) + v1 := emu.readOperand(Operand{Impl: "North", Color: "R"}, &state) + v2 := emu.readOperand(Operand{Impl: 
"North", Color: "R"}, &state) + + if v1.First() != 101 || v2.First() != 101 { + t.Fatalf("expected duplicate reads to reuse same token, got (%d,%d)", v1.First(), v2.First()) + } + if state.recvQueueLen(0, north) != 1 { + t.Fatalf("expected queue length 1 after duplicate read consume-once, got %d", state.recvQueueLen(0, north)) + } +} + +func TestSendQueueBlocksOnlyWhenFull(t *testing.T) { + state := newFIFOTestState(2, 2) + emu := instEmulator{} + east := emu.getDirecIndex("East") + + if !state.sendQueuePush(0, east, cgra.NewScalar(1)) { + t.Fatal("expected first send enqueue to succeed") + } + + op := Operation{ + OpCode: "MOV", + SrcOperands: OperandList{Operands: []Operand{ + {Impl: "#1", Color: "R"}, + }}, + DstOperands: OperandList{Operands: []Operand{ + {Impl: "East", Color: "R"}, + }}, + } + + ready, reason := emu.checkIssueReadiness(op, &state) + if !ready { + t.Fatalf("expected issue ready when queue has room, got reason=%s", reason) + } + + if !state.sendQueuePush(0, east, cgra.NewScalar(2)) { + t.Fatal("expected second send enqueue to succeed") + } + ready, reason = emu.checkIssueReadiness(op, &state) + if ready || reason != StallReasonOutputBlocked { + t.Fatalf("expected output blocked when queue full, got ready=%v reason=%s", ready, reason) + } +} + +func TestRouterRedKeepsSingleOutstanding(t *testing.T) { + state := newFIFOTestState(2, 8) + router := int(cgra.Router) + + if !state.sendQueuePush(0, router, cgra.NewScalar(7)) { + t.Fatal("expected first router-red enqueue to succeed") + } + if state.sendQueuePush(0, router, cgra.NewScalar(8)) { + t.Fatal("expected router-red second enqueue to fail (single outstanding)") + } +} + +func TestEnableFIFOModelSwitchControlsQueueDepth(t *testing.T) { + emu := instEmulator{} + north := emu.getDirecIndex("North") + + legacy := newFIFOTestState(4, 4) + legacy.EnableFIFOModel = false + if !legacy.recvQueuePush(0, north, cgra.NewScalar(1)) { + t.Fatal("legacy path first recv push should succeed") + } + if 
legacy.recvQueuePush(0, north, cgra.NewScalar(2)) { + t.Fatal("legacy path should stay single-slot regardless of configured depth") + } + + fifo := newFIFOTestState(4, 4) + fifo.EnableFIFOModel = true + if !fifo.recvQueuePush(0, north, cgra.NewScalar(1)) { + t.Fatal("fifo path first recv push should succeed") + } + if !fifo.recvQueuePush(0, north, cgra.NewScalar(2)) { + t.Fatal("fifo path second recv push should succeed when depth > 1") + } +} diff --git a/core/operation_latency.go b/core/operation_latency.go new file mode 100644 index 0000000..2a78ccb --- /dev/null +++ b/core/operation_latency.go @@ -0,0 +1,70 @@ +package core + +import ( + "fmt" + "os" + "strings" + + "gopkg.in/yaml.v3" +) + +const operationLatencyFileEnv = "ZEONICA_OPERATION_LATENCY_FILE" + +type operationLatencySidecar struct { + DefaultLatency int `yaml:"default_latency"` + Opcodes map[string]int `yaml:"opcodes"` +} + +func loadOperationLatencyProfileFromEnv() (map[string]int, int, error) { + path := strings.TrimSpace(os.Getenv(operationLatencyFileEnv)) + if path == "" { + return nil, 1, nil + } + + data, err := os.ReadFile(path) + if err != nil { + return nil, 0, fmt.Errorf("read %s (%s): %w", operationLatencyFileEnv, path, err) + } + + var sidecar operationLatencySidecar + if err := yaml.Unmarshal(data, &sidecar); err != nil { + return nil, 0, fmt.Errorf("parse %s (%s): %w", operationLatencyFileEnv, path, err) + } + + defaultLatency := sidecar.DefaultLatency + if defaultLatency == 0 { + defaultLatency = 1 + } + if defaultLatency <= 0 { + return nil, 0, fmt.Errorf("default_latency must be > 0, got %d", sidecar.DefaultLatency) + } + + normalized := make(map[string]int, len(sidecar.Opcodes)) + for opcode, latency := range sidecar.Opcodes { + key := normalizeLatencyOpcode(opcode) + if key == "" { + return nil, 0, fmt.Errorf("opcode latency entry has empty opcode key") + } + if latency <= 0 { + return nil, 0, fmt.Errorf("opcode %s latency must be > 0, got %d", key, latency) + } + normalized[key] 
= latency + } + + return normalized, defaultLatency, nil +} + +func normalizeLatencyOpcode(opCode string) string { + return strings.ToUpper(strings.TrimSpace(opCode)) +} + +func cloneOperationLatencyMap(src map[string]int) map[string]int { + if len(src) == 0 { + return nil + } + cloned := make(map[string]int, len(src)) + for opcode, latency := range src { + cloned[opcode] = latency + } + return cloned +} diff --git a/core/operation_latency_test.go b/core/operation_latency_test.go new file mode 100644 index 0000000..058a25d --- /dev/null +++ b/core/operation_latency_test.go @@ -0,0 +1,308 @@ +package core + +import ( + "os" + "path/filepath" + "testing" + + "github.com/sarchlab/zeonica/cgra" +) + +func TestLoadOperationLatencyProfileFromEnv(t *testing.T) { + tmpDir := t.TempDir() + path := filepath.Join(tmpDir, "latency.yaml") + content := []byte("default_latency: 3\nopcodes:\n mul: 2\n FMUL: 4\n") + if err := os.WriteFile(path, content, 0o644); err != nil { + t.Fatalf("write latency file: %v", err) + } + + t.Setenv(operationLatencyFileEnv, path) + + opcodes, defaultLatency, err := loadOperationLatencyProfileFromEnv() + if err != nil { + t.Fatalf("load latency profile: %v", err) + } + if defaultLatency != 3 { + t.Fatalf("unexpected default latency: got %d want 3", defaultLatency) + } + if opcodes["MUL"] != 2 { + t.Fatalf("expected normalized MUL latency 2, got %d", opcodes["MUL"]) + } + if opcodes["FMUL"] != 4 { + t.Fatalf("expected FMUL latency 4, got %d", opcodes["FMUL"]) + } +} + +func TestLoadOperationLatencyProfileFromEnvDefaultsToOneWhenUnset(t *testing.T) { + t.Setenv(operationLatencyFileEnv, "") + + opcodes, defaultLatency, err := loadOperationLatencyProfileFromEnv() + if err != nil { + t.Fatalf("load empty latency profile: %v", err) + } + if len(opcodes) != 0 { + t.Fatalf("expected no opcode latencies, got %d", len(opcodes)) + } + if defaultLatency != 1 { + t.Fatalf("unexpected default latency: got %d want 1", defaultLatency) + } +} + +func 
TestLoadOperationLatencyProfileRejectsInvalidLatency(t *testing.T) { + tmpDir := t.TempDir() + path := filepath.Join(tmpDir, "latency.yaml") + content := []byte("default_latency: 1\nopcodes:\n MUL: 0\n") + if err := os.WriteFile(path, content, 0o644); err != nil { + t.Fatalf("write latency file: %v", err) + } + + t.Setenv(operationLatencyFileEnv, path) + + if _, _, err := loadOperationLatencyProfileFromEnv(); err == nil { + t.Fatal("expected invalid latency error") + } +} + +func newLatencyTestState(recvCap, sendCap int, enableFIFO bool) coreState { + state := newFIFOTestState(recvCap, sendCap) + state.EnableFIFOModel = enableFIFO + state.Registers = make([]cgra.Data, 8) + state.OpTimingCursor = make(map[int]int) + state.OpTimingLate = make(map[int]bool) + state.OpTimingRollCycle = make(map[int]int64) + state.OpIssueCount = make(map[int]int) + state.Code = Program{DefaultOperationLatency: 1} + state.SelectedBlock = &EntryBlock{} + state.PCInBlock = 0 + return state +} + +func TestSyncOpcodeLatencyDelaysRegisterWriteback(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyInOrderDataflow, + } + state := newLatencyTestState(4, 4, false) + state.Registers[0] = cgra.NewScalar(5) + state.Code.OperationLatencies = map[string]int{"MUL": 2} + + ig := InstructionGroup{ + Operations: []Operation{ + { + OpCode: "MUL", + ID: 1, + SrcOperands: OperandList{Operands: []Operand{ + {Impl: "$0", Color: "R"}, + {Impl: "#3", Color: "R"}, + }}, + DstOperands: OperandList{Operands: []Operand{ + {Impl: "$1", Color: "R"}, + }}, + }, + }, + } + state.SelectedBlock.InstructionGroups = []InstructionGroup{ig} + + if !emu.RunInstructionGroupWithSyncOps(ig, &state, 0) { + t.Fatal("expected first issue cycle to make progress") + } + if got := state.Registers[1].First(); got != 0 { + t.Fatalf("unexpected early writeback: got %d want 0", got) + } + if state.PendingSyncGroup == nil { + t.Fatal("expected pending sync group after first issue") + } + + 
state.CurrentCycle = 1 + if !emu.RunInstructionGroupWithSyncOps(ig, &state, 1) { + t.Fatal("expected completion cycle to make progress") + } + if got := state.Registers[1].First(); got != 15 { + t.Fatalf("unexpected delayed writeback result: got %d want 15", got) + } + if state.PendingSyncGroup != nil { + t.Fatal("expected pending sync group cleared after commit") + } +} + +func TestSyncOpcodeLatencyDelaysGroupCommitWithDataMov(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyInOrderDataflow, + } + state := newLatencyTestState(4, 4, true) + state.Registers[0] = cgra.NewScalar(5) + state.Code.OperationLatencies = map[string]int{"MUL": 2} + + ig := InstructionGroup{ + Operations: []Operation{ + { + OpCode: "MUL", + ID: 11, + SrcOperands: OperandList{Operands: []Operand{ + {Impl: "$0", Color: "R"}, + {Impl: "#3", Color: "R"}, + }}, + DstOperands: OperandList{Operands: []Operand{ + {Impl: "$1", Color: "R"}, + }}, + }, + { + OpCode: "DATA_MOV", + ID: 12, + SrcOperands: OperandList{Operands: []Operand{ + {Impl: "$1", Color: "R"}, + }}, + DstOperands: OperandList{Operands: []Operand{ + {Impl: "East", Color: "R"}, + }}, + }, + }, + } + state.SelectedBlock.InstructionGroups = []InstructionGroup{ig} + + if !emu.RunInstructionGroupWithSyncOps(ig, &state, 0) { + t.Fatal("expected first issue cycle to make progress") + } + east := emu.getDirecIndex("East") + if state.sendQueueLen(0, east) != 0 { + t.Fatalf("expected no outgoing data before commit, got send queue len %d", state.sendQueueLen(0, east)) + } + if got := state.Registers[1].First(); got != 0 { + t.Fatalf("unexpected early register writeback: got %d want 0", got) + } + + state.CurrentCycle = 1 + if !emu.RunInstructionGroupWithSyncOps(ig, &state, 1) { + t.Fatal("expected completion cycle to make progress") + } + if got := state.Registers[1].First(); got != 15 { + t.Fatalf("unexpected committed register value: got %d want 15", got) + } + if state.sendQueueLen(0, east) != 1 { + 
t.Fatalf("expected delayed DATA_MOV to enqueue once, got len %d", state.sendQueueLen(0, east)) + } + head, ok := state.sendQueuePeek(0, east) + if !ok || head.First() != 15 { + t.Fatalf("unexpected delayed DATA_MOV payload: ok=%v value=%d", ok, head.First()) + } +} + +func TestSyncOpcodeLatencyUsesMaxAcrossGroup(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyInOrderDataflow, + } + state := newLatencyTestState(4, 4, true) + state.Registers[0] = cgra.NewScalar(5) + state.Code.OperationLatencies = map[string]int{ + "ADD": 1, + "MUL": 2, + } + + ig := InstructionGroup{ + Operations: []Operation{ + { + OpCode: "ADD", + ID: 21, + SrcOperands: OperandList{Operands: []Operand{ + {Impl: "$0", Color: "R"}, + {Impl: "#1", Color: "R"}, + }}, + DstOperands: OperandList{Operands: []Operand{ + {Impl: "$1", Color: "R"}, + }}, + }, + { + OpCode: "MUL", + ID: 22, + SrcOperands: OperandList{Operands: []Operand{ + {Impl: "$1", Color: "R"}, + {Impl: "#3", Color: "R"}, + }}, + DstOperands: OperandList{Operands: []Operand{ + {Impl: "$2", Color: "R"}, + }}, + }, + }, + } + state.SelectedBlock.InstructionGroups = []InstructionGroup{ig} + + if !emu.RunInstructionGroupWithSyncOps(ig, &state, 0) { + t.Fatal("expected first issue cycle to make progress") + } + if got := state.Registers[2].First(); got != 0 { + t.Fatalf("unexpected early result writeback: got %d want 0", got) + } + + state.CurrentCycle = 1 + if !emu.RunInstructionGroupWithSyncOps(ig, &state, 1) { + t.Fatal("expected completion cycle to make progress") + } + if got := state.Registers[1].First(); got != 6 { + t.Fatalf("unexpected committed ADD result: got %d want 6", got) + } + if got := state.Registers[2].First(); got != 18 { + t.Fatalf("unexpected committed MUL result: got %d want 18", got) + } +} + +func TestSyncOpcodeLatencyRespectsDerivedTimingIssueCycle(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyStrictTimed, + } + state := 
newLatencyTestState(4, 4, true) + state.Registers[0] = cgra.NewScalar(7) + state.Code.OperationLatencies = map[string]int{"MUL": 2} + state.Code.DerivedTiming = map[int][]int64{ + 31: []int64{5}, + } + + ig := InstructionGroup{ + Operations: []Operation{ + { + OpCode: "MUL", + ID: 31, + SrcOperands: OperandList{Operands: []Operand{ + {Impl: "$0", Color: "R"}, + {Impl: "#2", Color: "R"}, + }}, + DstOperands: OperandList{Operands: []Operand{ + {Impl: "$1", Color: "R"}, + }}, + }, + }, + } + state.SelectedBlock.InstructionGroups = []InstructionGroup{ig} + + state.CurrentCycle = 4 + if !emu.RunInstructionGroupWithSyncOps(ig, &state, 4) { + t.Fatal("expected pre-issue timing wait to keep core alive") + } + if state.PendingSyncGroup != nil { + t.Fatal("did not expect pending sync group before legal issue cycle") + } + if got := state.Registers[1].First(); got != 0 { + t.Fatalf("unexpected result before legal issue cycle: got %d want 0", got) + } + + state.CurrentCycle = 5 + if !emu.RunInstructionGroupWithSyncOps(ig, &state, 5) { + t.Fatal("expected legal issue cycle to make progress") + } + if state.PendingSyncGroup == nil { + t.Fatal("expected pending sync group on legal issue cycle") + } + if got := state.Registers[1].First(); got != 0 { + t.Fatalf("unexpected result on issue cycle before commit: got %d want 0", got) + } + + state.CurrentCycle = 6 + if !emu.RunInstructionGroupWithSyncOps(ig, &state, 6) { + t.Fatal("expected completion cycle to make progress") + } + if got := state.Registers[1].First(); got != 14 { + t.Fatalf("unexpected delayed result after derived-timing issue: got %d want 14", got) + } +} diff --git a/core/program.go b/core/program.go index 4d1db00..57bf6b2 100644 --- a/core/program.go +++ b/core/program.go @@ -3,6 +3,7 @@ package core import ( "fmt" + "log/slog" "os" "regexp" "strconv" @@ -63,8 +64,11 @@ type YAMLRoot struct { // Program is the internal executable representation for one core. 
type Program struct { - EntryBlocks []EntryBlock - CompiledII int + EntryBlocks []EntryBlock + CompiledII int + DerivedTiming map[int][]int64 + OperationLatencies map[string]int + DefaultOperationLatency int } // EntryBlock is one entry block in a core program. @@ -104,6 +108,7 @@ type Operation struct { SrcOperands OperandList ID int // ID from YAML file InvalidIterations int // Invalid iterations from YAML file + TimeStep int // Time step from YAML file } // OperandList wraps source or destination operands for an operation. @@ -135,8 +140,18 @@ func LoadProgramFileFromYAML(programFilePath string) map[string]Program { config := root.ArrayConfig - // Debug: Print the parsed config - fmt.Printf("Debug: Parsed config - Rows: %d, Cols: %d, Cores: %d\n", config.Rows, config.Cols, len(config.Cores)) + derivedTimingByCoord, err := loadDerivedTimingFromEnv() + if err != nil { + panic(fmt.Sprintf("Failed to load timing sidecar: %v", err)) + } + operationLatencies, defaultOperationLatency, err := loadOperationLatencyProfileFromEnv() + if err != nil { + panic(fmt.Sprintf("Failed to load operation latency sidecar: %v", err)) + } + + if DebugEnabled() { + slog.Debug("ParsedProgramConfig", "rows", config.Rows, "cols", config.Cols, "cores", len(config.Cores)) + } // Convert to map[(x,y)]Program programMap := make(map[string]Program) @@ -144,7 +159,9 @@ func LoadProgramFileFromYAML(programFilePath string) map[string]Program { for _, core := range config.Cores { // Create coordinate key coordKey := fmt.Sprintf("(%d,%d)", core.Column, core.Row) - fmt.Printf("Debug: Processing core at %s with %d entries\n", coordKey, len(core.Entries)) + if DebugEnabled() { + slog.Debug("ProcessingProgramCore", "coord", coordKey, "entries", len(core.Entries)) + } // Convert core entries to Program structure var entryBlocks []EntryBlock @@ -192,6 +209,7 @@ func LoadProgramFileFromYAML(programFilePath string) map[string]Program { DstOperands: OperandList{Operands: dstOperands}, ID: yamlOp.ID, 
InvalidIterations: yamlOp.InvalidIterations, + TimeStep: yamlOp.TimeStep, } operations = append(operations, operation) @@ -206,8 +224,11 @@ func LoadProgramFileFromYAML(programFilePath string) map[string]Program { } program := Program{ - EntryBlocks: entryBlocks, - CompiledII: config.CompiledII, + EntryBlocks: entryBlocks, + CompiledII: config.CompiledII, + DerivedTiming: cloneDerivedTimingMap(derivedTimingByCoord[coordKey]), + OperationLatencies: cloneOperationLatencyMap(operationLatencies), + DefaultOperationLatency: defaultOperationLatency, } programMap[coordKey] = program @@ -216,6 +237,20 @@ func LoadProgramFileFromYAML(programFilePath string) map[string]Program { return programMap } +// OperationLatency returns the configured latency for an opcode, defaulting to 1. +func (p Program) OperationLatency(opCode string) int { + normalized := normalizeLatencyOpcode(opCode) + if normalized != "" && len(p.OperationLatencies) > 0 { + if latency, ok := p.OperationLatencies[normalized]; ok && latency > 0 { + return latency + } + } + if p.DefaultOperationLatency > 0 { + return p.DefaultOperationLatency + } + return 1 +} + // splitRespectingBrackets splits a string by delimiter, but respects brackets // so [WEST, RED] is treated as a single token func splitRespectingBrackets(s, delimiter string) []string { diff --git a/core/queue_watch.go b/core/queue_watch.go new file mode 100644 index 0000000..732ecaa --- /dev/null +++ b/core/queue_watch.go @@ -0,0 +1,145 @@ +package core + +import ( + "fmt" + "strings" + + "github.com/sarchlab/zeonica/cgra" +) + +// QueueWatchSpec declares one queue to sample for occupancy reporting. 
+type QueueWatchSpec struct { + Label string `json:"label" yaml:"label"` + X int `json:"x" yaml:"x"` + Y int `json:"y" yaml:"y"` + Kind string `json:"kind" yaml:"kind"` + Direction string `json:"direction" yaml:"direction"` + Color string `json:"color" yaml:"color"` +} + +type resolvedQueueWatch struct { + Label string + X int + Y int + Kind string + Direction string + DirectionIdx int + Color string + ColorIdx int +} + +// ValidateQueueWatchSpecs checks queue watch definitions before runtime build. +func ValidateQueueWatchSpecs(specs []QueueWatchSpec) error { + _, err := resolveQueueWatchSpecs(specs) + return err +} + +func resolveQueueWatchSpecs(specs []QueueWatchSpec) ([]resolvedQueueWatch, error) { + if len(specs) == 0 { + return nil, nil + } + + resolved := make([]resolvedQueueWatch, 0, len(specs)) + for idx, spec := range specs { + watch, err := resolveQueueWatchSpec(spec) + if err != nil { + return nil, fmt.Errorf("queue watch[%d]: %w", idx, err) + } + resolved = append(resolved, watch) + } + return resolved, nil +} + +func matchingQueueWatchesForTile(enabled bool, queueWatches []resolvedQueueWatch, x, y int) []resolvedQueueWatch { + if !enabled || len(queueWatches) == 0 { + return nil + } + + var matched []resolvedQueueWatch + for _, watch := range queueWatches { + if watch.X == x && watch.Y == y { + matched = append(matched, watch) + } + } + return matched +} + +func cloneQueueWatches(input []resolvedQueueWatch) []resolvedQueueWatch { + if len(input) == 0 { + return nil + } + out := make([]resolvedQueueWatch, len(input)) + copy(out, input) + return out +} + +func resolveQueueWatchSpec(spec QueueWatchSpec) (resolvedQueueWatch, error) { + kind := strings.ToLower(strings.TrimSpace(spec.Kind)) + if kind != "recv" && kind != "send" { + return resolvedQueueWatch{}, fmt.Errorf("invalid kind %q", spec.Kind) + } + + directionIdx, directionName, err := resolveQueueWatchDirection(spec.Direction) + if err != nil { + return resolvedQueueWatch{}, err + } + + colorIdx, 
colorName, err := resolveQueueWatchColor(spec.Color) + if err != nil { + return resolvedQueueWatch{}, err + } + + label := strings.TrimSpace(spec.Label) + if label == "" { + label = fmt.Sprintf("%s(%d,%d).%s.%s", kind, spec.X, spec.Y, directionName, colorName) + } + + return resolvedQueueWatch{ + Label: label, + X: spec.X, + Y: spec.Y, + Kind: kind, + Direction: directionName, + DirectionIdx: directionIdx, + Color: colorName, + ColorIdx: colorIdx, + }, nil +} + +func resolveQueueWatchDirection(raw string) (int, string, error) { + switch strings.ToLower(strings.TrimSpace(raw)) { + case "north": + return int(cgra.North), cgra.North.Name(), nil + case "east": + return int(cgra.East), cgra.East.Name(), nil + case "south": + return int(cgra.South), cgra.South.Name(), nil + case "west": + return int(cgra.West), cgra.West.Name(), nil + case "northeast": + return int(cgra.NorthEast), cgra.NorthEast.Name(), nil + case "northwest": + return int(cgra.NorthWest), cgra.NorthWest.Name(), nil + case "southeast": + return int(cgra.SouthEast), cgra.SouthEast.Name(), nil + case "southwest": + return int(cgra.SouthWest), cgra.SouthWest.Name(), nil + case "router": + return int(cgra.Router), cgra.Router.Name(), nil + default: + return 0, "", fmt.Errorf("invalid direction %q", raw) + } +} + +func resolveQueueWatchColor(raw string) (int, string, error) { + switch strings.ToUpper(strings.TrimSpace(raw)) { + case "R", "RED": + return 0, "RED", nil + case "Y", "YELLOW": + return 1, "YELLOW", nil + case "B", "BLUE": + return 2, "BLUE", nil + default: + return 0, "", fmt.Errorf("invalid color %q", raw) + } +} diff --git a/core/two_phase_switch_test.go b/core/two_phase_switch_test.go new file mode 100644 index 0000000..020ce20 --- /dev/null +++ b/core/two_phase_switch_test.go @@ -0,0 +1,83 @@ +package core + +import ( + "testing" + + "github.com/sarchlab/zeonica/cgra" +) + +func TestSyncTwoPhaseNoPartialCommitOnStall(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: 
ExecutionPolicyInOrderDataflow, + } + state := newFIFOTestState(4, 4) + state.Mode = SyncOp + state.EnableFIFOModel = true + state.Registers = make([]cgra.Data, 8) + state.Registers[0] = cgra.NewScalar(9) + + ig := InstructionGroup{ + Operations: []Operation{ + { + OpCode: "MOV", + SrcOperands: OperandList{Operands: []Operand{ + {Impl: "#1", Color: "R"}, + }}, + DstOperands: OperandList{Operands: []Operand{ + {Impl: "$0", Color: "R"}, + }}, + }, + { + OpCode: "MOV", + SrcOperands: OperandList{Operands: []Operand{ + {Impl: "North", Color: "R"}, + }}, + DstOperands: OperandList{Operands: []Operand{ + {Impl: "$1", Color: "R"}, + }}, + }, + }, + } + + run := emu.RunInstructionGroupWithSyncOps(ig, &state, 0) + if run { + t.Fatal("expected instruction group to stall on missing North operand") + } + if got := state.Registers[0].First(); got != 9 { + t.Fatalf("expected no partial commit on stall, got register0=%d want 9", got) + } +} + +func TestSyncTwoPhaseCommitOnSuccess(t *testing.T) { + emu := instEmulator{ + CareFlags: true, + ExecutionPolicy: ExecutionPolicyInOrderDataflow, + } + state := newFIFOTestState(4, 4) + state.Mode = SyncOp + state.EnableFIFOModel = true + state.Registers = make([]cgra.Data, 8) + + ig := InstructionGroup{ + Operations: []Operation{ + { + OpCode: "MOV", + SrcOperands: OperandList{Operands: []Operand{ + {Impl: "#7", Color: "R"}, + }}, + DstOperands: OperandList{Operands: []Operand{ + {Impl: "$0", Color: "R"}, + }}, + }, + }, + } + + run := emu.RunInstructionGroupWithSyncOps(ig, &state, 0) + if !run { + t.Fatal("expected instruction group to run successfully") + } + if got := state.Registers[0].First(); got != 7 { + t.Fatalf("unexpected committed register value: got %d want 7", got) + } +} diff --git a/core/util.go b/core/util.go index 1e3644c..a65a9e5 100644 --- a/core/util.go +++ b/core/util.go @@ -4,6 +4,8 @@ import ( "context" "fmt" "log/slog" + "sync/atomic" + "time" "github.com/jedib0t/go-pretty/v6/table" ) @@ -11,15 +13,270 @@ import ( 
const ( // PrintToggle enables verbose state table printing in debugging. PrintToggle = false - // LevelTrace is a custom trace level above info. - LevelTrace slog.Level = slog.LevelInfo + 1 + // LevelTrace is a custom trace level below debug/info. + LevelTrace slog.Level = slog.LevelDebug - 4 ) +// TraceObservation captures the subset of a trace event needed for report generation. +type TraceObservation struct { + WallTime time.Time + Msg string + Behavior string + Time *float64 + X *int + Y *int + Src string + Dst string + From string + To string + Label string + Kind string + Direction string + Color string + Occupancy *int + Capacity *int +} + +var traceEnabled atomic.Bool +var traceObserver func(TraceObservation) + +func init() { + traceEnabled.Store(true) +} + +// SetTraceEnabled controls whether trace events are written to the slog trace handler. +func SetTraceEnabled(enabled bool) { + traceEnabled.Store(enabled) +} + +// TraceEnabled reports whether trace output is enabled. +func TraceEnabled() bool { + return traceEnabled.Load() +} + +// DebugEnabled reports whether debug logging is enabled on the default logger. +func DebugEnabled() bool { + return slog.Default().Enabled(context.Background(), slog.LevelDebug) +} + +// SetTraceObserver registers a report observer for trace events. +func SetTraceObserver(observer func(TraceObservation)) { + traceObserver = observer +} + // Trace writes a trace-level structured log record. func Trace(msg string, args ...any) { + if traceObserver != nil { + if observation, valid := buildTraceObservation(msg, args...); valid { + traceObserver(observation) + } + } + if !TraceEnabled() { + return + } slog.Log(context.Background(), LevelTrace, msg, args...) } +// ObserveDataFlow records a dataflow event for report generation without emitting trace output. 
+func ObserveDataFlow(behavior string, timeValue float64, from, to, src, dst string) { + observeTrace(TraceObservation{ + WallTime: time.Now(), + Msg: "DataFlow", + Behavior: behavior, + Time: float64Ptr(timeValue), + From: from, + To: to, + Src: src, + Dst: dst, + }) +} + +// ObserveMemory records a memory event for report generation without emitting trace output. +func ObserveMemory(behavior string, timeValue float64, x, y int, src, dst string) { + observeTrace(TraceObservation{ + WallTime: time.Now(), + Msg: "Memory", + Behavior: behavior, + Time: float64Ptr(timeValue), + X: intPtr(x), + Y: intPtr(y), + Src: src, + Dst: dst, + }) +} + +// ObserveInst records an instruction event for report generation without emitting trace output. +func ObserveInst(timeValue float64, x, y int) { + observeTrace(TraceObservation{ + WallTime: time.Now(), + Msg: "Inst", + Time: float64Ptr(timeValue), + X: intPtr(x), + Y: intPtr(y), + }) +} + +// ObserveBackpressure records a backpressure event for report generation without emitting trace output. +func ObserveBackpressure(timeValue float64, x, y int) { + observeTrace(TraceObservation{ + WallTime: time.Now(), + Msg: "Backpressure", + Time: float64Ptr(timeValue), + X: intPtr(x), + Y: intPtr(y), + }) +} + +// ObserveQueue records a watched queue occupancy sample for report generation. 
+func ObserveQueue(label, kind string, timeValue float64, x, y int, direction, color string, occupancy, capacity int) { + observeTrace(TraceObservation{ + WallTime: time.Now(), + Msg: "Queue", + Behavior: "sample", + Time: float64Ptr(timeValue), + X: intPtr(x), + Y: intPtr(y), + Label: label, + Kind: kind, + Direction: direction, + Color: color, + Occupancy: intPtr(occupancy), + Capacity: intPtr(capacity), + }) +} + +func observeTrace(observation TraceObservation) { + if traceObserver != nil { + traceObserver(observation) + } +} + +//nolint:gocyclo +func buildTraceObservation(msg string, args ...any) (TraceObservation, bool) { + observation := TraceObservation{ + WallTime: time.Now(), + Msg: msg, + } + if msg != "Inst" && msg != "Memory" && msg != "DataFlow" && msg != "Backpressure" && msg != "Stall" && msg != "Queue" { + return observation, false + } + + for i := 0; i < len(args); i++ { + switch value := args[i].(type) { + case slog.Attr: + assignObservationField(&observation, value.Key, value.Value.Any()) + case string: + if i+1 >= len(args) { + continue + } + assignObservationField(&observation, value, args[i+1]) + i++ + } + } + + return observation, true +} + +//nolint:gocyclo +func assignObservationField(observation *TraceObservation, key string, value any) { + switch key { + case "Behavior": + observation.Behavior = fmt.Sprint(value) + case "Time": + if converted, ok := toFloat64(value); ok { + observation.Time = float64Ptr(converted) + } + case "X": + if converted, ok := toInt(value); ok { + observation.X = intPtr(converted) + } + case "Y": + if converted, ok := toInt(value); ok { + observation.Y = intPtr(converted) + } + case "Src": + observation.Src = fmt.Sprint(value) + case "Dst": + observation.Dst = fmt.Sprint(value) + case "From": + observation.From = fmt.Sprint(value) + case "To": + observation.To = fmt.Sprint(value) + case "Label": + observation.Label = fmt.Sprint(value) + case "Kind": + observation.Kind = fmt.Sprint(value) + case "Direction": + 
observation.Direction = fmt.Sprint(value) + case "Color": + observation.Color = fmt.Sprint(value) + case "Occupancy": + if converted, ok := toInt(value); ok { + observation.Occupancy = intPtr(converted) + } + case "Capacity": + if converted, ok := toInt(value); ok { + observation.Capacity = intPtr(converted) + } + } +} + +func toFloat64(value any) (float64, bool) { + switch typed := value.(type) { + case float64: + return typed, true + case float32: + return float64(typed), true + case int: + return float64(typed), true + case int64: + return float64(typed), true + case int32: + return float64(typed), true + case uint32: + return float64(typed), true + case uint64: + return float64(typed), true + default: + return 0, false + } +} + +func toInt(value any) (int, bool) { + switch typed := value.(type) { + case int: + return typed, true + case int32: + return int(typed), true + case int64: + return int(typed), true + case uint32: + return int(typed), true + case uint64: + return int(typed), true + default: + return 0, false + } +} + +func intPtr(value int) *int { + ptr := new(int) + *ptr = value + return ptr +} + +func float64Ptr(value float64) *float64 { + ptr := new(float64) + *ptr = value + return ptr +} + +func int64Ptr(value int64) *int64 { + ptr := new(int64) + *ptr = value + return ptr +} + // PrintState prints a formatted snapshot of core runtime state. 
// //nolint:gocyclo,funlen diff --git a/report/observer_test.go b/report/observer_test.go new file mode 100644 index 0000000..46c19b7 --- /dev/null +++ b/report/observer_test.go @@ -0,0 +1,123 @@ +package report + +import ( + "encoding/json" + "os" + "path/filepath" + "reflect" + "testing" + "time" + + "github.com/sarchlab/zeonica/core" +) + +//nolint:funlen +func TestObserverBuildMatchesGenerateFromLog(t *testing.T) { + logPath := filepath.Join(t.TempDir(), "trace.json.log") + ts0 := time.Date(2026, 3, 6, 0, 0, 0, 0, time.UTC) + ts1 := ts0.Add(10 * time.Millisecond) + ts2 := ts1.Add(10 * time.Millisecond) + ts3 := ts2.Add(10 * time.Millisecond) + + event0 := traceEvent{ + Timestamp: ts0.Format(time.RFC3339Nano), + Msg: "DataFlow", + Behavior: "FeedIn", + Time: testFloat64Ptr(0), + To: "Device.Tile[0][0].Core.West", + } + event1 := traceEvent{ + Timestamp: ts1.Format(time.RFC3339Nano), + Msg: "Inst", + Time: testFloat64Ptr(0), + X: testIntPtr(0), + Y: testIntPtr(0), + } + event2 := traceEvent{ + Timestamp: ts2.Format(time.RFC3339Nano), + Msg: "Backpressure", + Time: testFloat64Ptr(0), + X: testIntPtr(0), + Y: testIntPtr(0), + } + event3 := traceEvent{ + Timestamp: ts3.Format(time.RFC3339Nano), + Msg: "Stall", + Behavior: "schedule_bubble", + Time: testFloat64Ptr(1), + X: testIntPtr(0), + Y: testIntPtr(0), + } + + file, err := os.Create(logPath) + if err != nil { + t.Fatalf("Create returned error: %v", err) + } + for _, event := range []traceEvent{event0, event1, event2, event3} { + payload, err := json.Marshal(event) + if err != nil { + t.Fatalf("Marshal returned error: %v", err) + } + if _, err := file.Write(append(payload, '\n')); err != nil { + t.Fatalf("Write returned error: %v", err) + } + } + _ = file.Close() + + opts := GenerateOptions{ + TestName: "observer-test", + LogPath: logPath, + GridWidth: 1, + GridHeight: 1, + TopN: 5, + } + + fromLog, err := GenerateFromLog(opts) + if err != nil { + t.Fatalf("GenerateFromLog returned error: %v", err) + } + + 
observer := NewObserver() + observer.Observe(core.TraceObservation{ + WallTime: ts0, + Msg: "DataFlow", + Behavior: "FeedIn", + Time: testFloat64Ptr(0), + To: "Device.Tile[0][0].Core.West", + }) + observer.Observe(core.TraceObservation{ + WallTime: ts1, + Msg: "Inst", + Time: testFloat64Ptr(0), + X: testIntPtr(0), + Y: testIntPtr(0), + }) + observer.Observe(core.TraceObservation{ + WallTime: ts2, + Msg: "Backpressure", + Time: testFloat64Ptr(0), + X: testIntPtr(0), + Y: testIntPtr(0), + }) + observer.Observe(core.TraceObservation{ + WallTime: ts3, + Msg: "Stall", + Behavior: "schedule_bubble", + Time: testFloat64Ptr(1), + X: testIntPtr(0), + Y: testIntPtr(0), + }) + + fromObserver := observer.Build(opts) + if !reflect.DeepEqual(fromLog, fromObserver) { + t.Fatalf("expected observer report to match log report\nfrom log: %#v\nfrom observer: %#v", fromLog, fromObserver) + } +} + +func testIntPtr(v int) *int { + return &v +} + +func testFloat64Ptr(v float64) *float64 { + return &v +} diff --git a/report/report.go b/report/report.go index 5b142b0..883ba01 100644 --- a/report/report.go +++ b/report/report.go @@ -10,6 +10,9 @@ import ( "os" "regexp" "sort" + "time" + + "github.com/sarchlab/zeonica/core" ) // GenerateOptions controls report generation behavior from a trace log. @@ -25,22 +28,33 @@ type GenerateOptions struct { // Report is the aggregate execution summary derived from a trace log. 
type Report struct { - TestName string `json:"testName,omitempty"` - LogPath string `json:"logPath"` - Grid GridInfo `json:"grid"` - TotalCycles int64 `json:"totalCycles"` - ActiveCycles int64 `json:"activeCyclesGlobal"` - IdleCycles int64 `json:"idleCyclesGlobal"` - Passed *bool `json:"passed,omitempty"` - MismatchCount *int `json:"mismatchCount,omitempty"` - InstCount int64 `json:"instCount"` - SendCount int64 `json:"sendCount"` - RecvCount int64 `json:"recvCount"` - MemoryCount int64 `json:"memoryCount"` - TotalEvents int64 `json:"totalEvents"` - ActiveTileCount int `json:"activeTileCount"` - Tiles []TileStats `json:"tiles"` - TopHotTiles []TopHotTile `json:"topHotTiles"` + TestName string `json:"testName,omitempty"` + LogPath string `json:"logPath"` + Grid GridInfo `json:"grid"` + TotalCycles int64 `json:"totalCycles"` + ActiveCycles int64 `json:"activeCyclesGlobal"` + IdleCycles int64 `json:"idleCyclesGlobal"` + Passed *bool `json:"passed,omitempty"` + MismatchCount *int `json:"mismatchCount,omitempty"` + InstCount int64 `json:"instCount"` + SendCount int64 `json:"sendCount"` + RecvCount int64 `json:"recvCount"` + MemoryCount int64 `json:"memoryCount"` + TotalEvents int64 `json:"totalEvents"` + WallClockDurationSec float64 `json:"wallClockDurationSec"` + InstThroughputPerCycle float64 `json:"instThroughputPerCycle"` + EventThroughputPerCycle float64 `json:"eventThroughputPerCycle"` + InstThroughputPerSec float64 `json:"instThroughputPerSec"` + BackpressureCount int64 `json:"backpressureCount"` + BackpressureCycles int64 `json:"backpressureCycles"` + ScheduleBubbleStallCount int64 `json:"scheduleBubbleStallCount"` + OperandWaitStallCount int64 `json:"operandWaitStallCount"` + OutputBlockedStallCount int64 `json:"outputBlockedStallCount"` + ActiveTileCount int `json:"activeTileCount"` + Tiles []TileStats `json:"tiles"` + TopHotTiles []TopHotTile `json:"topHotTiles"` + TopBackpressureTiles []TopBackpressureTile `json:"topBackpressureTiles"` + WatchedQueues 
[]QueueStats `json:"watchedQueues,omitempty"` } // GridInfo describes the grid size used by the workload. @@ -51,16 +65,20 @@ type GridInfo struct { // TileStats stores per-tile metrics in the generated report. type TileStats struct { - X int `json:"x"` - Y int `json:"y"` - Coord string `json:"coord"` - ActiveCycles int64 `json:"activeCycles"` - UtilizationPct float64 `json:"utilizationPct"` - InstCount int64 `json:"instCount"` - SendCount int64 `json:"sendCount"` - RecvCount int64 `json:"recvCount"` - MemoryCount int64 `json:"memoryCount"` - TotalEvents int64 `json:"totalEvents"` + X int `json:"x"` + Y int `json:"y"` + Coord string `json:"coord"` + ActiveCycles int64 `json:"activeCycles"` + UtilizationPct float64 `json:"utilizationPct"` + InstCount int64 `json:"instCount"` + SendCount int64 `json:"sendCount"` + RecvCount int64 `json:"recvCount"` + MemoryCount int64 `json:"memoryCount"` + TotalEvents int64 `json:"totalEvents"` + BackpressureCount int64 `json:"backpressureCount"` + ScheduleBubbleStallCount int64 `json:"scheduleBubbleStallCount"` + OperandWaitStallCount int64 `json:"operandWaitStallCount"` + OutputBlockedStallCount int64 `json:"outputBlockedStallCount"` } // TopHotTile is a ranked hot tile summary entry. @@ -73,6 +91,30 @@ type TopHotTile struct { TotalEvents int64 `json:"totalEvents"` } +// TopBackpressureTile is a ranked backpressure hot tile entry. +type TopBackpressureTile struct { + X int `json:"x"` + Y int `json:"y"` + Coord string `json:"coord"` + BackpressureCount int64 `json:"backpressureCount"` +} + +// QueueStats stores aggregated occupancy metrics for one watched queue. 
+type QueueStats struct { + Label string `json:"label"` + Kind string `json:"kind"` + X int `json:"x"` + Y int `json:"y"` + Coord string `json:"coord"` + Direction string `json:"direction"` + Color string `json:"color"` + Capacity int `json:"capacity"` + SampleCount int64 `json:"sampleCount"` + AvgOccupancy float64 `json:"avgOccupancy"` + PeakOccupancy int `json:"peakOccupancy"` + AvgUtilizationPct float64 `json:"avgUtilizationPct"` +} + type traceEvent struct { Timestamp string `json:"time"` Msg string `json:"msg"` @@ -84,6 +126,12 @@ type traceEvent struct { Dst string `json:"Dst"` From string `json:"From"` To string `json:"To"` + Label string `json:"Label"` + Kind string `json:"Kind"` + Direction string `json:"Direction"` + Color string `json:"Color"` + Occupancy *int `json:"Occupancy"` + Capacity *int `json:"Capacity"` } type tileCoord struct { @@ -92,95 +140,221 @@ type tileCoord struct { } type tileAccumulator struct { - cycles map[int64]struct{} - instCount int64 - sendCount int64 - recvCount int64 - memoryCount int64 - totalEvents int64 + cycles map[int64]struct{} + backpressureCycles map[int64]struct{} + instCount int64 + sendCount int64 + recvCount int64 + memoryCount int64 + totalEvents int64 + backpressureCount int64 + scheduleBubbleStallCount int64 + operandWaitStallCount int64 + outputBlockedStallCount int64 +} + +type queueKey struct { + label string + x int + y int + kind string + direction string + color string +} + +type queueAccumulator struct { + capacity int + sampleCount int64 + occupancySum int64 + peakOccupancy int +} + +type collector struct { + tileData map[tileCoord]*tileAccumulator + queueData map[queueKey]*queueAccumulator + globalCycleSet map[int64]struct{} + globalBackpressureCycles map[int64]struct{} + maxCycle int64 + maxX int + maxY int + globalBackpressureCount int64 + minWallTS *time.Time + maxWallTS *time.Time +} + +// Observer collects report statistics directly from runtime trace observations. 
+type Observer struct { + collector *collector } var tileEndpointPattern = regexp.MustCompile(`Device\.Tile\[(\d+)\]\[(\d+)\]\.Core\.`) -// GenerateFromLog builds a report by parsing a JSON trace log. -// -//nolint:gocyclo,funlen -func GenerateFromLog(opts GenerateOptions) (Report, error) { - if opts.LogPath == "" { - return Report{}, fmt.Errorf("log path is required") +// NewObserver creates a report observer for runtime trace events. +func NewObserver() *Observer { + return &Observer{ + collector: newCollector(), } +} - topN := opts.TopN - if topN <= 0 { - topN = 5 +func newCollector() *collector { + return &collector{ + tileData: make(map[tileCoord]*tileAccumulator), + queueData: make(map[queueKey]*queueAccumulator), + globalCycleSet: make(map[int64]struct{}), + globalBackpressureCycles: make(map[int64]struct{}), + maxCycle: -1, + maxX: -1, + maxY: -1, } +} - file, err := os.Open(opts.LogPath) - if err != nil { - return Report{}, fmt.Errorf("open log file: %w", err) +// Observe records a runtime trace observation into the in-memory report collector. +func (o *Observer) Observe(observation core.TraceObservation) { + if o == nil || o.collector == nil { + return + } + + event := traceEvent{ + Timestamp: observation.WallTime.Format(time.RFC3339Nano), + Msg: observation.Msg, + Behavior: observation.Behavior, + Time: observation.Time, + X: observation.X, + Y: observation.Y, + Src: observation.Src, + Dst: observation.Dst, + From: observation.From, + To: observation.To, + Label: observation.Label, + Kind: observation.Kind, + Direction: observation.Direction, + Color: observation.Color, + Occupancy: observation.Occupancy, + Capacity: observation.Capacity, + } + o.collector.observe(event) +} + +// Build materializes a Report using the collected runtime events. 
+func (o *Observer) Build(opts GenerateOptions) Report { + if o == nil || o.collector == nil { + return Report{ + TestName: opts.TestName, + LogPath: opts.LogPath, + Grid: GridInfo{ + Width: opts.GridWidth, + Height: opts.GridHeight, + }, + Passed: opts.Passed, + MismatchCount: opts.MismatchCount, + } } - defer func() { _ = file.Close() }() + return o.collector.build(opts) +} - tileData := make(map[tileCoord]*tileAccumulator) - globalCycleSet := make(map[int64]struct{}) +//nolint:gocyclo +func (c *collector) observe(event traceEvent) { + if ts, err := time.Parse(time.RFC3339Nano, event.Timestamp); err == nil { + if c.minWallTS == nil || ts.Before(*c.minWallTS) { + t := ts + c.minWallTS = &t + } + if c.maxWallTS == nil || ts.After(*c.maxWallTS) { + t := ts + c.maxWallTS = &t + } + } - var maxCycle int64 = -1 - maxX, maxY := -1, -1 + cycle, hasCycle := parseCycle(event.Time) + if hasCycle && cycle > c.maxCycle { + c.maxCycle = cycle + } - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Bytes() - if len(line) == 0 { - continue - } + if event.Msg == "Queue" { + c.observeQueue(event) + return + } - var event traceEvent - if err := json.Unmarshal(line, &event); err != nil { - continue - } + coord, ok := resolveTileCoord(event) + if !ok { + return + } - coord, ok := resolveTileCoord(event) - if !ok { - continue - } + if coord.x > c.maxX { + c.maxX = coord.x + } + if coord.y > c.maxY { + c.maxY = coord.y + } - if coord.x > maxX { - maxX = coord.x - } - if coord.y > maxY { - maxY = coord.y + acc, exists := c.tileData[coord] + if !exists { + acc = &tileAccumulator{ + cycles: make(map[int64]struct{}), + backpressureCycles: make(map[int64]struct{}), } + c.tileData[coord] = acc + } - acc, exists := tileData[coord] - if !exists { - acc = &tileAccumulator{ - cycles: make(map[int64]struct{}), - } - tileData[coord] = acc + isBackpressureEvent := event.Msg == "Backpressure" + if hasCycle && !isBackpressureEvent { + acc.cycles[cycle] = struct{}{} + 
c.globalCycleSet[cycle] = struct{}{} + if cycle > c.maxCycle { + c.maxCycle = cycle } + } - cycle, hasCycle := parseCycle(event.Time) + if classifyAndCount(event, acc, cycle, hasCycle) { + c.globalBackpressureCount++ if hasCycle { - acc.cycles[cycle] = struct{}{} - globalCycleSet[cycle] = struct{}{} - if cycle > maxCycle { - maxCycle = cycle - } + c.globalBackpressureCycles[cycle] = struct{}{} } + } +} - classifyAndCount(event, acc) +func (c *collector) observeQueue(event traceEvent) { + if event.X == nil || event.Y == nil || event.Occupancy == nil { + return } - if err := scanner.Err(); err != nil { - return Report{}, fmt.Errorf("scan log file: %w", err) + key := queueKey{ + label: event.Label, + x: *event.X, + y: *event.Y, + kind: event.Kind, + direction: event.Direction, + color: event.Color, + } + + acc, exists := c.queueData[key] + if !exists { + acc = &queueAccumulator{} + c.queueData[key] = acc + } + if event.Capacity != nil && *event.Capacity > 0 { + acc.capacity = *event.Capacity + } + acc.sampleCount++ + acc.occupancySum += int64(*event.Occupancy) + if *event.Occupancy > acc.peakOccupancy { + acc.peakOccupancy = *event.Occupancy + } +} + +//nolint:gocyclo,funlen +func (c *collector) build(opts GenerateOptions) Report { + topN := opts.TopN + if topN <= 0 { + topN = 5 } totalCycles := int64(0) - if maxCycle >= 0 { - totalCycles = maxCycle + 1 + if c.maxCycle >= 0 { + totalCycles = c.maxCycle + 1 } - activeCycles := int64(len(globalCycleSet)) + activeCycles := int64(len(c.globalCycleSet)) idleCycles := totalCycles - activeCycles if idleCycles < 0 { idleCycles = 0 @@ -188,11 +362,11 @@ func GenerateFromLog(opts GenerateOptions) (Report, error) { width := opts.GridWidth if width <= 0 { - width = maxX + 1 + width = c.maxX + 1 } height := opts.GridHeight if height <= 0 { - height = maxY + 1 + height = c.maxY + 1 } if width < 0 { width = 0 @@ -201,8 +375,8 @@ func GenerateFromLog(opts GenerateOptions) (Report, error) { height = 0 } - tiles := make([]TileStats, 0, 
len(tileData)) - for coord, acc := range tileData { + tiles := make([]TileStats, 0, len(c.tileData)) + for coord, acc := range c.tileData { activeTileCycles := int64(len(acc.cycles)) util := 0.0 if totalCycles > 0 { @@ -210,16 +384,20 @@ func GenerateFromLog(opts GenerateOptions) (Report, error) { } tiles = append(tiles, TileStats{ - X: coord.x, - Y: coord.y, - Coord: formatCoord(coord.x, coord.y), - ActiveCycles: activeTileCycles, - UtilizationPct: util, - InstCount: acc.instCount, - SendCount: acc.sendCount, - RecvCount: acc.recvCount, - MemoryCount: acc.memoryCount, - TotalEvents: acc.totalEvents, + X: coord.x, + Y: coord.y, + Coord: formatCoord(coord.x, coord.y), + ActiveCycles: activeTileCycles, + UtilizationPct: util, + InstCount: acc.instCount, + SendCount: acc.sendCount, + RecvCount: acc.recvCount, + MemoryCount: acc.memoryCount, + TotalEvents: acc.totalEvents, + BackpressureCount: acc.backpressureCount, + ScheduleBubbleStallCount: acc.scheduleBubbleStallCount, + OperandWaitStallCount: acc.operandWaitStallCount, + OutputBlockedStallCount: acc.outputBlockedStallCount, }) } @@ -235,6 +413,9 @@ func GenerateFromLog(opts GenerateOptions) (Report, error) { var recvTotal int64 var memoryTotal int64 var eventTotal int64 + var scheduleBubbleStallTotal int64 + var operandWaitStallTotal int64 + var outputBlockedStallTotal int64 for _, tile := range tiles { instTotal += tile.InstCount @@ -242,30 +423,99 @@ func GenerateFromLog(opts GenerateOptions) (Report, error) { recvTotal += tile.RecvCount memoryTotal += tile.MemoryCount eventTotal += tile.TotalEvents + scheduleBubbleStallTotal += tile.ScheduleBubbleStallCount + operandWaitStallTotal += tile.OperandWaitStallCount + outputBlockedStallTotal += tile.OutputBlockedStallCount } topHotTiles := buildTopHotTiles(tiles, topN) + topBackpressureTiles := buildTopBackpressureTiles(tiles, topN) + watchedQueues := buildQueueStats(c.queueData) + wallClockDurationSec := 0.0 + if c.minWallTS != nil && c.maxWallTS != nil { + d := 
c.maxWallTS.Sub(*c.minWallTS).Seconds() + if d > 0 { + wallClockDurationSec = d + } + } + instThroughputPerCycle := 0.0 + eventThroughputPerCycle := 0.0 + if totalCycles > 0 { + instThroughputPerCycle = float64(instTotal) / float64(totalCycles) + eventThroughputPerCycle = float64(eventTotal) / float64(totalCycles) + } + instThroughputPerSec := 0.0 + if wallClockDurationSec > 0 { + instThroughputPerSec = float64(instTotal) / wallClockDurationSec + } + + return Report{ + TestName: opts.TestName, + LogPath: opts.LogPath, + Grid: GridInfo{Width: width, Height: height}, + TotalCycles: totalCycles, + ActiveCycles: activeCycles, + IdleCycles: idleCycles, + Passed: opts.Passed, + MismatchCount: opts.MismatchCount, + InstCount: instTotal, + SendCount: sendTotal, + RecvCount: recvTotal, + MemoryCount: memoryTotal, + TotalEvents: eventTotal, + WallClockDurationSec: wallClockDurationSec, + InstThroughputPerCycle: instThroughputPerCycle, + EventThroughputPerCycle: eventThroughputPerCycle, + InstThroughputPerSec: instThroughputPerSec, + BackpressureCount: c.globalBackpressureCount, + BackpressureCycles: int64(len(c.globalBackpressureCycles)), + ScheduleBubbleStallCount: scheduleBubbleStallTotal, + OperandWaitStallCount: operandWaitStallTotal, + OutputBlockedStallCount: outputBlockedStallTotal, + ActiveTileCount: len(tiles), + Tiles: tiles, + TopHotTiles: topHotTiles, + TopBackpressureTiles: topBackpressureTiles, + WatchedQueues: watchedQueues, + } +} - report := Report{ - TestName: opts.TestName, - LogPath: opts.LogPath, - Grid: GridInfo{Width: width, Height: height}, - TotalCycles: totalCycles, - ActiveCycles: activeCycles, - IdleCycles: idleCycles, - Passed: opts.Passed, - MismatchCount: opts.MismatchCount, - InstCount: instTotal, - SendCount: sendTotal, - RecvCount: recvTotal, - MemoryCount: memoryTotal, - TotalEvents: eventTotal, - ActiveTileCount: len(tiles), - Tiles: tiles, - TopHotTiles: topHotTiles, +// GenerateFromLog builds a report by parsing a JSON trace log. 
+// +//nolint:gocyclo,funlen +func GenerateFromLog(opts GenerateOptions) (Report, error) { + if opts.LogPath == "" { + return Report{}, fmt.Errorf("log path is required") + } + + file, err := os.Open(opts.LogPath) + if err != nil { + return Report{}, fmt.Errorf("open log file: %w", err) + } + defer func() { _ = file.Close() }() + + collector := newCollector() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Bytes() + if len(line) == 0 { + continue + } + + var event traceEvent + if err := json.Unmarshal(line, &event); err != nil { + continue + } + + collector.observe(event) + } + + if err := scanner.Err(); err != nil { + return Report{}, fmt.Errorf("scan log file: %w", err) } - return report, nil + return collector.build(opts), nil } // SaveJSON writes a report as pretty-printed JSON. @@ -288,6 +538,8 @@ func PrintSummary(report Report) { } // PrintSummaryToWriter prints a compact report summary to the writer. +// +//nolint:funlen func PrintSummaryToWriter(report Report, w io.Writer) { fmt.Fprintln(w, "========================") fmt.Fprintln(w, "Zeonica Report Summary") @@ -298,6 +550,22 @@ func PrintSummaryToWriter(report Report, w io.Writer) { fmt.Fprintf(w, "cycles: total=%d active=%d idle=%d\n", report.TotalCycles, report.ActiveCycles, report.IdleCycles) fmt.Fprintf(w, "events: total=%d inst=%d send=%d recv=%d memory=%d\n", report.TotalEvents, report.InstCount, report.SendCount, report.RecvCount, report.MemoryCount) + fmt.Fprintf(w, "simulation time: wall=%.3fs\n", report.WallClockDurationSec) + fmt.Fprintf( + w, + "throughput: inst/cycle=%.4f events/cycle=%.4f inst/s=%.2f\n", + report.InstThroughputPerCycle, + report.EventThroughputPerCycle, + report.InstThroughputPerSec, + ) + fmt.Fprintf(w, "backpressure: count=%d cycles=%d\n", report.BackpressureCount, report.BackpressureCycles) + fmt.Fprintf( + w, + "stall breakdown: schedule_bubble=%d operand_wait=%d output_blocked=%d\n", + report.ScheduleBubbleStallCount, + 
report.OperandWaitStallCount, + report.OutputBlockedStallCount, + ) fmt.Fprintf(w, "active tiles: %d\n", report.ActiveTileCount) if report.Passed != nil { fmt.Fprintf(w, "passed: %t\n", *report.Passed) @@ -313,9 +581,35 @@ func PrintSummaryToWriter(report Report, w io.Writer) { idx+1, tile.Coord, tile.UtilizationPct, tile.ActiveCycles, tile.TotalEvents) } } + if len(report.TopBackpressureTiles) > 0 { + fmt.Fprintln(w, "top backpressure tiles:") + for idx, tile := range report.TopBackpressureTiles { + fmt.Fprintf(w, " %d) %s bp=%d\n", idx+1, tile.Coord, tile.BackpressureCount) + } + } + if len(report.WatchedQueues) > 0 { + fmt.Fprintln(w, "watched queues:") + for idx, queue := range report.WatchedQueues { + fmt.Fprintf( + w, + " %d) %s %s %s/%s avg=%.2f peak=%d cap=%d util=%.2f%% samples=%d\n", + idx+1, + queue.Coord, + queue.Label, + queue.Direction, + queue.Color, + queue.AvgOccupancy, + queue.PeakOccupancy, + queue.Capacity, + queue.AvgUtilizationPct, + queue.SampleCount, + ) + } + } } -func classifyAndCount(event traceEvent, acc *tileAccumulator) { +//nolint:gocyclo +func classifyAndCount(event traceEvent, acc *tileAccumulator, cycle int64, hasCycle bool) bool { switch event.Msg { case "Inst": acc.instCount++ @@ -331,7 +625,27 @@ func classifyAndCount(event traceEvent, acc *tileAccumulator) { acc.recvCount++ } acc.totalEvents++ + case "Backpressure": + acc.backpressureCount++ + if hasCycle { + acc.backpressureCycles[cycle] = struct{}{} + } + return true + case "Stall": + switch event.Behavior { + case "schedule_bubble": + acc.scheduleBubbleStallCount++ + case "operand_wait": + acc.operandWaitStallCount++ + case "output_blocked": + acc.outputBlockedStallCount++ + } + acc.totalEvents++ + case "Queue": + // Queue samples are aggregated separately in watchedQueues and should not + // inflate existing event throughput counters. 
} + return false } func resolveTileCoord(event traceEvent) (tileCoord, bool) { @@ -379,13 +693,14 @@ func parseTileFromEndpoint(endpoint string) (tileCoord, bool) { return tileCoord{}, false } - var x int - var y int - if _, err := fmt.Sscanf(matches[0], "Device.Tile[%d][%d].Core.", &x, &y); err != nil { + var row int + var col int + if _, err := fmt.Sscanf(matches[0], "Device.Tile[%d][%d].Core.", &row, &col); err != nil { return tileCoord{}, false } - return tileCoord{x: x, y: y}, true + // Endpoint naming is Tile[row][col], while report coordinates are (x=col, y=row). + return tileCoord{x: col, y: row}, true } func parseCycle(timeValue *float64) (int64, bool) { @@ -438,6 +753,86 @@ func buildTopHotTiles(tiles []TileStats, topN int) []TopHotTile { return out } +func buildQueueStats(queueData map[queueKey]*queueAccumulator) []QueueStats { + if len(queueData) == 0 { + return nil + } + + stats := make([]QueueStats, 0, len(queueData)) + for key, acc := range queueData { + avgOccupancy := 0.0 + if acc.sampleCount > 0 { + avgOccupancy = float64(acc.occupancySum) / float64(acc.sampleCount) + } + avgUtilizationPct := 0.0 + if acc.capacity > 0 { + avgUtilizationPct = avgOccupancy * 100.0 / float64(acc.capacity) + } + stats = append(stats, QueueStats{ + Label: key.label, + Kind: key.kind, + X: key.x, + Y: key.y, + Coord: formatCoord(key.x, key.y), + Direction: key.direction, + Color: key.color, + Capacity: acc.capacity, + SampleCount: acc.sampleCount, + AvgOccupancy: avgOccupancy, + PeakOccupancy: acc.peakOccupancy, + AvgUtilizationPct: avgUtilizationPct, + }) + } + + sort.Slice(stats, func(i, j int) bool { + if stats[i].Y != stats[j].Y { + return stats[i].Y < stats[j].Y + } + if stats[i].X != stats[j].X { + return stats[i].X < stats[j].X + } + if stats[i].Direction != stats[j].Direction { + return stats[i].Direction < stats[j].Direction + } + return stats[i].Label < stats[j].Label + }) + + return stats +} + +func buildTopBackpressureTiles(tiles []TileStats, topN int) 
[]TopBackpressureTile { + if len(tiles) == 0 || topN <= 0 { + return nil + } + tmp := make([]TileStats, len(tiles)) + copy(tmp, tiles) + sort.Slice(tmp, func(i, j int) bool { + if tmp[i].BackpressureCount != tmp[j].BackpressureCount { + return tmp[i].BackpressureCount > tmp[j].BackpressureCount + } + if tmp[i].Y != tmp[j].Y { + return tmp[i].Y < tmp[j].Y + } + return tmp[i].X < tmp[j].X + }) + if topN > len(tmp) { + topN = len(tmp) + } + out := make([]TopBackpressureTile, 0, topN) + for i := 0; i < topN; i++ { + if tmp[i].BackpressureCount <= 0 { + continue + } + out = append(out, TopBackpressureTile{ + X: tmp[i].X, + Y: tmp[i].Y, + Coord: tmp[i].Coord, + BackpressureCount: tmp[i].BackpressureCount, + }) + } + return out +} + func formatCoord(x, y int) string { return fmt.Sprintf("(%d,%d)", x, y) } diff --git a/runtimecfg/disable-trace.report.json b/runtimecfg/disable-trace.report.json new file mode 100644 index 0000000..0cba9de --- /dev/null +++ b/runtimecfg/disable-trace.report.json @@ -0,0 +1,57 @@ +{ + "testName": "disable-trace", + "logPath": "/tmp/TestInitTraceLoggerDisableTraceCreatesEmptyLogAndReport326622658/001/trace.log", + "grid": { + "width": 1, + "height": 1 + }, + "totalCycles": 1, + "activeCyclesGlobal": 1, + "idleCyclesGlobal": 0, + "passed": true, + "mismatchCount": 0, + "instCount": 1, + "sendCount": 0, + "recvCount": 0, + "memoryCount": 0, + "totalEvents": 1, + "wallClockDurationSec": 0, + "instThroughputPerCycle": 1, + "eventThroughputPerCycle": 1, + "instThroughputPerSec": 0, + "backpressureCount": 0, + "backpressureCycles": 0, + "scheduleBubbleStallCount": 0, + "operandWaitStallCount": 0, + "outputBlockedStallCount": 0, + "activeTileCount": 1, + "tiles": [ + { + "x": 0, + "y": 0, + "coord": "(0,0)", + "activeCycles": 1, + "utilizationPct": 100, + "instCount": 1, + "sendCount": 0, + "recvCount": 0, + "memoryCount": 0, + "totalEvents": 1, + "backpressureCount": 0, + "scheduleBubbleStallCount": 0, + "operandWaitStallCount": 0, + 
"outputBlockedStallCount": 0 + } + ], + "topHotTiles": [ + { + "x": 0, + "y": 0, + "coord": "(0,0)", + "utilizationPct": 100, + "activeCycles": 1, + "totalEvents": 1 + } + ], + "topBackpressureTiles": [] +} \ No newline at end of file diff --git a/runtimecfg/enable_trace_test.go b/runtimecfg/enable_trace_test.go new file mode 100644 index 0000000..c44f143 --- /dev/null +++ b/runtimecfg/enable_trace_test.go @@ -0,0 +1,123 @@ +package runtimecfg + +import ( + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/sarchlab/zeonica/core" + "github.com/sarchlab/zeonica/report" +) + +func TestResolveEnableTraceDefaultsFalse(t *testing.T) { + cfg, err := Resolve(ArchSpec{}, "enable-trace-default") + if err != nil { + t.Fatalf("Resolve returned error: %v", err) + } + if cfg.EnableTrace { + t.Fatal("expected enableTrace default to be false") + } +} + +func TestResolveEnableTraceTrue(t *testing.T) { + enabled := true + cfg, err := Resolve(ArchSpec{ + Simulator: Simulator{ + Logging: SimulatorLogging{ + EnableTrace: &enabled, + }, + }, + }, "enable-trace-true") + if err != nil { + t.Fatalf("Resolve returned error: %v", err) + } + if !cfg.EnableTrace { + t.Fatal("expected enableTrace to be true") + } +} + +func TestInitTraceLoggerDisableTraceCreatesEmptyLogAndReport(t *testing.T) { + logPath := filepath.Join(t.TempDir(), "trace.log") + rt := &Runtime{ + Config: ResolvedConfig{ + TestName: "disable-trace", + Rows: 1, + Columns: 1, + LoggingEnabled: true, + EnableTrace: false, + LogPath: logPath, + }, + Observer: report.NewObserver(), + } + + traceLog, err := rt.InitTraceLogger(core.LevelTrace) + if err != nil { + t.Fatalf("InitTraceLogger returned error: %v", err) + } + + core.Trace("Inst", "Time", float64(0), "X", 0, "Y", 0) + + if err := CloseTraceLog(traceLog); err != nil { + t.Fatalf("CloseTraceLog returned error: %v", err) + } + + info, err := os.Stat(logPath) + if err != nil { + t.Fatalf("Stat returned error: %v", err) + } + if info.Size() != 0 { + 
t.Fatalf("expected empty trace log when enableTrace=false, got %d bytes", info.Size()) + } + + passed := true + mismatch := 0 + reportPath, err := rt.GenerateSaveAndPrintReport(5, &passed, &mismatch) + if err != nil { + t.Fatalf("GenerateSaveAndPrintReport returned error: %v", err) + } + + content, err := os.ReadFile(reportPath) + if err != nil { + t.Fatalf("ReadFile returned error: %v", err) + } + if !strings.Contains(string(content), `"logPath": "`+logPath+`"`) { + t.Fatalf("expected report to preserve logPath %q, got %s", logPath, string(content)) + } +} + +func TestInitTraceLoggerEnableTraceWritesEvents(t *testing.T) { + logPath := filepath.Join(t.TempDir(), "trace.log") + rt := &Runtime{ + Config: ResolvedConfig{ + TestName: "enable-trace", + Rows: 1, + Columns: 1, + LoggingEnabled: true, + EnableTrace: true, + LogPath: logPath, + }, + Observer: report.NewObserver(), + } + + traceLog, err := rt.InitTraceLogger(core.LevelTrace) + if err != nil { + t.Fatalf("InitTraceLogger returned error: %v", err) + } + + core.Trace("Inst", "Time", float64(1), "X", 0, "Y", 0) + time.Sleep(5 * time.Millisecond) + + if err := CloseTraceLog(traceLog); err != nil { + t.Fatalf("CloseTraceLog returned error: %v", err) + } + + info, err := os.Stat(logPath) + if err != nil { + t.Fatalf("Stat returned error: %v", err) + } + if info.Size() == 0 { + t.Fatal("expected non-empty trace log when enableTrace=true") + } +} diff --git a/runtimecfg/report.go b/runtimecfg/report.go index e32d46d..a45daa0 100644 --- a/runtimecfg/report.go +++ b/runtimecfg/report.go @@ -10,9 +10,6 @@ const defaultTopN = 5 // BuildReportOptions builds report options from resolved runtime configuration. 
func (r *Runtime) BuildReportOptions(topN int, passed *bool, mismatchCount *int) (report.GenerateOptions, error) { - if !r.Config.LoggingEnabled { - return report.GenerateOptions{}, fmt.Errorf("logging is disabled, cannot build report options from trace log") - } if r.Config.LogPath == "" { return report.GenerateOptions{}, fmt.Errorf("log path is empty, cannot build report options") } @@ -43,9 +40,14 @@ func (r *Runtime) GenerateAndSaveReport(topN int, passed *bool, mismatchCount *i return report.Report{}, "", err } - result, err := report.GenerateFromLog(opts) - if err != nil { - return report.Report{}, "", fmt.Errorf("generate report from log: %w", err) + var result report.Report + if r.Observer != nil { + result = r.Observer.Build(opts) + } else { + result, err = report.GenerateFromLog(opts) + if err != nil { + return report.Report{}, "", fmt.Errorf("generate report from log: %w", err) + } } reportPath := r.DefaultReportPath() diff --git a/runtimecfg/runtime.go b/runtimecfg/runtime.go index 1e744d6..49db08f 100644 --- a/runtimecfg/runtime.go +++ b/runtimecfg/runtime.go @@ -1,9 +1,11 @@ package runtimecfg import ( + "context" "fmt" "log/slog" "os" + "path/filepath" "regexp" "strconv" "strings" @@ -12,32 +14,72 @@ import ( "github.com/sarchlab/zeonica/api" "github.com/sarchlab/zeonica/cgra" "github.com/sarchlab/zeonica/config" + "github.com/sarchlab/zeonica/core" + "github.com/sarchlab/zeonica/report" ) const ( - defaultRows = 4 - defaultColumns = 4 - defaultExecutionModel = "serial" - defaultDriverName = "Driver" - defaultDeviceName = "Device" - defaultLogTemplate = ".json.log" + defaultRows = 4 + defaultColumns = 4 + defaultExecutionModel = "serial" + defaultExecutionPolicy = "in_order_dataflow" + defaultStrictMaxSlip = int64(4) + defaultStrictFail = false + defaultEnableFIFOModel = false + defaultEnableQueueWatches = false + defaultDriverName = "Driver" + defaultDeviceName = "Device" + defaultLogTemplate = ".json.log" + + defaultDriverPortIncomingBufferDepth = 
1 + defaultDriverPortOutgoingBufferDepth = 1 + defaultCorePortIncomingBufferDepth = 1 + defaultCorePortOutgoingBufferDepth = 1 + defaultNumRegisters = 64 + defaultLocalMemoryWords = 1024 + defaultMemoryMode = "simple" + defaultLinkLatency = 1 + defaultLinkBandwidth = 32 + linkTimingModelParseOnly = "parse_only" ) var freqPattern = regexp.MustCompile(`^([0-9]+)\s*(ghz|mhz|khz|hz)$`) // ResolvedConfig is the executable runtime configuration after defaults/resolution. type ResolvedConfig struct { - TestName string - Rows int - Columns int - ExecutionModel string - DriverName string - DriverFreq sim.Freq - DeviceName string - DeviceFreq sim.Freq - BindToArchitecture bool - LoggingEnabled bool - LogPath string + TestName string + Rows int + Columns int + ExecutionModel string + ExecutionPolicy string + StrictMaxSlip int64 + StrictFailOnViolation bool + EnableFIFOModel bool + EnableQueueWatches bool + DriverName string + DriverFreq sim.Freq + DeviceName string + DeviceFreq sim.Freq + BindToArchitecture bool + LoggingEnabled bool + EnableTrace bool + LogPath string + + DriverPortIncomingBufferDepth int + DriverPortOutgoingBufferDepth int + CorePortIncomingBufferDepth int + CorePortOutgoingBufferDepth int + NumRegisters int + LocalMemoryWords int + MemoryMode string + MemoryShare map[[2]int]int + LinkLatency int + LinkBandwidth int + LinkTimingModel string + ProgramYAML string + ReportName string + QueueWatches []core.QueueWatchSpec + BufferSweepDepths []int } // BuildOverrides allows optional size override when not binding to architecture. @@ -54,6 +96,7 @@ type Runtime struct { Engine sim.Engine Driver api.Driver Device cgra.Device + Observer *report.Observer } // LoadRuntime loads arch spec, resolves config, and builds runtime objects. 
@@ -63,7 +106,7 @@ func LoadRuntime(specPath, testName string) (*Runtime, error) { return nil, err } - cfg, err := Resolve(spec, testName) + cfg, err := ResolveWithSpecPath(spec, specPath, testName) if err != nil { return nil, err } @@ -78,16 +121,79 @@ func LoadRuntime(specPath, testName string) (*Runtime, error) { } // Resolve resolves defaults and validates runtime values from ArchSpec. +// +//nolint:gocyclo,funlen func Resolve(spec ArchSpec, testName string) (ResolvedConfig, error) { + return ResolveWithSpecPath(spec, "", testName) +} + +// ResolveWithSpecPath resolves defaults and validates runtime values from ArchSpec, +// using specPath to resolve case2 relative paths when available. +// +//nolint:gocyclo,funlen +func ResolveWithSpecPath(spec ArchSpec, specPath, testName string) (ResolvedConfig, error) { + programYAML := resolveSpecRelativePath(specPath, spec.Simulator.ProgramYAML) + reportName := strings.TrimSpace(spec.Simulator.ReportName) + queueWatches := append([]core.QueueWatchSpec(nil), spec.Simulator.QueueWatches...) 
+ bufferSweepDepths, err := resolveBufferSweepDepths(spec.Simulator.BufferSweepDepths) + if err != nil { + return ResolvedConfig{}, err + } + if err := core.ValidateQueueWatchSpecs(queueWatches); err != nil { + return ResolvedConfig{}, fmt.Errorf("simulator.queue_watches: %w", err) + } + + effectiveTestName := strings.TrimSpace(testName) + if effectiveTestName == "" && reportName != "" { + effectiveTestName = reportName + } + resolved := ResolvedConfig{ - TestName: normalizeTestName(testName), - Rows: defaultOrPositive(spec.CGRADefaults.Rows, defaultRows), - Columns: defaultOrPositive(spec.CGRADefaults.Columns, defaultColumns), - ExecutionModel: defaultOrString(spec.Simulator.ExecutionModel, defaultExecutionModel), - DriverName: defaultOrString(spec.Simulator.Driver.Name, defaultDriverName), - DeviceName: defaultOrString(spec.Simulator.Device.Name, defaultDeviceName), - BindToArchitecture: defaultOrBool(spec.Simulator.Device.BindToArchitecture, true), - LoggingEnabled: defaultOrBool(spec.Simulator.Logging.Enabled, true), + TestName: normalizeTestName(effectiveTestName), + Rows: defaultOrPositive(spec.CGRADefaults.Rows, defaultRows), + Columns: defaultOrPositive(spec.CGRADefaults.Columns, defaultColumns), + ExecutionModel: defaultOrString(spec.Simulator.ExecutionModel, defaultExecutionModel), + ExecutionPolicy: defaultOrString(spec.Simulator.ExecutionPolicy, defaultExecutionPolicy), + EnableFIFOModel: defaultOrBool(spec.Simulator.EnableFIFOModel, defaultEnableFIFOModel), + EnableQueueWatches: defaultOrBool(spec.Simulator.EnableQueueWatches, defaultEnableQueueWatches), + StrictMaxSlip: defaultOrInt64(spec.Simulator.StrictMaxSlip, defaultStrictMaxSlip), + StrictFailOnViolation: defaultOrBool(spec.Simulator.StrictFailOnViolation, defaultStrictFail), + DriverName: defaultOrString(spec.Simulator.Driver.Name, defaultDriverName), + DeviceName: defaultOrString(spec.Simulator.Device.Name, defaultDeviceName), + BindToArchitecture: 
defaultOrBool(spec.Simulator.Device.BindToArchitecture, true), + LoggingEnabled: defaultOrBool(spec.Simulator.Logging.Enabled, true), + EnableTrace: defaultOrBool(spec.Simulator.Logging.EnableTrace, false), + LinkTimingModel: linkTimingModelParseOnly, + DriverPortIncomingBufferDepth: defaultDriverPortIncomingBufferDepth, + DriverPortOutgoingBufferDepth: defaultDriverPortOutgoingBufferDepth, + CorePortIncomingBufferDepth: defaultCorePortIncomingBufferDepth, + CorePortOutgoingBufferDepth: defaultCorePortOutgoingBufferDepth, + NumRegisters: defaultNumRegisters, + LocalMemoryWords: defaultLocalMemoryWords, + MemoryMode: defaultMemoryMode, + LinkLatency: defaultLinkLatency, + LinkBandwidth: defaultLinkBandwidth, + ProgramYAML: programYAML, + ReportName: reportName, + QueueWatches: queueWatches, + BufferSweepDepths: bufferSweepDepths, + } + + normalizedPolicy, err := normalizeExecutionPolicy(resolved.ExecutionPolicy) + if err != nil { + return ResolvedConfig{}, err + } + resolved.ExecutionPolicy = normalizedPolicy + + if envSlip, ok, err := parseInt64Env("ZEONICA_STRICT_MAX_SLIP"); err != nil { + return ResolvedConfig{}, err + } else if ok { + resolved.StrictMaxSlip = envSlip + } + if envFail, ok, err := parseBoolEnv("ZEONICA_STRICT_FAIL_ON_VIOLATION"); err != nil { + return ResolvedConfig{}, err + } else if ok { + resolved.StrictFailOnViolation = envFail } driverFreq, err := parseFrequency(spec.Simulator.Driver.Frequency, 1*sim.GHz) @@ -105,6 +211,87 @@ func Resolve(spec ArchSpec, testName string) (ResolvedConfig, error) { logTemplate := defaultOrString(spec.Simulator.Logging.File, defaultLogTemplate) resolved.LogPath = resolveLogPath(logTemplate, resolved.TestName) + resolved.DriverPortIncomingBufferDepth, err = resolvePositivePtr( + spec.Simulator.Driver.PortIncomingBufferDepth, + defaultDriverPortIncomingBufferDepth, + "simulator.driver.port_incoming_buffer_depth", + ) + if err != nil { + return ResolvedConfig{}, err + } + resolved.DriverPortOutgoingBufferDepth, err 
= resolvePositivePtr( + spec.Simulator.Driver.PortOutgoingBufferDepth, + defaultDriverPortOutgoingBufferDepth, + "simulator.driver.port_outgoing_buffer_depth", + ) + if err != nil { + return ResolvedConfig{}, err + } + resolved.CorePortIncomingBufferDepth, err = resolvePositivePtr( + spec.Simulator.Device.PortIncomingBufferDepth, + defaultCorePortIncomingBufferDepth, + "simulator.device.port_incoming_buffer_depth", + ) + if err != nil { + return ResolvedConfig{}, err + } + resolved.CorePortOutgoingBufferDepth, err = resolvePositivePtr( + spec.Simulator.Device.PortOutgoingBufferDepth, + defaultCorePortOutgoingBufferDepth, + "simulator.device.port_outgoing_buffer_depth", + ) + if err != nil { + return ResolvedConfig{}, err + } + + resolved.NumRegisters, err = resolvePositive( + spec.TileDefaults.NumRegisters, + defaultNumRegisters, + "tile_defaults.num_registers", + ) + if err != nil { + return ResolvedConfig{}, err + } + resolved.LocalMemoryWords, err = resolvePositive( + spec.TileDefaults.LocalMemoryWords, + defaultLocalMemoryWords, + "tile_defaults.local_memory_words", + ) + if err != nil { + return ResolvedConfig{}, err + } + + resolved.MemoryMode, err = normalizeMemoryMode(defaultOrString(spec.Simulator.Device.MemoryMode, defaultMemoryMode)) + if err != nil { + return ResolvedConfig{}, err + } + resolved.MemoryShare, err = resolveMemoryShare( + resolved.MemoryMode, + resolved.Rows, + resolved.Columns, + spec.Simulator.Device.MemoryShare, + ) + if err != nil { + return ResolvedConfig{}, err + } + + resolved.LinkLatency, err = resolveNonNegativePtr( + spec.LinkDefaults.Latency, + defaultLinkLatency, + "link_defaults.latency", + ) + if err != nil { + return ResolvedConfig{}, err + } + resolved.LinkBandwidth, err = resolvePositivePtr( + spec.LinkDefaults.Bandwidth, + defaultLinkBandwidth, + "link_defaults.bandwidth", + ) + if err != nil { + return ResolvedConfig{}, err + } + return resolved, nil } @@ -133,6 +320,7 @@ func BuildRuntime(cfg ResolvedConfig, overrides 
*BuildOverrides) (*Runtime, erro driver := api.DriverBuilder{}. WithEngine(engine). WithFreq(cfg.DriverFreq). + WithPortBufferDepth(cfg.DriverPortIncomingBufferDepth, cfg.DriverPortOutgoingBufferDepth). Build(cfg.DriverName) device := config.DeviceBuilder{}. @@ -140,33 +328,91 @@ func BuildRuntime(cfg ResolvedConfig, overrides *BuildOverrides) (*Runtime, erro WithFreq(cfg.DeviceFreq). WithWidth(width). WithHeight(height). + WithExecutionPolicy(cfg.ExecutionPolicy). + WithStrictTimingConfig(cfg.StrictMaxSlip, cfg.StrictFailOnViolation). + WithMemoryMode(cfg.MemoryMode). + WithMemoryShare(cfg.MemoryShare). + WithCorePortBufferDepth(cfg.CorePortIncomingBufferDepth, cfg.CorePortOutgoingBufferDepth). + WithEnableFIFOModel(cfg.EnableFIFOModel). + WithEnableQueueWatches(cfg.EnableQueueWatches). + WithQueueWatches(cfg.QueueWatches). + WithRegisterCount(cfg.NumRegisters). + WithLocalMemoryWords(cfg.LocalMemoryWords). Build(cfg.DeviceName) + if cfg.LinkTimingModel == linkTimingModelParseOnly { + slog.Info( + "link_defaults parsed in parse-only mode", + "latency", cfg.LinkLatency, + "bandwidth", cfg.LinkBandwidth, + ) + } + driver.RegisterDevice(device) return &Runtime{ - Config: cfg, - Engine: engine, - Driver: driver, - Device: device, + Config: cfg, + Engine: engine, + Driver: driver, + Device: device, + Observer: report.NewObserver(), }, nil } -// InitTraceLogger initializes the default slog JSON trace logger. 
-func (r *Runtime) InitTraceLogger(level slog.Leveler) (*os.File, error) { - if !r.Config.LoggingEnabled { +func resolveSpecRelativePath(specPath, target string) string { + trimmedTarget := strings.TrimSpace(target) + if trimmedTarget == "" { + return "" + } + cleanTarget := filepath.Clean(trimmedTarget) + if filepath.IsAbs(cleanTarget) || strings.TrimSpace(specPath) == "" { + return cleanTarget + } + return filepath.Clean(filepath.Join(filepath.Dir(specPath), cleanTarget)) +} + +func resolveBufferSweepDepths(input []int) ([]int, error) { + if len(input) == 0 { return nil, nil } + depths := make([]int, 0, len(input)) + for idx, depth := range input { + if depth <= 0 { + return nil, fmt.Errorf("simulator.buffer_sweep_depths[%d] must be > 0", idx) + } + depths = append(depths, depth) + } + return depths, nil +} +// InitTraceLogger initializes the default slog JSON trace logger. +func (r *Runtime) InitTraceLogger(level slog.Leveler) (*os.File, error) { file, err := os.Create(r.Config.LogPath) if err != nil { return nil, fmt.Errorf("create trace log file: %w", err) } - handler := slog.NewJSONHandler(file, &slog.HandlerOptions{ + core.SetTraceObserver(nil) + if r.Observer != nil { + core.SetTraceObserver(r.Observer.Observe) + } + core.SetTraceEnabled(r.Config.EnableTrace) + + if !r.Config.LoggingEnabled || !r.Config.EnableTrace { + stdoutHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ + Level: slog.LevelError, + }) + slog.SetDefault(slog.New(stdoutHandler)) + return file, nil + } + + traceHandler := slog.NewJSONHandler(file, &slog.HandlerOptions{ Level: level, }) - slog.SetDefault(slog.New(handler)) + stdoutHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ + Level: slog.LevelError, + }) + slog.SetDefault(slog.New(newTeeHandler(stdoutHandler, traceHandler))) return file, nil } @@ -206,6 +452,13 @@ func defaultOrBool(value *bool, fallback bool) bool { return *value } +func defaultOrInt64(value *int64, fallback int64) int64 { + if value == nil 
{ + return fallback + } + return *value +} + func normalizeTestName(testName string) string { trimmed := strings.TrimSpace(testName) if trimmed == "" { @@ -254,3 +507,177 @@ func parseFrequency(input string, fallback sim.Freq) (sim.Freq, error) { return 0, fmt.Errorf("unsupported frequency unit %q", matches[2]) } } + +func parseInt64Env(name string) (int64, bool, error) { + raw, exists := os.LookupEnv(name) + if !exists { + return 0, false, nil + } + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return 0, false, nil + } + value, err := strconv.ParseInt(trimmed, 10, 64) + if err != nil { + return 0, false, fmt.Errorf("invalid %s=%q: %w", name, raw, err) + } + return value, true, nil +} + +func parseBoolEnv(name string) (bool, bool, error) { + raw, exists := os.LookupEnv(name) + if !exists { + return false, false, nil + } + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return false, false, nil + } + value, err := strconv.ParseBool(trimmed) + if err != nil { + return false, false, fmt.Errorf("invalid %s=%q: %w", name, raw, err) + } + return value, true, nil +} + +func normalizeExecutionPolicy(input string) (string, error) { + text := strings.ToLower(strings.TrimSpace(input)) + switch text { + case "", "in_order_dataflow", "in-order-dataflow", "dynamic": + return "in_order_dataflow", nil + case "elastic_scheduled", "elastic-scheduled", "hybrid": + return "elastic_scheduled", nil + case "strict_timed", "strict-timed", "static": + return "strict_timed", nil + default: + return "", fmt.Errorf( + "unsupported execution_policy %q (supported: strict_timed, elastic_scheduled, in_order_dataflow)", + input, + ) + } +} + +func normalizeMemoryMode(input string) (string, error) { + text := strings.ToLower(strings.TrimSpace(input)) + switch text { + case "", "simple": + return "simple", nil + case "shared": + return "shared", nil + case "local": + return "local", nil + default: + return "", fmt.Errorf("unsupported memory_mode %q (supported: simple, shared, 
local)", input) + } +} + +func resolvePositive(value, fallback int, field string) (int, error) { + if value == 0 { + return fallback, nil + } + if value < 0 { + return 0, fmt.Errorf("%s must be > 0, got %d", field, value) + } + return value, nil +} + +func resolvePositivePtr(value *int, fallback int, field string) (int, error) { + if value == nil { + return fallback, nil + } + if *value <= 0 { + return 0, fmt.Errorf("%s must be > 0, got %d", field, *value) + } + return *value, nil +} + +func resolveNonNegativePtr(value *int, fallback int, field string) (int, error) { + if value == nil { + return fallback, nil + } + if *value < 0 { + return 0, fmt.Errorf("%s must be >= 0, got %d", field, *value) + } + return *value, nil +} + +func resolveMemoryShare(mode string, rows, cols int, entries []MemoryShareEntry) (map[[2]int]int, error) { + if mode != "shared" { + return nil, nil + } + + share := make(map[[2]int]int, rows*cols) + for y := 0; y < rows; y++ { + for x := 0; x < cols; x++ { + share[[2]int{x, y}] = 0 + } + } + + for _, entry := range entries { + if entry.TileX < 0 || entry.TileX >= cols || entry.TileY < 0 || entry.TileY >= rows { + return nil, fmt.Errorf( + "simulator.device.memory_share has out-of-range tile (%d,%d) for grid %dx%d", + entry.TileX, + entry.TileY, + cols, + rows, + ) + } + if entry.Group < 0 { + return nil, fmt.Errorf("simulator.device.memory_share group must be >= 0, got %d", entry.Group) + } + share[[2]int{entry.TileX, entry.TileY}] = entry.Group + } + return share, nil +} + +type teeHandler struct { + handlers []slog.Handler +} + +func newTeeHandler(handlers ...slog.Handler) slog.Handler { + cleaned := make([]slog.Handler, 0, len(handlers)) + for _, handler := range handlers { + if handler != nil { + cleaned = append(cleaned, handler) + } + } + return &teeHandler{handlers: cleaned} +} + +func (h *teeHandler) Enabled(ctx context.Context, level slog.Level) bool { + for _, handler := range h.handlers { + if handler.Enabled(ctx, level) { + return 
true + } + } + return false +} + +func (h *teeHandler) Handle(ctx context.Context, record slog.Record) error { + for _, handler := range h.handlers { + if !handler.Enabled(ctx, record.Level) { + continue + } + if err := handler.Handle(ctx, record.Clone()); err != nil { + return err + } + } + return nil +} + +func (h *teeHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + next := make([]slog.Handler, 0, len(h.handlers)) + for _, handler := range h.handlers { + next = append(next, handler.WithAttrs(attrs)) + } + return &teeHandler{handlers: next} +} + +func (h *teeHandler) WithGroup(name string) slog.Handler { + next := make([]slog.Handler, 0, len(h.handlers)) + for _, handler := range h.handlers { + next = append(next, handler.WithGroup(name)) + } + return &teeHandler{handlers: next} +} diff --git a/runtimecfg/runtime_test.go b/runtimecfg/runtime_test.go new file mode 100644 index 0000000..108c5c3 --- /dev/null +++ b/runtimecfg/runtime_test.go @@ -0,0 +1,316 @@ +package runtimecfg + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/sarchlab/zeonica/core" +) + +func TestResolveExecutionPolicyDefaultsToInOrder(t *testing.T) { + cfg, err := Resolve(ArchSpec{}, "policy-default") + if err != nil { + t.Fatalf("Resolve returned error: %v", err) + } + if cfg.ExecutionPolicy != "in_order_dataflow" { + t.Fatalf("unexpected default execution policy: %q", cfg.ExecutionPolicy) + } +} + +func TestResolveExecutionPolicyAlias(t *testing.T) { + spec := ArchSpec{ + Simulator: Simulator{ + ExecutionPolicy: "hybrid", + }, + } + cfg, err := Resolve(spec, "policy-alias") + if err != nil { + t.Fatalf("Resolve returned error: %v", err) + } + if cfg.ExecutionPolicy != "elastic_scheduled" { + t.Fatalf("unexpected normalized policy: %q", cfg.ExecutionPolicy) + } +} + +func TestResolveExecutionPolicyInvalid(t *testing.T) { + spec := ArchSpec{ + Simulator: Simulator{ + ExecutionPolicy: "unknown_mode", + }, + } + _, err := Resolve(spec, "policy-invalid") + if err == nil { + 
t.Fatal("expected error for invalid policy, got nil") + } + if !strings.Contains(err.Error(), "unsupported execution_policy") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestResolveStrictDefaults(t *testing.T) { + cfg, err := Resolve(ArchSpec{}, "strict-default") + if err != nil { + t.Fatalf("Resolve returned error: %v", err) + } + if cfg.StrictMaxSlip != 4 { + t.Fatalf("unexpected strict max slip: got %d want 4", cfg.StrictMaxSlip) + } + if cfg.StrictFailOnViolation { + t.Fatalf("unexpected strict fail flag: got true want false") + } +} + +func TestResolveStrictEnvOverrides(t *testing.T) { + t.Setenv("ZEONICA_STRICT_MAX_SLIP", "8") + t.Setenv("ZEONICA_STRICT_FAIL_ON_VIOLATION", "true") + + cfg, err := Resolve(ArchSpec{ + Simulator: Simulator{ + StrictMaxSlip: int64Ptr(2), + StrictFailOnViolation: boolPtr(false), + }, + }, "strict-env") + if err != nil { + t.Fatalf("Resolve returned error: %v", err) + } + if cfg.StrictMaxSlip != 8 { + t.Fatalf("unexpected strict max slip from env: got %d want 8", cfg.StrictMaxSlip) + } + if !cfg.StrictFailOnViolation { + t.Fatalf("unexpected strict fail flag from env: got false want true") + } +} + +func TestResolveStrictInvalidEnv(t *testing.T) { + t.Setenv("ZEONICA_STRICT_MAX_SLIP", "bad") + _, err := Resolve(ArchSpec{}, "strict-invalid-env") + if err == nil { + t.Fatal("expected error for invalid strict env") + } + if !strings.Contains(err.Error(), "ZEONICA_STRICT_MAX_SLIP") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestResolveMicroarchitectureDefaults(t *testing.T) { + cfg, err := Resolve(ArchSpec{}, "microarch-defaults") + if err != nil { + t.Fatalf("Resolve returned error: %v", err) + } + + if cfg.DriverPortIncomingBufferDepth != 1 || cfg.DriverPortOutgoingBufferDepth != 1 { + t.Fatalf( + "unexpected driver port depth defaults: in=%d out=%d", + cfg.DriverPortIncomingBufferDepth, + cfg.DriverPortOutgoingBufferDepth, + ) + } + if cfg.CorePortIncomingBufferDepth != 1 || cfg.CorePortOutgoingBufferDepth 
!= 1 { + t.Fatalf( + "unexpected core port depth defaults: in=%d out=%d", + cfg.CorePortIncomingBufferDepth, + cfg.CorePortOutgoingBufferDepth, + ) + } + if cfg.NumRegisters != 64 || cfg.LocalMemoryWords != 1024 { + t.Fatalf("unexpected tile defaults: regs=%d mem=%d", cfg.NumRegisters, cfg.LocalMemoryWords) + } + if cfg.MemoryMode != "simple" { + t.Fatalf("unexpected memory mode default: %q", cfg.MemoryMode) + } + if cfg.LinkLatency != 1 || cfg.LinkBandwidth != 32 { + t.Fatalf("unexpected link defaults: latency=%d bandwidth=%d", cfg.LinkLatency, cfg.LinkBandwidth) + } + if cfg.LinkTimingModel != "parse_only" { + t.Fatalf("unexpected link timing model: %q", cfg.LinkTimingModel) + } + if cfg.EnableFIFOModel { + t.Fatalf("unexpected fifo model default: got true want false") + } + if cfg.ProgramYAML != "" || cfg.ReportName != "" || len(cfg.QueueWatches) != 0 || len(cfg.BufferSweepDepths) != 0 { + t.Fatalf( + "unexpected experiment defaults: program=%q report=%q watches=%d depths=%d", + cfg.ProgramYAML, + cfg.ReportName, + len(cfg.QueueWatches), + len(cfg.BufferSweepDepths), + ) + } +} + +func TestResolveMicroarchitectureOverrides(t *testing.T) { + spec := ArchSpec{ + CGRADefaults: CGRADefaults{Rows: 2, Columns: 2}, + TileDefaults: TileDefaults{NumRegisters: 96, LocalMemoryWords: 2048}, + LinkDefaults: LinkDefaults{Latency: intPtr(3), Bandwidth: intPtr(128)}, + Simulator: Simulator{ + EnableFIFOModel: boolPtr(true), + Driver: NamedComponent{ + PortIncomingBufferDepth: intPtr(4), + PortOutgoingBufferDepth: intPtr(5), + }, + Device: DeviceComponent{ + MemoryMode: "shared", + PortIncomingBufferDepth: intPtr(6), + PortOutgoingBufferDepth: intPtr(7), + MemoryShare: []MemoryShareEntry{{TileX: 1, TileY: 1, Group: 9}}, + }, + }, + } + + cfg, err := Resolve(spec, "microarch-overrides") + if err != nil { + t.Fatalf("Resolve returned error: %v", err) + } + + if cfg.DriverPortIncomingBufferDepth != 4 || cfg.DriverPortOutgoingBufferDepth != 5 { + t.Fatalf( + "driver buffer depth 
override failed: in=%d out=%d", + cfg.DriverPortIncomingBufferDepth, + cfg.DriverPortOutgoingBufferDepth, + ) + } + if cfg.CorePortIncomingBufferDepth != 6 || cfg.CorePortOutgoingBufferDepth != 7 { + t.Fatalf( + "core buffer depth override failed: in=%d out=%d", + cfg.CorePortIncomingBufferDepth, + cfg.CorePortOutgoingBufferDepth, + ) + } + if cfg.NumRegisters != 96 || cfg.LocalMemoryWords != 2048 { + t.Fatalf("tile override failed: regs=%d mem=%d", cfg.NumRegisters, cfg.LocalMemoryWords) + } + if cfg.MemoryMode != "shared" { + t.Fatalf("memory mode override failed: %q", cfg.MemoryMode) + } + if len(cfg.MemoryShare) != 4 { + t.Fatalf("shared mode should materialize full 2x2 map, got %d", len(cfg.MemoryShare)) + } + if got := cfg.MemoryShare[[2]int{1, 1}]; got != 9 { + t.Fatalf("memory_share override for (1,1) failed: got %d want 9", got) + } + if cfg.LinkLatency != 3 || cfg.LinkBandwidth != 128 { + t.Fatalf("link override failed: latency=%d bandwidth=%d", cfg.LinkLatency, cfg.LinkBandwidth) + } + if !cfg.EnableFIFOModel { + t.Fatalf("fifo model override failed: got false want true") + } +} + +func TestResolveMicroarchitectureInvalidDepth(t *testing.T) { + spec := ArchSpec{ + Simulator: Simulator{ + Driver: NamedComponent{PortIncomingBufferDepth: intPtr(0)}, + }, + } + _, err := Resolve(spec, "microarch-invalid-depth") + if err == nil { + t.Fatal("expected invalid depth error") + } + if !strings.Contains(err.Error(), "port_incoming_buffer_depth") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestResolveInvalidMemoryMode(t *testing.T) { + spec := ArchSpec{Simulator: Simulator{Device: DeviceComponent{MemoryMode: "foo"}}} + _, err := Resolve(spec, "memory-mode-invalid") + if err == nil { + t.Fatal("expected invalid memory mode error") + } + if !strings.Contains(err.Error(), "unsupported memory_mode") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestResolveInvalidLinkLatency(t *testing.T) { + spec := ArchSpec{LinkDefaults: LinkDefaults{Latency: 
intPtr(-1)}} + _, err := Resolve(spec, "link-latency-invalid") + if err == nil { + t.Fatal("expected invalid link latency error") + } + if !strings.Contains(err.Error(), "link_defaults.latency") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestResolveInvalidMemoryShareCoordinate(t *testing.T) { + spec := ArchSpec{ + CGRADefaults: CGRADefaults{Rows: 2, Columns: 2}, + Simulator: Simulator{ + Device: DeviceComponent{ + MemoryMode: "shared", + MemoryShare: []MemoryShareEntry{{TileX: 3, TileY: 0, Group: 0}}, + }, + }, + } + _, err := Resolve(spec, "memory-share-invalid") + if err == nil { + t.Fatal("expected invalid memory share coordinate error") + } + if !strings.Contains(err.Error(), "out-of-range tile") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestResolveWithSpecPathExperimentConfig(t *testing.T) { + specPath := filepath.Join(t.TempDir(), "base_arch_spec.yaml") + spec := ArchSpec{ + Simulator: Simulator{ + ProgramYAML: "fir+histogram/tmp-generated-instructions.yaml", + ReportName: "fir_histogram", + BufferSweepDepths: []int{1, 2, 4, 8, 16}, + QueueWatches: []core.QueueWatchSpec{ + {Label: "hist_upstream", X: 1, Y: 1, Kind: "recv", Direction: "West", Color: "RED"}, + {Label: "hist_downstream", X: 2, Y: 1, Kind: "recv", Direction: "West", Color: "RED"}, + }, + }, + } + + cfg, err := ResolveWithSpecPath(spec, specPath, "") + if err != nil { + t.Fatalf("ResolveWithSpecPath returned error: %v", err) + } + + expectedProgram := filepath.Join(filepath.Dir(specPath), "fir+histogram", "tmp-generated-instructions.yaml") + if cfg.ProgramYAML != expectedProgram { + t.Fatalf("unexpected resolved program path: got %q want %q", cfg.ProgramYAML, expectedProgram) + } + if cfg.ReportName != "fir_histogram" { + t.Fatalf("unexpected report name: %q", cfg.ReportName) + } + if cfg.TestName != "fir_histogram" { + t.Fatalf("expected report_name to seed test name, got %q", cfg.TestName) + } + if len(cfg.QueueWatches) != 2 { + t.Fatalf("unexpected queue watch 
count: %d", len(cfg.QueueWatches)) + } + if len(cfg.BufferSweepDepths) != 5 { + t.Fatalf("unexpected buffer sweep depth count: %d", len(cfg.BufferSweepDepths)) + } +} + +func TestResolveWithSpecPathRejectsInvalidQueueWatch(t *testing.T) { + spec := ArchSpec{ + Simulator: Simulator{ + QueueWatches: []core.QueueWatchSpec{ + {Label: "bad", X: 0, Y: 0, Kind: "recv", Direction: "Bogus", Color: "RED"}, + }, + }, + } + + _, err := ResolveWithSpecPath(spec, "", "invalid-watch") + if err == nil { + t.Fatal("expected invalid queue watch error") + } + if !strings.Contains(err.Error(), "simulator.queue_watches") { + t.Fatalf("unexpected error: %v", err) + } +} + +func int64Ptr(v int64) *int64 { return &v } + +func boolPtr(v bool) *bool { return &v } + +func intPtr(v int) *int { return &v } diff --git a/runtimecfg/spec.go b/runtimecfg/spec.go index 1325203..ba183a6 100644 --- a/runtimecfg/spec.go +++ b/runtimecfg/spec.go @@ -5,6 +5,7 @@ import ( "fmt" "os" + "github.com/sarchlab/zeonica/core" "gopkg.in/yaml.v3" ) @@ -13,6 +14,8 @@ import ( // extension without changing callers. type ArchSpec struct { CGRADefaults CGRADefaults `yaml:"cgra_defaults"` + TileDefaults TileDefaults `yaml:"tile_defaults"` + LinkDefaults LinkDefaults `yaml:"link_defaults"` Simulator Simulator `yaml:"simulator"` Extra map[string]any `yaml:",inline"` } @@ -24,35 +27,74 @@ type CGRADefaults struct { Extra map[string]any `yaml:",inline"` } +// TileDefaults defines default per-tile microarchitecture parameters. +type TileDefaults struct { + NumRegisters int `yaml:"num_registers"` + LocalMemoryWords int `yaml:"local_memory_words"` + Extra map[string]any `yaml:",inline"` +} + +// LinkDefaults captures inter-tile link metadata. This release parses and validates +// these fields, but does not feed them into cycle-accurate link timing yet. 
+type LinkDefaults struct { + Latency *int `yaml:"latency"` + Bandwidth *int `yaml:"bandwidth"` + Extra map[string]any `yaml:",inline"` +} + // Simulator contains simulator runtime settings from arch spec. type Simulator struct { - ExecutionModel string `yaml:"execution_model"` - Logging SimulatorLogging `yaml:"logging"` - Driver NamedComponent `yaml:"driver"` - Device DeviceComponent `yaml:"device"` - Extra map[string]any `yaml:",inline"` + ExecutionModel string `yaml:"execution_model"` + ExecutionPolicy string `yaml:"execution_policy"` + EnableFIFOModel *bool `yaml:"enable_fifo_model"` + EnableQueueWatches *bool `yaml:"enable_queue_watches"` + ProgramYAML string `yaml:"program_yaml"` + ReportName string `yaml:"report_name"` + QueueWatches []core.QueueWatchSpec `yaml:"queue_watches"` + BufferSweepDepths []int `yaml:"buffer_sweep_depths"` + StrictMaxSlip *int64 `yaml:"strict_max_slip"` + StrictFailOnViolation *bool `yaml:"strict_fail_on_violation"` + Logging SimulatorLogging `yaml:"logging"` + Driver NamedComponent `yaml:"driver"` + Device DeviceComponent `yaml:"device"` + Extra map[string]any `yaml:",inline"` } // SimulatorLogging configures trace logging behavior. type SimulatorLogging struct { - Enabled *bool `yaml:"enabled"` - File string `yaml:"file"` - Extra map[string]any `yaml:",inline"` + Enabled *bool `yaml:"enabled"` + EnableTrace *bool `yaml:"enableTrace"` + File string `yaml:"file"` + Extra map[string]any `yaml:",inline"` } // NamedComponent contains shared component naming/frequency fields. type NamedComponent struct { - Name string `yaml:"name"` - Frequency string `yaml:"frequency"` - Extra map[string]any `yaml:",inline"` + Name string `yaml:"name"` + Frequency string `yaml:"frequency"` + PortIncomingBufferDepth *int `yaml:"port_incoming_buffer_depth"` + PortOutgoingBufferDepth *int `yaml:"port_outgoing_buffer_depth"` + Extra map[string]any `yaml:",inline"` +} + +// MemoryShareEntry maps one tile coordinate to a shared-memory controller group. 
+type MemoryShareEntry struct { + TileX int `yaml:"tile_x"` + TileY int `yaml:"tile_y"` + Group int `yaml:"group"` + Extra map[string]any `yaml:",inline"` } // DeviceComponent defines simulator device-specific settings. type DeviceComponent struct { - Name string `yaml:"name"` - Frequency string `yaml:"frequency"` - BindToArchitecture *bool `yaml:"bind_to_architecture"` - Extra map[string]any `yaml:",inline"` + Name string `yaml:"name"` + Frequency string `yaml:"frequency"` + BindToArchitecture *bool `yaml:"bind_to_architecture"` + MemoryMode string `yaml:"memory_mode"` + MemoryShare []MemoryShareEntry `yaml:"memory_share"` + PortIncomingBufferDepth *int `yaml:"port_incoming_buffer_depth"` + PortOutgoingBufferDepth *int `yaml:"port_outgoing_buffer_depth"` + Extra map[string]any `yaml:",inline"` } // Load reads and parses an architecture spec YAML file. diff --git a/test/arch_spec/arch_spec.yaml b/test/arch_spec/arch_spec.yaml index acb94d3..1dc6c53 100644 --- a/test/arch_spec/arch_spec.yaml +++ b/test/arch_spec/arch_spec.yaml @@ -15,6 +15,7 @@ cgra_defaults: tile_defaults: num_registers: 32 + local_memory_words: 2048 fu_types: ["add", "mul", "div", "fadd", "fmul", "fdiv", "logic", "cmp", "sel", "type_conv", "vfmul", "fadd_fadd", "fmul_fadd", "grant", "loop_control", "phi", "constant", "mem", "return", "mem_indexed", "alloca", "shift"] link_defaults: @@ -45,16 +46,38 @@ extensions: simulator: execution_model: "serial" + execution_policy: "in_order_dataflow" + enable_fifo_model: false + # three policies: strict_timed, elastic_scheduled, in_order_dataflow logging: enabled: true + enableTrace: true file: ".json.log" driver: name: "Driver" frequency: "1GHz" + port_incoming_buffer_depth: 4 + port_outgoing_buffer_depth: 4 device: name: "Device" frequency: "1GHz" bind_to_architecture: true + memory_mode: "local" + port_incoming_buffer_depth: 8 + port_outgoing_buffer_depth: 8 + memory_share: + - tile_x: 0 + tile_y: 0 + group: 0 + - tile_x: 1 + tile_y: 0 + group: 0 + - tile_x: 
2 + tile_y: 0 + group: 1 + - tile_x: 3 + tile_y: 0 + group: 1 diff --git a/test/testbench/axpy/main.go b/test/testbench/axpy/main.go index 11c1f53..07d4bbf 100644 --- a/test/testbench/axpy/main.go +++ b/test/testbench/axpy/main.go @@ -177,13 +177,9 @@ func main() { } passed := mismatch == 0 - if rt.Config.LoggingEnabled { - reportPath, err := rt.GenerateSaveAndPrintReport(5, &passed, &mismatch) - if err != nil { - panic(err) - } - fmt.Printf("report saved: %s\n", reportPath) - } else { - fmt.Println("logging disabled in arch spec, skipped report generation") + reportPath, err := rt.GenerateSaveAndPrintReport(5, &passed, &mismatch) + if err != nil { + panic(err) } + fmt.Printf("report saved: %s\n", reportPath) } diff --git a/test/testbench/branch_for/main.go b/test/testbench/branch_for/main.go index dfc7055..02b80bc 100644 --- a/test/testbench/branch_for/main.go +++ b/test/testbench/branch_for/main.go @@ -146,13 +146,9 @@ func main() { } passed := mismatch == 0 - if rt.Config.LoggingEnabled { - reportPath, err := rt.GenerateSaveAndPrintReport(5, &passed, &mismatch) - if err != nil { - panic(err) - } - fmt.Printf("report saved: %s\n", reportPath) - } else { - fmt.Println("logging disabled in arch spec, skipped report generation") + reportPath, err := rt.GenerateSaveAndPrintReport(5, &passed, &mismatch) + if err != nil { + panic(err) } + fmt.Printf("report saved: %s\n", reportPath) } diff --git a/test/testbench/fir/main.go b/test/testbench/fir/main.go index 93197bd..b5a657b 100644 --- a/test/testbench/fir/main.go +++ b/test/testbench/fir/main.go @@ -166,13 +166,9 @@ func main() { } passed := mismatchCount == 0 - if rt.Config.LoggingEnabled { - reportPath, err := rt.GenerateSaveAndPrintReport(5, &passed, &mismatchCount) - if err != nil { - panic(err) - } - fmt.Printf("report saved: %s\n", reportPath) - } else { - fmt.Println("logging disabled in arch spec, skipped report generation") + reportPath, err := rt.GenerateSaveAndPrintReport(5, &passed, &mismatchCount) + if 
err != nil { + panic(err) } + fmt.Printf("report saved: %s\n", reportPath) } diff --git a/test/testbench/histogram/main.go b/test/testbench/histogram/main.go index 3376de8..4d3937a 100644 --- a/test/testbench/histogram/main.go +++ b/test/testbench/histogram/main.go @@ -13,8 +13,158 @@ import ( "github.com/sarchlab/akita/v4/sim" "github.com/sarchlab/zeonica/core" "github.com/sarchlab/zeonica/runtimecfg" + "gopkg.in/yaml.v3" ) +// YAML shapes mirror core/program.go (LoadProgramFileFromYAML) for patching only. + +type histogramYAMLRoot struct { + ArrayConfig histogramArrayConfig `yaml:"array_config"` +} + +type histogramArrayConfig struct { + Rows int `yaml:"rows"` + Cols int `yaml:"columns"` + CompiledII int `yaml:"compiled_ii"` + Cores []histogramYAMLCore `yaml:"cores"` +} + +type histogramYAMLCore struct { + Row int `yaml:"row"` + Column int `yaml:"column"` + CoreID string `yaml:"core_id"` + Entries []histogramYAMLEntry `yaml:"entries"` +} + +type histogramYAMLEntry struct { + EntryID string `yaml:"entry_id"` + Type string `yaml:"type"` + InstructionGroups []histogramYAMLInstGroup `yaml:"instructions"` +} + +type histogramYAMLInstGroup struct { + Operations []histogramYAMLOperation `yaml:"operations"` + IndexPerII int `yaml:"index_per_ii"` +} + +type histogramYAMLOperation struct { + OpCode string `yaml:"opcode"` + SrcOperands []histogramYAMLOperand `yaml:"src_operands"` + DstOperands []histogramYAMLOperand `yaml:"dst_operands"` + ID int `yaml:"id"` + InvalidIterations int `yaml:"invalid_iterations"` + TimeStep int `yaml:"time_step"` +} + +type histogramYAMLOperand struct { + Operand string `yaml:"operand"` + Color string `yaml:"color"` +} + +// gepArgReplacements maps LLVM-style kernel parameters to immediates for GEP. +// +// Matches histogram_int.cpp: +// +// void kernel(int input[], int histogram[]) +// +// arg0 — base of input[] (input_data); testbench preloads at tile (1,0) starting offset 0. 
+// arg1 — base of histogram[]; preloads at tile (0,1) starting offset 0. +// +// Override with ZEONICA_GEP_ARG0 / ZEONICA_GEP_ARG1 (e.g. "0" or "#0"). +func gepArgReplacements() map[string]string { + m := make(map[string]string) + if v := strings.TrimSpace(os.Getenv("ZEONICA_GEP_ARG0")); v != "" { + m["arg0"] = normalizeImmediateYAMLOperand(v) + } else { + m["arg0"] = "#0" + } + if v := strings.TrimSpace(os.Getenv("ZEONICA_GEP_ARG1")); v != "" { + m["arg1"] = normalizeImmediateYAMLOperand(v) + } else { + m["arg1"] = "#0" + } + return m +} + +func normalizeImmediateYAMLOperand(s string) string { + s = strings.TrimSpace(s) + if strings.HasPrefix(s, "#") { + return s + } + return "#" + s +} + +// patchGEPArgOperands replaces arg0/arg1 in GEP source operands with immediates. +// Zeonica's readOperand only accepts $reg, ports, or numeric immediates — not symbolic args. +func patchGEPArgOperands(root *histogramYAMLRoot, repl map[string]string) bool { + changed := false + for ci := range root.ArrayConfig.Cores { + core := &root.ArrayConfig.Cores[ci] + for ei := range core.Entries { + entry := &core.Entries[ei] + for gi := range entry.InstructionGroups { + group := &entry.InstructionGroups[gi] + for oi := range group.Operations { + op := &group.Operations[oi] + if op.OpCode != "GEP" { + continue + } + for si := range op.SrcOperands { + src := &op.SrcOperands[si] + if newOp, ok := repl[src.Operand]; ok { + src.Operand = newOp + changed = true + } + } + } + } + } + } + return changed +} + +// resolveProgramYAMLWithGEPArgs reads compiler-generated YAML, patches GEP arg operands, and +// returns a path suitable for core.LoadProgramFileFromYAML. If nothing changed, returns the +// original path and a no-op cleanup. 
+func resolveProgramYAMLWithGEPArgs(programPath string) (resolved string, cleanup func()) { + data, err := os.ReadFile(programPath) + if err != nil { + panic(fmt.Sprintf("Failed to read program file %q: %v", programPath, err)) + } + + var root histogramYAMLRoot + if err := yaml.Unmarshal(data, &root); err != nil { + panic(fmt.Sprintf("Failed to parse YAML %q: %v", programPath, err)) + } + + repl := gepArgReplacements() + if !patchGEPArgOperands(&root, repl) { + return programPath, func() {} + } + + out, err := yaml.Marshal(&root) + if err != nil { + panic(err) + } + + tmp, err := os.CreateTemp("", "zeonica-histogram-patched-*.yaml") + if err != nil { + panic(err) + } + path := tmp.Name() + if _, err := tmp.Write(out); err != nil { + _ = tmp.Close() + _ = os.Remove(path) + panic(err) + } + if err := tmp.Close(); err != nil { + _ = os.Remove(path) + panic(err) + } + + return path, func() { _ = os.Remove(path) } +} + // Histogram runs the histogram testbench on the configured runtime. // //nolint:gocyclo,funlen @@ -36,10 +186,12 @@ func Histogram(rt *runtimecfg.Runtime) int { programPath := os.Getenv("ZEONICA_PROGRAM_YAML") if programPath == "" { - //programPath = "test/Zeonica_Testbench/kernel/histogram/histogram-instructions.yaml" programPath = "tmp-generated-instructions.yaml" } - program := core.LoadProgramFileFromYAML(programPath) + resolvedPath, cleanupYAML := resolveProgramYAMLWithGEPArgs(programPath) + defer cleanupYAML() + + program := core.LoadProgramFileFromYAML(resolvedPath) fmt.Println("program:", program) if len(program) == 0 { @@ -62,13 +214,27 @@ func Histogram(rt *runtimecfg.Runtime) int { } expected := computeHistogram(inputData, 5) - // histogram tile (2,1): initialize histogram[0..4] to 0 - for addr := 0; addr < 5; addr++ { - driver.PreloadMemory(2, 1, 0, uint32(addr)) + // Tile layout must match the mapped kernel in tmp-generated-instructions.yaml (and .asm): + // LOAD / input GEP live on core column=1, row=0 -> tile (1,0) + // STORE / histogram 
GEP on column=0, row=1 -> tile (0,1) + // Older hand-tuned testbenches used (3,2)/(2,1); compiler-generated mapping differs. + const ( + inputTileX = 1 + inputTileY = 0 + histTileX = 0 + histTileY = 1 + histBins = 5 + inputDataLen = 20 + ) + + for addr := 0; addr < histBins; addr++ { + driver.PreloadMemory(histTileX, histTileY, 0, uint32(addr)) } - // data tile (3,2): input_data[0..19] for addr, val := range inputData { - driver.PreloadMemory(3, 2, val, uint32(addr)) + if addr >= inputDataLen { + break + } + driver.PreloadMemory(inputTileX, inputTileY, val, uint32(addr)) } // fire all the cores in the beginning @@ -86,17 +252,15 @@ func Histogram(rt *runtimecfg.Runtime) int { fmt.Println("========================") fmt.Println("========================") - // print output memory data - outputTile := [2]int{2, 1} + // Histogram results written by STORE on tile (0,1); read same tile. + outputTile := [2]int{histTileX, histTileY} fmt.Printf("output memory @ tile (%d,%d):\n", outputTile[0], outputTile[1]) - scanLimit := 5 + scanLimit := histBins outputData := make([]uint32, scanLimit) for addr := 0; addr < scanLimit; addr++ { val := driver.ReadMemory(outputTile[0], outputTile[1], uint32(addr)) outputData[addr] = val - if addr < len(inputData) { - fmt.Printf(" addr %d -> %d\n", addr, val) - } + fmt.Printf(" addr %d -> %d\n", addr, val) } fmt.Println("expected histogram (CPU):") @@ -198,13 +362,9 @@ func main() { } passed := mismatch == 0 - if rt.Config.LoggingEnabled { - reportPath, err := rt.GenerateSaveAndPrintReport(5, &passed, &mismatch) - if err != nil { - panic(err) - } - fmt.Printf("report saved: %s\n", reportPath) - } else { - fmt.Println("logging disabled in arch spec, skipped report generation") + reportPath, err := rt.GenerateSaveAndPrintReport(5, &passed, &mismatch) + if err != nil { + panic(err) } + fmt.Printf("report saved: %s\n", reportPath) } diff --git a/test/testbench/histogram/tmp-generated-dfg.png b/test/testbench/histogram/tmp-generated-dfg.png new 
file mode 100644 index 0000000..303e2a4 Binary files /dev/null and b/test/testbench/histogram/tmp-generated-dfg.png differ diff --git a/test/testbench/histogram/tmp-generated-instructions.asm b/test/testbench/histogram/tmp-generated-instructions.asm index 815642b..eec2cd2 100644 --- a/test/testbench/histogram/tmp-generated-instructions.asm +++ b/test/testbench/histogram/tmp-generated-instructions.asm @@ -1,75 +1,72 @@ -# Compiled II: 5 +# Compiled II: 6 -PE(2,1): +PE(0,0): { - ADD, [$0], [#1] -> [$0] (t=10, inv_iters=2) - DATA_MOV, [EAST, RED] -> [$1] (t=10, inv_iters=2) + GRANT_ONCE, [#0] -> [EAST, RED] (t=0, inv_iters=0) } (idx_per_ii=0) { - STORE, [$0], [$1] (t=11, inv_iters=2) -} (idx_per_ii=1) -{ - LOAD, [EAST, RED] -> [$0] (t=9, inv_iters=1) -} (idx_per_ii=4) + ADD, [EAST, RED], [#-5] -> [NORTH, RED] (t=5, inv_iters=0) +} (idx_per_ii=5) -PE(3,1): +PE(1,0): { - ADD, [NORTH, RED], [#-5] -> [$0] (t=5, inv_iters=1) + CTRL_MOV, [NORTH, RED] -> [$0] (t=6, inv_iters=1) } (idx_per_ii=0) { - DIV, [$0], [#18] -> [$0] (t=6, inv_iters=1) + PHI_START, [WEST, RED], [$0] -> [$0], [NORTH, RED] (t=1, inv_iters=0) } (idx_per_ii=1) { - SEXT, [$0] -> [$0] (t=7, inv_iters=1) + GEP, [arg0], [$0] -> [$0] (t=2, inv_iters=0) } (idx_per_ii=2) { - GEP, [$0] -> [WEST, RED], [$0] (t=8, inv_iters=1) + LOAD, [$0] -> [$0] (t=3, inv_iters=0) } (idx_per_ii=3) { - DATA_MOV, [$0] -> [WEST, RED] (t=9, inv_iters=1) -} (idx_per_ii=4) - -PE(1,2): -{ - DATA_MOV, [EAST, RED] -> [$1] (t=5, inv_iters=1) - GRANT_PREDICATE, [$0], [$1] -> [$2] (t=10, inv_iters=2) -} (idx_per_ii=0) -{ - RETURN_VOID, [$2] (t=11, inv_iters=2) -} (idx_per_ii=1) -{ - DATA_MOV, [EAST, RED] -> [$0] (t=4, inv_iters=0) + MUL, [$0], [#5] -> [WEST, RED] (t=4, inv_iters=0) } (idx_per_ii=4) -PE(2,2): +PE(0,1): { - GRANT_PREDICATE, [$1], [$0] -> [$0] (t=5, inv_iters=1) + DIV, [SOUTH, RED], [#18] -> [$0] (t=6, inv_iters=1) } (idx_per_ii=0) { - PHI_START, [EAST, RED], [$0] -> [EAST, RED], [$0] (t=1, inv_iters=0) + SEXT, [$0] -> [$0] 
(t=7, inv_iters=1) } (idx_per_ii=1) { - ADD, [$0], [#1] -> [$0], [$1] (t=2, inv_iters=0) + GEP, [arg1], [$0] -> [$0], [$1] (t=8, inv_iters=1) } (idx_per_ii=2) { - ICMP_EQ, [$0], [#20] -> [$0], [WEST, RED], [$2] (t=3, inv_iters=0) + LOAD, [$0] -> [$0] (t=9, inv_iters=1) } (idx_per_ii=3) { - NOT, [$0] -> [$0] (t=4, inv_iters=0) - DATA_MOV, [$2] -> [WEST, RED] (t=4, inv_iters=0) + ADD, [$0], [#1] -> [$0] (t=10, inv_iters=1) } (idx_per_ii=4) - -PE(3,2): { - GRANT_ONCE, [#0] -> [WEST, RED] (t=0, inv_iters=0) -} (idx_per_ii=0) + STORE, [$0], [$1] (t=11, inv_iters=1) +} (idx_per_ii=5) + +PE(1,1): { - GEP, [WEST, RED] -> [$0] (t=2, inv_iters=0) + ADD, [SOUTH, RED], [#1] -> [$0], [$1] (t=2, inv_iters=0) } (idx_per_ii=2) { - LOAD, [$0] -> [$0] (t=3, inv_iters=0) + ICMP_EQ, [$0], [#20] -> [$0], [NORTH, RED], [$2] (t=3, inv_iters=0) } (idx_per_ii=3) { - MUL, [$0], [#5] -> [SOUTH, RED] (t=4, inv_iters=0) + NOT, [$0] -> [$0] (t=4, inv_iters=0) + DATA_MOV, [$2] -> [NORTH, RED] (t=4, inv_iters=0) } (idx_per_ii=4) +{ + GRANT_PREDICATE, [$1], [$0] -> [SOUTH, RED] (t=5, inv_iters=0) +} (idx_per_ii=5) + +PE(1,2): +{ + DATA_MOV, [SOUTH, RED] -> [$0] (t=4, inv_iters=0) + GRANT_PREDICATE, [$0], [$1] -> [$1] (t=10, inv_iters=1) +} (idx_per_ii=4) +{ + DATA_MOV, [SOUTH, RED] -> [$1] (t=5, inv_iters=0) + RETURN_VOID, [$1] (t=11, inv_iters=1) +} (idx_per_ii=5) diff --git a/test/testbench/histogram/tmp-generated-instructions.yaml b/test/testbench/histogram/tmp-generated-instructions.yaml index aff64b0..3432b84 100644 --- a/test/testbench/histogram/tmp-generated-instructions.yaml +++ b/test/testbench/histogram/tmp-generated-instructions.yaml @@ -1,102 +1,83 @@ array_config: columns: 4 rows: 4 - compiled_ii: 5 + compiled_ii: 6 cores: - - column: 2 - row: 1 - core_id: "6" + - column: 0 + row: 0 + core_id: "0" entries: - entry_id: "entry0" instructions: - index_per_ii: 0 operations: - - opcode: "ADD" - id: 38 - time_step: 10 - invalid_iterations: 2 + - opcode: "GRANT_ONCE" + id: 0 + time_step: 0 + 
invalid_iterations: 0 src_operands: - - operand: "$0" - color: "RED" - - operand: "#1" + - operand: "#0" color: "RED" dst_operands: - - operand: "$0" - color: "RED" - - opcode: "DATA_MOV" - id: 34 - time_step: 10 - invalid_iterations: 2 - src_operands: - operand: "EAST" color: "RED" - dst_operands: - - operand: "$1" - color: "RED" - - index_per_ii: 1 - operations: - - opcode: "STORE" - id: 40 - time_step: 11 - invalid_iterations: 2 - src_operands: - - operand: "$0" - color: "RED" - - operand: "$1" - color: "RED" - - index_per_ii: 4 + - index_per_ii: 5 operations: - - opcode: "LOAD" - id: 36 - time_step: 9 - invalid_iterations: 1 + - opcode: "ADD" + id: 26 + time_step: 5 + invalid_iterations: 0 src_operands: - operand: "EAST" color: "RED" + - operand: "#-5" + color: "RED" dst_operands: - - operand: "$0" + - operand: "NORTH" color: "RED" - - column: 3 - row: 1 - core_id: "7" + - column: 1 + row: 0 + core_id: "1" entries: - entry_id: "entry0" instructions: - index_per_ii: 0 operations: - - opcode: "ADD" - id: 26 - time_step: 5 + - opcode: "CTRL_MOV" + id: 27 + time_step: 6 invalid_iterations: 1 src_operands: - operand: "NORTH" color: "RED" - - operand: "#-5" - color: "RED" dst_operands: - operand: "$0" color: "RED" - index_per_ii: 1 operations: - - opcode: "DIV" - id: 29 - time_step: 6 - invalid_iterations: 1 + - opcode: "PHI_START" + id: 4 + time_step: 1 + invalid_iterations: 0 src_operands: - - operand: "$0" + - operand: "WEST" color: "RED" - - operand: "#18" + - operand: "$0" color: "RED" dst_operands: - operand: "$0" color: "RED" + - operand: "NORTH" + color: "RED" - index_per_ii: 2 operations: - - opcode: "SEXT" - id: 31 - time_step: 7 - invalid_iterations: 1 + - opcode: "GEP" + id: 8 + time_step: 2 + invalid_iterations: 0 src_operands: + - operand: "arg0" + color: "RED" - operand: "$0" color: "RED" dst_operands: @@ -104,117 +85,121 @@ array_config: color: "RED" - index_per_ii: 3 operations: - - opcode: "GEP" - id: 33 - time_step: 8 - invalid_iterations: 1 + - 
opcode: "LOAD" + id: 13 + time_step: 3 + invalid_iterations: 0 src_operands: - operand: "$0" color: "RED" dst_operands: - - operand: "WEST" - color: "RED" - operand: "$0" color: "RED" - index_per_ii: 4 operations: - - opcode: "DATA_MOV" - id: 340000 - time_step: 9 - invalid_iterations: 1 + - opcode: "MUL" + id: 20 + time_step: 4 + invalid_iterations: 0 src_operands: - operand: "$0" color: "RED" + - operand: "#5" + color: "RED" dst_operands: - operand: "WEST" color: "RED" - - column: 1 - row: 2 - core_id: "9" + - column: 0 + row: 1 + core_id: "4" entries: - entry_id: "entry0" instructions: - index_per_ii: 0 operations: - - opcode: "DATA_MOV" - id: 15 - time_step: 5 + - opcode: "DIV" + id: 29 + time_step: 6 invalid_iterations: 1 src_operands: - - operand: "EAST" + - operand: "SOUTH" + color: "RED" + - operand: "#18" color: "RED" dst_operands: - - operand: "$1" + - operand: "$0" color: "RED" - - opcode: "GRANT_PREDICATE" - id: 18 - time_step: 10 - invalid_iterations: 2 + - index_per_ii: 1 + operations: + - opcode: "SEXT" + id: 31 + time_step: 7 + invalid_iterations: 1 src_operands: - operand: "$0" color: "RED" - - operand: "$1" - color: "RED" dst_operands: - - operand: "$2" + - operand: "$0" color: "RED" - - index_per_ii: 1 + - index_per_ii: 2 operations: - - opcode: "RETURN_VOID" - id: 24 - time_step: 11 - invalid_iterations: 2 + - opcode: "GEP" + id: 33 + time_step: 8 + invalid_iterations: 1 src_operands: - - operand: "$2" + - operand: "arg1" color: "RED" - - index_per_ii: 4 - operations: - - opcode: "DATA_MOV" - id: 14 - time_step: 4 - invalid_iterations: 0 - src_operands: - - operand: "EAST" + - operand: "$0" color: "RED" dst_operands: - operand: "$0" color: "RED" - - column: 2 - row: 2 - core_id: "10" - entries: - - entry_id: "entry0" - instructions: - - index_per_ii: 0 + - operand: "$1" + color: "RED" + - index_per_ii: 3 operations: - - opcode: "GRANT_PREDICATE" - id: 25 - time_step: 5 + - opcode: "LOAD" + id: 36 + time_step: 9 invalid_iterations: 1 
src_operands: - - operand: "$1" - color: "RED" - operand: "$0" color: "RED" dst_operands: - operand: "$0" color: "RED" - - index_per_ii: 1 + - index_per_ii: 4 operations: - - opcode: "PHI_START" - id: 4 - time_step: 1 - invalid_iterations: 0 + - opcode: "ADD" + id: 38 + time_step: 10 + invalid_iterations: 1 src_operands: - - operand: "EAST" - color: "RED" - operand: "$0" color: "RED" + - operand: "#1" + color: "RED" dst_operands: - - operand: "EAST" + - operand: "$0" color: "RED" + - index_per_ii: 5 + operations: + - opcode: "STORE" + id: 40 + time_step: 11 + invalid_iterations: 1 + src_operands: - operand: "$0" color: "RED" + - operand: "$1" + color: "RED" + - column: 1 + row: 1 + core_id: "5" + entries: + - entry_id: "entry0" + instructions: - index_per_ii: 2 operations: - opcode: "ADD" @@ -222,7 +207,7 @@ array_config: time_step: 2 invalid_iterations: 0 src_operands: - - operand: "$0" + - operand: "SOUTH" color: "RED" - operand: "#1" color: "RED" @@ -245,7 +230,7 @@ array_config: dst_operands: - operand: "$0" color: "RED" - - operand: "WEST" + - operand: "NORTH" color: "RED" - operand: "$2" color: "RED" @@ -269,61 +254,68 @@ array_config: - operand: "$2" color: "RED" dst_operands: - - operand: "WEST" + - operand: "NORTH" color: "RED" - - column: 3 - row: 2 - core_id: "11" - entries: - - entry_id: "entry0" - instructions: - - index_per_ii: 0 + - index_per_ii: 5 operations: - - opcode: "GRANT_ONCE" - id: 0 - time_step: 0 + - opcode: "GRANT_PREDICATE" + id: 25 + time_step: 5 invalid_iterations: 0 src_operands: - - operand: "#0" + - operand: "$1" + color: "RED" + - operand: "$0" color: "RED" dst_operands: - - operand: "WEST" + - operand: "SOUTH" color: "RED" - - index_per_ii: 2 + - column: 1 + row: 2 + core_id: "9" + entries: + - entry_id: "entry0" + instructions: + - index_per_ii: 4 operations: - - opcode: "GEP" - id: 8 - time_step: 2 + - opcode: "DATA_MOV" + id: 14 + time_step: 4 invalid_iterations: 0 src_operands: - - operand: "WEST" + - operand: "SOUTH" color: 
"RED" dst_operands: - operand: "$0" color: "RED" - - index_per_ii: 3 - operations: - - opcode: "LOAD" - id: 13 - time_step: 3 - invalid_iterations: 0 + - opcode: "GRANT_PREDICATE" + id: 18 + time_step: 10 + invalid_iterations: 1 src_operands: - operand: "$0" color: "RED" + - operand: "$1" + color: "RED" dst_operands: - - operand: "$0" + - operand: "$1" color: "RED" - - index_per_ii: 4 + - index_per_ii: 5 operations: - - opcode: "MUL" - id: 20 - time_step: 4 + - opcode: "DATA_MOV" + id: 15 + time_step: 5 invalid_iterations: 0 src_operands: - - operand: "$0" - color: "RED" - - operand: "#5" + - operand: "SOUTH" color: "RED" dst_operands: - - operand: "SOUTH" + - operand: "$1" + color: "RED" + - opcode: "RETURN_VOID" + id: 24 + time_step: 11 + invalid_iterations: 1 + src_operands: + - operand: "$1" color: "RED" diff --git a/test/testbench/policy_behavior/late_arrival.yaml b/test/testbench/policy_behavior/late_arrival.yaml new file mode 100644 index 0000000..4092381 --- /dev/null +++ b/test/testbench/policy_behavior/late_arrival.yaml @@ -0,0 +1,47 @@ +array_config: + columns: 2 + rows: 1 + compiled_ii: 8 + cores: + - column: 0 + row: 0 + core_id: "relay" + entries: + - entry_id: "entry0" + instructions: + - index_per_ii: 0 + operations: + - opcode: "DATA_MOV" + id: 0 + time_step: 0 + invalid_iterations: 0 + src_operands: + - operand: "WEST" + color: "RED" + dst_operands: + - operand: "EAST" + color: "RED" + - column: 1 + row: 0 + core_id: "sink" + entries: + - entry_id: "entry0" + instructions: + - index_per_ii: 0 + operations: + - opcode: "STORE" + id: 1 + time_step: 0 + invalid_iterations: 0 + src_operands: + - operand: "WEST" + color: "RED" + - operand: "0" + color: "RED" + - opcode: "RETURN_VALUE" + id: 2 + time_step: 0 + invalid_iterations: 0 + src_operands: + - operand: "1" + color: "RED" diff --git a/test/testbench/policy_behavior/policy_behavior_test.go b/test/testbench/policy_behavior/policy_behavior_test.go new file mode 100644 index 0000000..208f7df --- 
/dev/null +++ b/test/testbench/policy_behavior/policy_behavior_test.go @@ -0,0 +1,137 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + "testing" + + "github.com/sarchlab/akita/v4/sim" + "github.com/sarchlab/zeonica/cgra" + "github.com/sarchlab/zeonica/core" + "github.com/sarchlab/zeonica/runtimecfg" +) + +type runResult struct { + panicMsg string + memValue uint32 + retValue uint32 + endNS int64 +} + +func resolveScenarioPath(t *testing.T, filename string) string { + t.Helper() + + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + t.Fatalf("cannot resolve current test file path") + } + + path := filepath.Clean(filepath.Join(filepath.Dir(thisFile), filename)) + if _, err := os.Stat(path); err != nil { + t.Fatalf("scenario file %s not found: %v", path, err) + } + return path +} + +func writePolicyArchSpec(t *testing.T, policy string) string { + t.Helper() + + spec := fmt.Sprintf(`cgra_defaults: + rows: 1 + columns: 2 +simulator: + execution_model: "serial" + execution_policy: "%s" + logging: + enabled: false + driver: + name: "Driver" + frequency: "1GHz" + device: + name: "Device" + frequency: "1GHz" + bind_to_architecture: true +`, policy) + + specPath := filepath.Join(t.TempDir(), "arch_spec.yaml") + if err := os.WriteFile(specPath, []byte(spec), 0o600); err != nil { + t.Fatalf("write arch spec: %v", err) + } + return specPath +} + +func runWorkloadWithPolicy(t *testing.T, policy, scenarioPath string) (result runResult) { + t.Helper() + + defer func() { + if recovered := recover(); recovered != nil { + result.panicMsg = fmt.Sprint(recovered) + } + }() + + specPath := writePolicyArchSpec(t, policy) + rt, err := runtimecfg.LoadRuntime(specPath, "policy_behavior_"+policy) + if err != nil { + t.Fatalf("load runtime: %v", err) + } + + program := core.LoadProgramFileFromYAML(scenarioPath) + if len(program) == 0 { + t.Fatalf("empty program map from %s", scenarioPath) + } + + width := rt.Config.Columns + height := rt.Config.Rows + 
for x := 0; x < width; x++ { + for y := 0; y < height; y++ { + coord := fmt.Sprintf("(%d,%d)", x, y) + if prog, exists := program[coord]; exists { + rt.Driver.MapProgram(prog, [2]int{x, y}) + } + } + } + + for x := 0; x < width; x++ { + for y := 0; y < height; y++ { + tile := rt.Device.GetTile(x, y) + rt.Engine.Schedule(sim.MakeTickEvent(tile.GetTickingComponent(), 0)) + } + } + + rt.Driver.FeedIn([]uint32{42}, cgra.West, [2]int{0, 1}, 1, "R") + rt.Driver.Run() + + result.memValue = rt.Driver.ReadMemory(1, 0, 0) + result.retValue = rt.Device.GetTile(1, 0).GetRetVal() + result.endNS = int64(rt.Engine.CurrentTime() * 1e9) + + return result +} + +func TestPolicyBehaviorLateArrival(t *testing.T) { + scenarioPath := resolveScenarioPath(t, "late_arrival.yaml") + + strict := runWorkloadWithPolicy(t, "strict_timed", scenarioPath) + if !strings.Contains(strict.panicMsg, "synchronization violation") { + t.Fatalf("strict_timed should report synchronization violation, got: %q", strict.panicMsg) + } + + elastic := runWorkloadWithPolicy(t, "elastic_scheduled", scenarioPath) + if elastic.panicMsg != "" { + t.Fatalf("elastic_scheduled should tolerate late arrival, got panic: %s", elastic.panicMsg) + } + if elastic.memValue != 42 || elastic.retValue != 1 { + t.Fatalf("elastic_scheduled wrong result: mem=%d ret=%d want mem=42 ret=1", elastic.memValue, elastic.retValue) + } + + inOrder := runWorkloadWithPolicy(t, "in_order_dataflow", scenarioPath) + if inOrder.panicMsg != "" { + t.Fatalf("in_order_dataflow should tolerate late arrival, got panic: %s", inOrder.panicMsg) + } + if inOrder.memValue != 42 || inOrder.retValue != 1 { + t.Fatalf("in_order_dataflow wrong result: mem=%d ret=%d want mem=42 ret=1", inOrder.memValue, inOrder.retValue) + } +} diff --git a/test/testbench/relu/main.go b/test/testbench/relu/main.go index e47dd49..1ff2085 100644 --- a/test/testbench/relu/main.go +++ b/test/testbench/relu/main.go @@ -2,47 +2,36 @@ package main import ( "fmt" - "log/slog" "os" + 
"path/filepath" + "runtime" + "strings" "github.com/sarchlab/akita/v4/sim" - "github.com/sarchlab/zeonica/api" - "github.com/sarchlab/zeonica/config" "github.com/sarchlab/zeonica/core" + "github.com/sarchlab/zeonica/runtimecfg" ) -func Relu() { - width := 4 - height := 4 - - engine := sim.NewSerialEngine() - - driver := api.DriverBuilder{}. - WithEngine(engine). - WithFreq(1 * sim.GHz). - Build("Driver") - - device := config.DeviceBuilder{}. - WithEngine(engine). - WithFreq(1 * sim.GHz). - WithWidth(width). - WithHeight(height). - Build("Device") - - driver.RegisterDevice(device) - - programPath := "test/testbench/relu/relu.yaml" - - // preload data - - data := []int32{1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, 14, -15, 16, 17, 18, 19, 20, -21, 22, 23, 24, -25, 26, 27, 28, -29, 30, -31, 32} // length is 32 - - for i := 0; i < len(data); i++ { - driver.PreloadMemory(3, 2, uint32(data[i]), uint32(i)) +// Relu runs the ReLU testbench on the configured runtime. +// +//nolint:gocyclo +func Relu(rt *runtimecfg.Runtime) int { + width := rt.Config.Columns + height := rt.Config.Rows + + driver := rt.Driver + device := rt.Device + engine := rt.Engine + + programPath := strings.TrimSpace(os.Getenv("ZEONICA_PROGRAM_YAML")) + if programPath == "" { + if _, err := os.Stat("relu.yaml"); err == nil { + programPath = "relu.yaml" + } else { + programPath = "relu/relu.yaml" + } } - program := core.LoadProgramFileFromYAML(programPath) - fmt.Println("program:", program) if len(program) == 0 { @@ -58,43 +47,132 @@ func Relu() { } } + // preload input data at tile (3,2): 32 int32 values + inputData := []int32{1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, 14, -15, 16, 17, 18, 19, 20, -21, 22, 23, 24, -25, 26, 27, 28, -29, 30, -31, 32} + for i := 0; i < len(inputData); i++ { + driver.PreloadMemory(3, 2, uint32(inputData[i]), uint32(i)) + } + // fire all the cores in the beginning for x := 0; x < width; x++ { for y := 0; y < height; y++ { tile := device.GetTile(x, y) - // convert 
to tileCore tickingComponent := tile.GetTickingComponent() engine.Schedule(sim.MakeTickEvent(tickingComponent, 0)) } } - // TODO: Add PreloadMemory calls if needed for relu test - // driver.PreloadMemory(x, y, data, baseAddr) - driver.Run() fmt.Println("========================") fmt.Println("========================") fmt.Println("========================") - // get memory values in (1,3) from 0x0-0x31 - for i := 0; i < 32; i++ { - value := driver.ReadMemory(1, 3, uint32(i)) - fmt.Println("memory[", i, "]:", value) + // output tile (1,3), 32 elements + outputTile := [2]int{1, 3} + scanLimit := 32 + fmt.Printf("output memory @ tile (%d,%d):\n", outputTile[0], outputTile[1]) + outputData := make([]uint32, scanLimit) + for addr := 0; addr < scanLimit; addr++ { + val := driver.ReadMemory(outputTile[0], outputTile[1], uint32(addr)) + outputData[addr] = val + fmt.Printf(" addr %d -> %d\n", addr, val) + } + + expected := computeReLU(inputData) + fmt.Println("expected ReLU (CPU):") + reluMismatch := 0 + for i, val := range expected { + fmt.Printf(" addr %d -> %d\n", i, val) + if i < len(outputData) && outputData[i] != val { + reluMismatch++ + } + } + if reluMismatch == 0 { + fmt.Println("✅ output matches expected ReLU") + } else { + fmt.Printf("❌ output mismatches ReLU: %d\n", reluMismatch) + } + return reluMismatch +} + +func computeReLU(input []int32) []uint32 { + out := make([]uint32, len(input)) + for i, v := range input { + if v > 0 { + out[i] = uint32(v) + } else { + out[i] = 0 + } + } + return out +} + +func resolveArchSpecPath() (string, error) { + fromEnv := strings.TrimSpace(os.Getenv("ZEONICA_ARCH_SPEC")) + if fromEnv != "" { + if _, err := os.Stat(fromEnv); err == nil { + return fromEnv, nil + } + return "", fmt.Errorf("ZEONICA_ARCH_SPEC points to a missing file: %s", fromEnv) + } + + candidates := []string{ + "test/arch_spec/arch_spec.yaml", + "../../arch_spec/arch_spec.yaml", } + + if _, thisFile, _, ok := runtime.Caller(0); ok { + candidates = 
append(candidates, + filepath.Clean(filepath.Join(filepath.Dir(thisFile), "..", "..", "arch_spec", "arch_spec.yaml")), + ) + } + + seen := make(map[string]struct{}, len(candidates)) + normalized := make([]string, 0, len(candidates)) + for _, candidate := range candidates { + clean := filepath.Clean(candidate) + if _, exists := seen[clean]; exists { + continue + } + seen[clean] = struct{}{} + normalized = append(normalized, clean) + if _, err := os.Stat(clean); err == nil { + return clean, nil + } + } + + return "", fmt.Errorf("cannot locate arch spec, tried: %s", strings.Join(normalized, ", ")) } func main() { - f, err := os.Create("relu.json.log") + const testName = "relu" + + archSpecPath, err := resolveArchSpecPath() if err != nil { panic(err) } - defer f.Close() - handler := slog.NewJSONHandler(f, &slog.HandlerOptions{ - Level: core.LevelTrace, - }) + rt, err := runtimecfg.LoadRuntime(archSpecPath, testName) + if err != nil { + panic(err) + } + + traceLog, err := rt.InitTraceLogger(core.LevelTrace) + if err != nil { + panic(err) + } + + mismatch := Relu(rt) - slog.SetDefault(slog.New(handler)) - Relu() + if err := runtimecfg.CloseTraceLog(traceLog); err != nil { + panic(err) + } + + passed := mismatch == 0 + reportPath, err := rt.GenerateSaveAndPrintReport(5, &passed, &mismatch) + if err != nil { + panic(err) + } + fmt.Printf("report saved: %s\n", reportPath) } diff --git a/tool/viz/README.md b/tool/viz/README.md index 6f6c54c..7af721d 100644 --- a/tool/viz/README.md +++ b/tool/viz/README.md @@ -1,6 +1,9 @@ # CGRA Log Viewer -This viewer visualizes JSONL traces like `gemm.json.log` with a cycle slider and playback. 
+This viewer has two synchronized views: + +- Timeline replay for JSONL traces (cycle slider + playback) +- **Strict Timing Offset View** (program YAML + trace correlation by op ID) ## Run @@ -13,13 +16,155 @@ python3 -m http.server 8000 Open: ```text -http://localhost:8000/viz/ +http://localhost:8000/tool/viz/ ``` -It will try to load `../gemm.json.log` automatically. You can also load any other trace from the file picker. +The page tries to auto-load files (example): + +- `../gemm.json.log` (trace) +- `../gemm.yaml` (program) + +If not found, use file pickers manually. + +## Inputs + +You need both files to get strict timing comparison: + +- **Trace log**: JSONL with `Inst` events (`X`, `Y`, `ID`, `Time`) +- **Program YAML**: includes `array_config.compiled_ii`, per-core operations with `id` and `time_step` + +Optional aggregate report input: + +- **Report JSON**: generated report (for example `fir.report.json`) with `grid`, global counters, per-tile `utilizationPct`, and `topHotTiles`. +- Report can be loaded independently from trace/yaml for quick utilization review. +- Backpressure metrics are supported when report includes runtime `Backpressure` events: + - `backpressureCount`: total downstream backpressure hits (`SendBufBusy`) + - `backpressureCycles`: cycles containing at least one backpressure hit + - `tiles[].backpressureCount` and `topBackpressureTiles` + +## Strict Timing Offset View + +Layout behavior: + +- Top grid uses hybrid adaptation based on detected grid size: + - If YAML provides `array_config.columns/rows`, mesh size uses YAML array bounds first. + - If YAML is unavailable, bounds are inferred from trace events. + - Prefer fitting into current canvas viewport by scaling tile/gap. + - If tile would become too small, switch to expanded `viewBox` to keep readability. +- Top mesh supports free zoom/pan (wheel zoom + drag pan) for large arrays. 
+- If report is loaded, mesh adds a utilization heat overlay from `tiles[].utilizationPct` (missing tiles treated as 0). +- Active tiles render per-cycle summary text in-tile: + - `OP:` instruction opcode summary + - `MEM:` direct memory behaviors (e.g. `LoadDirect` / `StoreDirect`) + - `RX:` / `TX:` data snippets from `Send` / `Recv` / `FeedIn` / `Collect` +- DataFlow links keep pulse animation and include inline data labels from trace `Data` fields (deduplicated for same path/data in one cycle). +- Bottom timing view uses a timeline axis (`Y=core`, `X=cycle`) + drilldown. + +Report view: + +- Report panel shows summary cards: `totalCycles`, `activeCyclesGlobal`, `idleCyclesGlobal`, `passed`, `mismatchCount`, `activeTileCount`, `totalEvents`. +- Hot-tile table shows ranked `coord`, `utilizationPct`, `activeCycles`, `totalEvents`. +- Backpressure section shows ranked `coord`, `bp-count` from `topBackpressureTiles` (if available). +- If report grid and current mesh grid differ, viewer shows a warning; overlay is clipped to current mesh bounds. + +Timing view layout: + +- One lane per core `(x,y)` with: + - upper sub-row blocks: Expected slots + - lower sub-row blocks: Actual slots (all samples in full trace) +- Timeline blocks are expanded by **all actual samples across full trace length** (not just first occurrence). +- For each actual sample occurrence, expected block is back-computed and aligned by that sample's delta. +- Fractional `Time` values are rounded with `Math.round` before slot/time comparison and rendering. 
+- `baseline-view` supports: + - `strict`: strict baseline only + - `compensated`: compensated baseline only + - `split`: strict + compensated side-by-side rows for comparison +- Mismatch blocks/links are drawn as rectangles (not points) for slot-level readability +- Drilldown panel still shows operation-level details for selected `(core, slot)` +- `window-start` + `window-size` let you pan/zoom through full trace cycles +- Optional IO waveform expansion supports multiple cores: + - `io-wave-all`: expand waveform rows for all visible cores + - `io-wave-core`: multi-select a subset of cores to expand + - double-click Y-axis core label: toggle that core's IO wave quickly +- Expanded IO rows are bus-style waveform segments (trapezoid/diamond transition with parallel top and bottom edges): + - `IN` row: DataFlow values from `FeedIn(to tile)` and `Recv(dst tile)` + - `OUT` row: DataFlow values from `Send(src tile)` and `Collect(from tile)` +- IO waveform values are rendered in signed decimal; when multiple values occur in one cycle, the waveform label shows a compact summary and full values remain in tooltip. + +Default view (hybrid as main): + +- **Default** is `baseline-view=compensated` and `comp-model=hybrid`. The timeline and anomaly filter use **hybrid** status only, so you focus on "mid-trace" offsets after subtracting expected propagation delay; strict remains in summary and drilldown for reference. +- Use `strict` or `split` only when you want to double-check raw schedule vs trace or debug compiler/schedule issues. + +Default interaction: + +- `anomaly-only` is disabled by default; when in compensated view it filters by **hybrid** status (not strict). +- `show-phase-explain` is enabled by default to expose per-core phase offsets +- `boundary-only` can focus edge PEs to verify boundary shift patterns quickly +- **Jump to first hybrid mismatch** button moves the time window to the first cycle where any op is a hybrid mismatch. 
+- `Ctrl + mouse wheel` zooms timeline quickly (X/Y together). Zoom anchor follows mouse position on X-axis to reduce view jump. +- `y-zoom` slider adjusts lane height/readability; `Reset Zoom` restores default zoom and window. +- `comp-model` supports: + - `distance-heuristic`: infer propagation delay from core-to-ingress distance + - `trace-fitted-phase`: use per-core fitted phase (`modeDelta`) + - `hybrid`: prefer fitted when confidence is high, otherwise fall back to distance (default) +- Click a timeline block/link to inspect operation-level details in drilldown +- Drilldown now includes sample source fields so each match can be traced back to `Inst` / `LoadDirect` / `StoreDirect`. +- Core focus supports two synced entry points: click Y-axis core label, or select from `core-focus` dropdown. +- When a core is focused, the main timeline keeps only that core and an inline mini panel shows source distribution plus a compact in-window trace list. +- Y-axis label interaction is split: + - single-click: focus/unfocus core for main timeline + - double-click: toggle IO waveform expansion for that core +- `Export PNG` downloads the current timeline window +- `max-side` controls export scaling upper bound; oversized windows are proportionally downscaled +- For repeated op executions, timeline labels/tooltips include occurrence tag (e.g. `@2` or `[2/5]`). 
+ +Status semantics: + +- **Strict baseline (truth reference, unchanged):** + - `on-time`: `actualSlot == expectedSlot` + - `early`: `actualSlot < expectedSlot` (signed modular delta) + - `late`: `actualSlot > expectedSlot` + - `missing`: operation exists in YAML but no `Inst` with same `(x,y,id)` in trace +- **Compensated baseline (explanation layer):** + - strict delta is rebased by per-core compensation offset + - used to reduce global boundary propagation shift false-positives + - never replaces strict verdict; always shown as secondary comparison + +Phase explanation layer (additive, does not change strict status): + +- `Δcore`: dominant per-core phase offset inferred from mismatch mode (`modeDelta`) +- `conf`: confidence of that offset from mismatch concentration +- `phase(boundary, inner, gap)`: weighted-median phase summary comparing boundary vs inner cores +- `deltaRebased`: per-op delta after subtracting `Δcore` (for separating global shift from local residual anomalies) + +Shift-aware annotations: + +- `first-divergence`: first mismatch point or delta-change point in a core +- `propagated`: same-delta continuation after divergence (faded style) + +Drilldown fields: + +- `opId`, `opcode` +- `expectedSlot`, `actualSlot` +- `deltaStrict` +- `deltaComp()` +- `statusStrict` / `statusComp` +- `deltaPhaseRebased` +- `firstTime` +- `samples` +- `sourceSummary` (for example `Inst*10,LoadDirect*2`) +- `firstDivergence` +- `samplePreview` with source tags (for example `1:210:Inst,2:213:LoadDirect`) + +Recommended read path: + +1. Use default **compensated + hybrid** view to see whether there are any mid-trace offsets (hybrid mismatch). Use "Jump to first hybrid mismatch" to focus the window on the first such cycle. +2. In drilldown, read `statusComp` / `deltaComp(hybrid)` first; treat `statusStrict` / `deltaStrict` as reference only for double-check. +3. 
If you need to verify raw schedule vs trace, switch to `strict` or `split` and compare; strict is the truth reference for pass/fail. -## Supported events +## Supported event families (timeline view) - `DataFlow` (`FeedIn`, `Send`, `Recv`, `Collect`) -- `Inst` (`DATA_MOV`, `MUL_ADD`, `STORE`) -- `Memory` (`StoreDirect`) +- `Inst` (generic instruction events) +- `Memory` (e.g., `StoreDirect`) diff --git a/tool/viz/__pycache__/run_viz.cpython-313.pyc b/tool/viz/__pycache__/run_viz.cpython-313.pyc new file mode 100644 index 0000000..f91c781 Binary files /dev/null and b/tool/viz/__pycache__/run_viz.cpython-313.pyc differ diff --git a/tool/viz/app.js b/tool/viz/app.js index 13d7c56..ba65ad5 100644 --- a/tool/viz/app.js +++ b/tool/viz/app.js @@ -1,6 +1,7 @@ const state = { events: [], byTime: new Map(), + timeKeys: [], minTime: 0, maxTime: 0, currentTime: 0, @@ -12,9 +13,52 @@ const state = { showInst: true, showMemory: true, showLabels: true, + programSpec: null, + yamlGridBounds: null, + reportSpec: null, + reportReady: false, + reportError: "", + reportHeatMetric: "utilizationPct", + timingRows: [], + timingColumns: [], + timingReady: false, + layoutMode: "fit", + timingAnomalyOnly: false, + timingSelectedCell: null, + timingFocusedCoreKey: null, + showPhaseExplain: true, + timingBoundaryOnly: false, + timingBaselineView: "compensated", + timingCompModel: "hybrid", + timingIoWaveExpandAll: false, + timingIoWaveExpandedCoreKeys: new Set(), + timingWindowStart: 0, + timingWindowSize: 120, + timingZoomX: 1, + timingZoomY: 1, + timingViewport: null, + firstHybridMismatchTime: null, + coreIoWaveByTime: new Map(), + stepLock: false, }; const layout = { + baseWidth: 940, + baseHeight: 620, + baseTileSize: 100, + baseGap: 24, + baseDriverOffset: 52, + marginLeft: 170, + marginRight: 92, + marginTop: 90, + marginBottom: 88, + minTileSize: 28, + maxTileSize: 124, + minReadableTile: 36, + minGap: 7, + maxGap: 28, + minDriverOffset: 20, + maxDriverOffset: 66, width: 940, height: 
620, originX: 170, @@ -31,11 +75,15 @@ const colors = { Collect: "#8338ec", Inst: "#f77f00", Memory: "#d62828", + Backpressure: "#b91c1c", }; const svg = d3.select("#canvas"); +let sceneRoot; let staticLayer; let dynamicLayer; +let meshZoomBehavior = null; +let meshZoomTransform = d3.zoomIdentity; const controls = { playBtn: document.getElementById("playBtn"), @@ -49,14 +97,142 @@ const controls = { showMemory: document.getElementById("showMemory"), showLabels: document.getElementById("showLabels"), fileInput: document.getElementById("fileInput"), + yamlInput: document.getElementById("yamlInput"), + reportInput: document.getElementById("reportInput"), statsLine: document.getElementById("statsLine"), eventDump: document.getElementById("eventDump"), + reportSummary: document.getElementById("reportSummary"), + reportHotTiles: document.getElementById("reportHotTiles"), + reportWarning: document.getElementById("reportWarning"), + timingSummary: document.getElementById("timingSummary"), + timingGrid: document.getElementById("timingGrid"), + timingAnomalyOnly: document.getElementById("timingAnomalyOnly"), + timingShowPhaseExplain: document.getElementById("timingShowPhaseExplain"), + timingBoundaryOnly: document.getElementById("timingBoundaryOnly"), + timingCoreFocus: document.getElementById("timingCoreFocus"), + timingIoWaveAll: document.getElementById("timingIoWaveAll"), + timingIoWaveCore: document.getElementById("timingIoWaveCore"), + timingBaselineView: document.getElementById("timingBaselineView"), + timingCompModel: document.getElementById("timingCompModel"), + timingWindowStart: document.getElementById("timingWindowStart"), + timingWindowSize: document.getElementById("timingWindowSize"), + timingWindowStartLabel: document.getElementById("timingWindowStartLabel"), + timingWindowSizeLabel: document.getElementById("timingWindowSizeLabel"), + timingZoomY: document.getElementById("timingZoomY"), + timingZoomYLabel: document.getElementById("timingZoomYLabel"), + 
timingResetZoom: document.getElementById("timingResetZoom"), + timingExportPng: document.getElementById("timingExportPng"), + timingExportMaxSide: document.getElementById("timingExportMaxSide"), + timingJumpFirstMismatch: document.getElementById("timingJumpFirstMismatch"), + timingDrilldown: document.getElementById("timingDrilldown"), + timingCoreMini: document.getElementById("timingCoreMini"), + meshLegend: document.getElementById("meshLegend"), + vizPanel: document.querySelector(".panel.viz"), }; +let timingCoreLabelClickTimer = null; function tileKey(x, y) { return `${x},${y}`; } +function clamp(value, min, max) { + return Math.max(min, Math.min(max, value)); +} + +function normalizeCycleTime(value, fallback = 0) { + const numeric = Math.round(Number(value)); + return Number.isFinite(numeric) ? numeric : fallback; +} + +function nextIndexedTime(current, direction) { + const dir = direction >= 0 ? 1 : -1; + const keys = Array.isArray(state.timeKeys) ? state.timeKeys : []; + const cur = normalizeCycleTime(current, state.minTime); + if (keys.length === 0) { + const target = cur + dir; + return clamp(target, state.minTime, state.maxTime); + } + const exactIdx = keys.indexOf(cur); + if (exactIdx >= 0) { + const nextIdx = clamp(exactIdx + dir, 0, keys.length - 1); + return keys[nextIdx]; + } + if (dir > 0) { + for (const t of keys) { + if (t > cur) return t; + } + return keys[keys.length - 1]; + } + for (let i = keys.length - 1; i >= 0; i -= 1) { + if (keys[i] < cur) return keys[i]; + } + return keys[0]; +} + +function resolveTargetViewport() { + const hostWidth = controls.vizPanel?.clientWidth || layout.baseWidth; + const width = Math.max(720, Math.round(hostWidth) - 8); + const height = Math.max(480, Math.round(width * (layout.baseHeight / layout.baseWidth))); + return { width, height }; +} + +function applyAdaptiveLayout() { + const cols = Math.max(1, state.maxX + 1); + const rows = Math.max(1, state.maxY + 1); + const { width: targetWidth, height: targetHeight } = 
resolveTargetViewport(); + + const contentW = Math.max(1, targetWidth - layout.marginLeft - layout.marginRight); + const contentH = Math.max(1, targetHeight - layout.marginTop - layout.marginBottom); + const baseGridW = cols * layout.baseTileSize + (cols - 1) * layout.baseGap; + const baseGridH = rows * layout.baseTileSize + (rows - 1) * layout.baseGap; + const fitScale = Math.min(contentW / baseGridW, contentH / baseGridH); + const boundedScale = clamp(fitScale, 0.2, 1.45); + + let tileSize = clamp( + Math.round(layout.baseTileSize * boundedScale), + layout.minTileSize, + layout.maxTileSize, + ); + let gap = clamp(Math.round(layout.baseGap * boundedScale), layout.minGap, layout.maxGap); + let driverOffset = clamp( + Math.round(layout.baseDriverOffset * boundedScale), + layout.minDriverOffset, + layout.maxDriverOffset, + ); + let mode = "fit"; + if (tileSize < layout.minReadableTile) { + mode = "expand"; + tileSize = layout.minReadableTile; + const readableScale = tileSize / layout.baseTileSize; + gap = clamp(Math.round(layout.baseGap * readableScale), layout.minGap, layout.maxGap); + driverOffset = clamp( + Math.round(layout.baseDriverOffset * readableScale), + layout.minDriverOffset, + layout.maxDriverOffset, + ); + } + + const gridW = cols * tileSize + (cols - 1) * gap; + const gridH = rows * tileSize + (rows - 1) * gap; + const neededW = layout.marginLeft + gridW + layout.marginRight; + const neededH = layout.marginTop + gridH + layout.marginBottom; + const width = mode === "expand" ? Math.max(targetWidth, neededW) : targetWidth; + const height = mode === "expand" ? 
Math.max(targetHeight, neededH) : targetHeight; + + const freeW = width - layout.marginLeft - layout.marginRight - gridW; + const freeH = height - layout.marginTop - layout.marginBottom - gridH; + layout.width = width; + layout.height = height; + layout.tileSize = tileSize; + layout.gap = gap; + layout.driverOffset = driverOffset; + layout.originX = layout.marginLeft + Math.max(0, Math.floor(freeW / 2)); + layout.originY = layout.marginTop + Math.max(0, Math.floor(freeH / 2)); + state.layoutMode = mode; + + svg.attr("viewBox", `0 0 ${layout.width} ${layout.height}`); +} + function tileRect(x, y) { const step = layout.tileSize + layout.gap; const px = layout.originX + x * step; @@ -81,100 +257,2159 @@ function parseEndpoint(name) { const driverMatch = /^Driver\.Device(North|South|East|West)\[(\d+)\]$/.exec(name); if (driverMatch) { return { - kind: "driver", - side: driverMatch[1], - idx: Number(driverMatch[2]), - raw: name, + kind: "driver", + side: driverMatch[1], + idx: Number(driverMatch[2]), + raw: name, + }; + } + return { kind: "unknown", raw: name }; +} + +function endpointPoint(ep) { + if (!ep) { + return null; + } + if (ep.kind === "tilePort") { + const r = tileRect(ep.x, ep.y); + if (ep.port === "North") return { x: r.x + r.w / 2, y: r.y, tile: tileKey(ep.x, ep.y) }; + if (ep.port === "South") return { x: r.x + r.w / 2, y: r.y + r.h, tile: tileKey(ep.x, ep.y) }; + if (ep.port === "West") return { x: r.x, y: r.y + r.h / 2, tile: tileKey(ep.x, ep.y) }; + if (ep.port === "East") return { x: r.x + r.w, y: r.y + r.h / 2, tile: tileKey(ep.x, ep.y) }; + } + if (ep.kind === "driver") { + const side = ep.side; + const idx = ep.idx; + if (side === "North" && idx <= state.maxX) { + const r = tileRect(idx, state.maxY); + return { x: r.x + r.w / 2, y: r.y - layout.driverOffset }; + } + if (side === "South" && idx <= state.maxX) { + const r = tileRect(idx, 0); + return { x: r.x + r.w / 2, y: r.y + r.h + layout.driverOffset }; + } + if (side === "West" && idx <= 
state.maxY) { + const r = tileRect(0, idx); + return { x: r.x - layout.driverOffset, y: r.y + r.h / 2 }; + } + if (side === "East" && idx <= state.maxY) { + const r = tileRect(state.maxX, idx); + return { x: r.x + r.w + layout.driverOffset, y: r.y + r.h / 2 }; + } + } + return null; +} + +function normalizePortName(value) { + const raw = String(value || "").trim().toLowerCase(); + if (raw === "north" || raw === "n") return "North"; + if (raw === "south" || raw === "s") return "South"; + if (raw === "east" || raw === "e") return "East"; + if (raw === "west" || raw === "w") return "West"; + return null; +} + +function oppositePort(port) { + if (port === "North") return "South"; + if (port === "South") return "North"; + if (port === "East") return "West"; + if (port === "West") return "East"; + return null; +} + +function tilePortEndpoint(x, y, port) { + return { + kind: "tilePort", + x, + y, + port, + raw: `Device.Tile[${y}][${x}].Core.${port}`, + }; +} + +function driverEndpoint(side, idx) { + return { + kind: "driver", + side, + idx, + raw: `Driver.Device${side}[${idx}]`, + }; +} + +function neighborEndpointFromTilePort(x, y, port) { + const opposite = oppositePort(port); + if (!opposite) return null; + if (port === "North") { + const ny = y + 1; + if (ny <= state.maxY) return tilePortEndpoint(x, ny, opposite); + return driverEndpoint("North", x); + } + if (port === "South") { + const ny = y - 1; + if (ny >= 0) return tilePortEndpoint(x, ny, opposite); + return driverEndpoint("South", x); + } + if (port === "East") { + const nx = x + 1; + if (nx <= state.maxX) return tilePortEndpoint(nx, y, opposite); + return driverEndpoint("East", y); + } + if (port === "West") { + const nx = x - 1; + if (nx >= 0) return tilePortEndpoint(nx, y, opposite); + return driverEndpoint("West", y); + } + return null; +} + +function inferBounds(events) { + let maxX = 0; + let maxY = 0; + for (const e of events) { + if (Number.isInteger(e.X)) maxX = Math.max(maxX, e.X); + if 
(Number.isInteger(e.Y)) maxY = Math.max(maxY, e.Y); + for (const f of ["Src", "Dst", "From", "To"]) { + if (!e[f]) continue; + const ep = parseEndpoint(e[f]); + if (ep && ep.kind === "tilePort") { + maxX = Math.max(maxX, ep.x); + maxY = Math.max(maxY, ep.y); + } + } + } + return { maxX, maxY }; +} + +function boundsFromProgramSpec(programSpec) { + if (!programSpec) return null; + const cols = Number(programSpec.arrayColumns); + const rows = Number(programSpec.arrayRows); + if (!Number.isFinite(cols) || !Number.isFinite(rows) || cols <= 0 || rows <= 0) return null; + return { + maxX: Math.max(0, Math.round(cols) - 1), + maxY: Math.max(0, Math.round(rows) - 1), + }; +} + +function boundsFromReportSpec(reportSpec) { + if (!reportSpec?.grid) return null; + const width = Number(reportSpec.grid.width); + const height = Number(reportSpec.grid.height); + if (!Number.isFinite(width) || !Number.isFinite(height) || width <= 0 || height <= 0) return null; + return { + maxX: Math.max(0, Math.round(width) - 1), + maxY: Math.max(0, Math.round(height) - 1), + }; +} + +function resolveMeshBounds(events) { + const yamlBounds = boundsFromProgramSpec(state.programSpec); + if (yamlBounds) return yamlBounds; + const traceBounds = inferBounds(events); + const hasTraceBounds = traceBounds.maxX > 0 || traceBounds.maxY > 0; + if (hasTraceBounds) return traceBounds; + const reportBounds = boundsFromReportSpec(state.reportSpec); + if (reportBounds) return reportBounds; + return traceBounds; +} + +function parseJsonLines(text) { + const lines = text.split(/\r?\n/).map((s) => s.trim()).filter(Boolean); + const rows = []; + let lastTime = null; + for (const line of lines) { + try { + const obj = JSON.parse(line); + if (obj && Number.isFinite(Number(obj.Time))) { + obj.Time = Math.round(Number(obj.Time)); + lastTime = obj.Time; + rows.push(obj); + continue; + } + // Some memory traces (e.g. LoadDirect/StoreDirect) may omit Time. 
+ // Reuse the latest observed cycle to keep them alignable in strict matching. + if (obj && obj.msg === "Memory" && Number.isFinite(lastTime)) { + obj.Time = lastTime; + rows.push(obj); + } + } catch (_) { + // Ignore malformed lines. + } + } + return rows; +} + +function indexByTime(events) { + const byTime = new Map(); + let minTime = Number.POSITIVE_INFINITY; + let maxTime = Number.NEGATIVE_INFINITY; + for (const e of events) { + const tKey = Math.round(Number(e.Time)); + if (!byTime.has(tKey)) byTime.set(tKey, []); + byTime.get(tKey).push(e); + minTime = Math.min(minTime, tKey); + maxTime = Math.max(maxTime, tKey); + } + if (!Number.isFinite(minTime) || !Number.isFinite(maxTime)) { + minTime = 0; + maxTime = 0; + } + const sortedTimes = [...byTime.keys()].sort((a, b) => a - b); + return { byTime, minTime, maxTime, sortedTimes }; +} + +function normalizeSlot(value, ii) { + const v = Math.round(Number(value)); + if (!Number.isFinite(v)) return 0; + if (ii > 0) { + let slot = v % ii; + if (slot < 0) slot += ii; + return slot; + } + return v; +} + +function signedDelta(actualSlot, expectedSlot, ii) { + if (!Number.isFinite(actualSlot) || !Number.isFinite(expectedSlot)) return null; + if (ii <= 0) return actualSlot - expectedSlot; + const raw = normalizeSlot(actualSlot - expectedSlot, ii); + if (raw === 0) return 0; + return raw <= ii / 2 ? raw : raw - ii; +} + +function sortCore(a, b) { + if (b.y !== a.y) return b.y - a.y; + return a.x - b.x; +} + +function escapeHtml(text) { + return String(text) + .replaceAll("&", "&") + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll('"', """); +} + +function formatDelta(delta) { + if (delta == null || !Number.isFinite(Number(delta))) return "N/A"; + const v = Number(delta); + return `${v >= 0 ? 
"+" : ""}${v}`; +} + +function formatDataAsDecimal(value) { + if (value == null) return null; + const numeric = Number(value); + if (Number.isFinite(numeric)) { + if (Number.isInteger(numeric)) return String(numeric); + return String(numeric); + } + const text = String(value).trim(); + return text.length > 0 ? text : null; +} + +function numberOr(value, fallback = 0) { + const v = Number(value); + return Number.isFinite(v) ? v : fallback; +} + +function integerOr(value, fallback = 0) { + return Math.round(numberOr(value, fallback)); +} + +function nullableBool(value) { + if (typeof value === "boolean") return value; + return null; +} + +function parseReportJson(text) { + let raw; + try { + raw = JSON.parse(text); + } catch (err) { + throw new Error(`Invalid JSON: ${err.message}`); + } + if (!raw || typeof raw !== "object") { + throw new Error("Report root must be an object."); + } + + const tiles = (Array.isArray(raw.tiles) ? raw.tiles : []) + .map((item) => { + const x = integerOr(item?.x, NaN); + const y = integerOr(item?.y, NaN); + if (!Number.isFinite(x) || !Number.isFinite(y)) return null; + const util = Math.max(0, Math.min(100, numberOr(item?.utilizationPct, 0))); + return { + x, + y, + coord: String(item?.coord || `(${x},${y})`), + activeCycles: Math.max(0, integerOr(item?.activeCycles, 0)), + utilizationPct: util, + instCount: Math.max(0, integerOr(item?.instCount, 0)), + sendCount: Math.max(0, integerOr(item?.sendCount, 0)), + recvCount: Math.max(0, integerOr(item?.recvCount, 0)), + memoryCount: Math.max(0, integerOr(item?.memoryCount, 0)), + totalEvents: Math.max(0, integerOr(item?.totalEvents, 0)), + backpressureCount: Math.max(0, integerOr(item?.backpressureCount, 0)), + }; + }) + .filter(Boolean); + + const topHotTiles = (Array.isArray(raw.topHotTiles) ? 
raw.topHotTiles : []) + .map((item) => { + const x = integerOr(item?.x, NaN); + const y = integerOr(item?.y, NaN); + if (!Number.isFinite(x) || !Number.isFinite(y)) return null; + return { + x, + y, + coord: String(item?.coord || `(${x},${y})`), + utilizationPct: Math.max(0, Math.min(100, numberOr(item?.utilizationPct, 0))), + activeCycles: Math.max(0, integerOr(item?.activeCycles, 0)), + totalEvents: Math.max(0, integerOr(item?.totalEvents, 0)), + }; + }) + .filter(Boolean); + const topBackpressureTiles = (Array.isArray(raw.topBackpressureTiles) ? raw.topBackpressureTiles : []) + .map((item) => { + const x = integerOr(item?.x, NaN); + const y = integerOr(item?.y, NaN); + if (!Number.isFinite(x) || !Number.isFinite(y)) return null; + return { + x, + y, + coord: String(item?.coord || `(${x},${y})`), + backpressureCount: Math.max(0, integerOr(item?.backpressureCount, 0)), + }; + }) + .filter(Boolean); + + const gridWidth = Math.max(0, integerOr(raw?.grid?.width, 0)); + const gridHeight = Math.max(0, integerOr(raw?.grid?.height, 0)); + const activeTileCount = Math.max(0, integerOr(raw?.activeTileCount, tiles.length)); + const fallbackHot = [...tiles] + .sort((a, b) => { + if (b.utilizationPct !== a.utilizationPct) return b.utilizationPct - a.utilizationPct; + if (b.activeCycles !== a.activeCycles) return b.activeCycles - a.activeCycles; + return b.totalEvents - a.totalEvents; + }) + .slice(0, 8) + .map((t) => ({ + x: t.x, + y: t.y, + coord: t.coord, + utilizationPct: t.utilizationPct, + activeCycles: t.activeCycles, + totalEvents: t.totalEvents, + })); + const fallbackBackpressure = [...tiles] + .filter((t) => t.backpressureCount > 0) + .sort((a, b) => { + if (b.backpressureCount !== a.backpressureCount) return b.backpressureCount - a.backpressureCount; + if (b.totalEvents !== a.totalEvents) return b.totalEvents - a.totalEvents; + return b.activeCycles - a.activeCycles; + }) + .slice(0, 8) + .map((t) => ({ + x: t.x, + y: t.y, + coord: t.coord, + backpressureCount: 
t.backpressureCount, + })); + + return { + testName: String(raw.testName || ""), + logPath: String(raw.logPath || ""), + grid: { + width: gridWidth, + height: gridHeight, + }, + totalCycles: Math.max(0, integerOr(raw.totalCycles, 0)), + activeCyclesGlobal: Math.max(0, integerOr(raw.activeCyclesGlobal, 0)), + idleCyclesGlobal: Math.max(0, integerOr(raw.idleCyclesGlobal, 0)), + passed: nullableBool(raw.passed), + mismatchCount: raw.mismatchCount == null ? null : Math.max(0, integerOr(raw.mismatchCount, 0)), + instCount: Math.max(0, integerOr(raw.instCount, 0)), + sendCount: Math.max(0, integerOr(raw.sendCount, 0)), + recvCount: Math.max(0, integerOr(raw.recvCount, 0)), + memoryCount: Math.max(0, integerOr(raw.memoryCount, 0)), + totalEvents: Math.max(0, integerOr(raw.totalEvents, 0)), + backpressureCount: Math.max(0, integerOr(raw.backpressureCount, 0)), + backpressureCycles: Math.max(0, integerOr(raw.backpressureCycles, 0)), + activeTileCount, + tiles, + topHotTiles: topHotTiles.length > 0 ? topHotTiles : fallbackHot, + topBackpressureTiles: topBackpressureTiles.length > 0 ? topBackpressureTiles : fallbackBackpressure, + }; +} + +function formatPercent(v) { + if (!Number.isFinite(Number(v))) return "N/A"; + return `${Number(v).toFixed(1)}%`; +} + +function renderReportView() { + if (!controls.reportSummary || !controls.reportHotTiles || !controls.reportWarning) return; + + if (state.reportError) { + controls.reportWarning.textContent = state.reportError; + controls.reportWarning.className = "report-warning error"; + controls.reportSummary.innerHTML = "
Report parse failed. Please provide a valid report JSON.
"; + controls.reportHotTiles.innerHTML = ""; + return; + } + + if (!state.reportReady || !state.reportSpec) { + controls.reportWarning.textContent = "Load a report JSON to see aggregate utilization and hot-tile stats."; + controls.reportWarning.className = "report-warning"; + controls.reportSummary.innerHTML = "
No report loaded.
"; + controls.reportHotTiles.innerHTML = ""; + return; + } + + const report = state.reportSpec; + const cards = [ + ["test", report.testName || "N/A"], + ["passed", report.passed == null ? "N/A" : (report.passed ? "yes" : "no")], + ["mismatch", report.mismatchCount == null ? "N/A" : report.mismatchCount], + ["cycles", report.totalCycles], + ["active(global)", report.activeCyclesGlobal], + ["idle(global)", report.idleCyclesGlobal], + ["active-tiles", report.activeTileCount], + ["events", report.totalEvents], + ["bp-count", report.backpressureCount], + ["bp-cycles", report.backpressureCycles], + ]; + controls.reportSummary.innerHTML = cards.map( + ([k, v]) => `
${escapeHtml(k)}
${escapeHtml(v)}
`, + ).join(""); + + const meshW = state.maxX + 1; + const meshH = state.maxY + 1; + const reportW = integerOr(report.grid?.width, 0); + const reportH = integerOr(report.grid?.height, 0); + if (reportW > 0 && reportH > 0 && (reportW !== meshW || reportH !== meshH)) { + controls.reportWarning.textContent = + `grid mismatch: report=${reportW}x${reportH}, mesh=${meshW}x${meshH}. Heat overlay is clipped to current mesh.`; + controls.reportWarning.className = "report-warning warn"; + } else { + controls.reportWarning.textContent = `report loaded: ${reportW || "?"}x${reportH || "?"}, log=${report.logPath || "N/A"}`; + controls.reportWarning.className = "report-warning"; + } + + const hotTiles = (Array.isArray(report.topHotTiles) ? report.topHotTiles : []).slice(0, 12); + const bpTiles = (Array.isArray(report.topBackpressureTiles) ? report.topBackpressureTiles : []).slice(0, 12); + const sections = []; + if (hotTiles.length > 0) { + const rows = hotTiles.map((tile, idx) => + `${idx + 1}${escapeHtml(tile.coord || `(${tile.x},${tile.y})`)}${escapeHtml(formatPercent(tile.utilizationPct))}${escapeHtml(tile.activeCycles)}${escapeHtml(tile.totalEvents)}`).join(""); + sections.push([ + "
Top Hot Tiles
", + "", + "", + `${rows}`, + "
#coordutilizationactiveCyclesevents
", + ].join("")); + } else { + sections.push("
No hot-tile entries.
"); + } + if (bpTiles.length > 0) { + const bpRows = bpTiles.map((tile, idx) => + `${idx + 1}${escapeHtml(tile.coord || `(${tile.x},${tile.y})`)}${escapeHtml(tile.backpressureCount)}`).join(""); + sections.push([ + "
Top Backpressure Tiles
", + "", + "", + `${bpRows}`, + "
#coordbp-count
", + ].join("")); + } + controls.reportHotTiles.innerHTML = sections.join(""); +} + +function applyReportHeatOverlay() { + if (!staticLayer) return; + const heatTiles = staticLayer.selectAll(".tile-report-heat"); + if (!heatTiles || heatTiles.empty()) return; + + heatTiles + .style("display", "none") + .attr("opacity", 0); + + if (!state.reportReady || !state.reportSpec || state.reportHeatMetric !== "utilizationPct") return; + + const byCore = new Map(); + for (const tile of state.reportSpec.tiles || []) { + byCore.set(tileKey(tile.x, tile.y), Math.max(0, Math.min(100, numberOr(tile.utilizationPct, 0)))); + } + heatTiles.each(function (d) { + const k = tileKey(d.x, d.y); + if (!byCore.has(k)) return; + const util = byCore.get(k); + const alpha = 0.08 + (util / 100) * 0.52; + d3.select(this) + .style("display", null) + .attr("opacity", alpha) + .attr("data-util", util.toFixed(1)); + }); +} + +function loadReport(text) { + try { + state.reportSpec = parseReportJson(text); + state.reportReady = true; + state.reportError = ""; + if (!state.programSpec && state.events.length === 0) { + const rb = boundsFromReportSpec(state.reportSpec); + if (rb) { + state.maxX = rb.maxX; + state.maxY = rb.maxY; + applyAdaptiveLayout(); + drawStaticScene(); + } + } + applyReportHeatOverlay(); + renderReportView(); + } catch (err) { + state.reportSpec = null; + state.reportReady = false; + state.reportError = `Report JSON parse error: ${err.message}`; + applyReportHeatOverlay(); + renderReportView(); + } +} + +function abbrevOpLabel(slot, maxLen) { + const len = maxLen ?? 5; + const occTag = slot.occurrenceTotal > 1 ? `@${slot.sampleIndex}` : ""; + if (slot.opcode && String(slot.opcode).trim()) { + const s = String(slot.opcode).trim(); + const head = s.length <= len ? 
s : s.slice(0, len); + return `${head}${occTag}`; + } + return `#${slot.opId}${occTag}`; +} + +function weightedMedian(samples) { + if (!samples || samples.length === 0) return null; + const sorted = [...samples] + .filter((s) => Number.isFinite(s.value) && Number.isFinite(s.weight) && s.weight > 0) + .sort((a, b) => a.value - b.value); + if (sorted.length === 0) return null; + const total = sorted.reduce((acc, s) => acc + s.weight, 0); + let accWeight = 0; + for (const s of sorted) { + accWeight += s.weight; + if (accWeight >= total / 2) return s.value; + } + return sorted[sorted.length - 1].value; +} + +function boundaryLabel(x, y, bounds) { + const tags = []; + if (y === bounds.maxY) tags.push("N"); + if (y === bounds.minY) tags.push("S"); + if (x === bounds.minX) tags.push("W"); + if (x === bounds.maxX) tags.push("E"); + return tags.length > 0 ? tags.join("") : "Inner"; +} + +function computeDeltaRebased(rawDelta, corePhaseOffset, ii) { + if (!Number.isFinite(rawDelta) || !Number.isFinite(corePhaseOffset)) return null; + return signedDelta(rawDelta - corePhaseOffset, 0, ii); +} + +function summarizeTimingCell(items) { + const statusCounts = { onTime: 0, early: 0, late: 0, missing: 0 }; + let hasFirstDivergence = false; + let propagatedCount = 0; + let maxAbsDelta = 0; + for (const item of items) { + if (item.status === "on-time") statusCounts.onTime += 1; + if (item.status === "early") statusCounts.early += 1; + if (item.status === "late") statusCounts.late += 1; + if (item.status === "missing") statusCounts.missing += 1; + if (item.firstDivergence) hasFirstDivergence = true; + if (item.propagated) propagatedCount += 1; + if (Number.isFinite(item.delta)) { + maxAbsDelta = Math.max(maxAbsDelta, Math.abs(Number(item.delta))); + } + } + const anomalyCount = statusCounts.early + statusCounts.late + statusCounts.missing; + const anomalyScore = + statusCounts.missing * 4 + + statusCounts.late * 3 + + statusCounts.early * 2 + + (hasFirstDivergence ? 
2 : 0) + + (propagatedCount > 0 ? 1 : 0); + let dominantStatus = "on-time"; + if (statusCounts.missing > 0) dominantStatus = "missing"; + else if (statusCounts.late > 0) dominantStatus = "late"; + else if (statusCounts.early > 0) dominantStatus = "early"; + return { + statusCounts, + dominantStatus, + anomalyCount, + anomalyScore, + hasAnomaly: anomalyCount > 0, + hasFirstDivergence, + opCount: items.length, + maxAbsDelta, + }; +} + +function buildTimingHeatmap(view) { + const cells = new Map(); + let maxScore = 1; + for (const c of view.columns) { + for (const slot of view.slots) { + const cellKey = `${c.coreKey}|${slot}`; + const items = view.cellMap.get(cellKey) || []; + const summary = summarizeTimingCell(items); + cells.set(cellKey, { + cellKey, + coreKey: c.coreKey, + x: c.x, + y: c.y, + slot, + ...summary, + }); + maxScore = Math.max(maxScore, summary.anomalyScore); + } + } + return { cells, maxScore }; +} + +function buildPhaseExplain(view) { + const xs = view.columns.map((c) => c.x); + const ys = view.columns.map((c) => c.y); + const bounds = { + minX: xs.length > 0 ? Math.min(...xs) : 0, + maxX: xs.length > 0 ? Math.max(...xs) : 0, + minY: ys.length > 0 ? Math.min(...ys) : 0, + maxY: ys.length > 0 ? Math.max(...ys) : 0, + }; + const coreMap = new Map(); + const boundarySamples = []; + const innerSamples = []; + + for (const c of view.columns) { + const label = boundaryLabel(c.x, c.y, bounds); + const isBoundary = label !== "Inner"; + const phaseOffset = Number.isFinite(c.phaseOffset) ? c.phaseOffset : null; + const confidence = Number.isFinite(c.phaseConfidence) ? 
c.phaseConfidence : 0; + const detail = { + isBoundary, + boundaryLabel: label, + phaseOffset, + phaseConfidence: confidence, + modeCount: c.modeCount, + }; + coreMap.set(c.coreKey, detail); + if (phaseOffset == null) continue; + const sample = { value: phaseOffset, weight: Math.max(1, c.modeCount || 0) }; + if (isBoundary) { + boundarySamples.push(sample); + } else { + innerSamples.push(sample); + } + } + + const boundaryPhase = weightedMedian(boundarySamples); + const innerPhase = weightedMedian(innerSamples); + const phaseGap = Number.isFinite(boundaryPhase) && Number.isFinite(innerPhase) + ? signedDelta(boundaryPhase, innerPhase, view.ii) + : null; + + return { + coreMap, + boundaryPhase, + innerPhase, + phaseGap, + }; +} + +function inferIngressSidesFromTrace(events) { + const sides = new Set(); + for (const e of events) { + if (e.msg !== "DataFlow" || e.Behavior !== "FeedIn" || !e.To) continue; + const ep = parseEndpoint(e.To); + if (ep && ep.kind === "tilePort") { + sides.add(ep.port); + } + } + if (sides.size === 0) { + sides.add("North"); + sides.add("West"); + } + return [...sides]; +} + +function distanceToIngress(x, y, bounds, ingressSides) { + const d = []; + for (const side of ingressSides) { + if (side === "North") d.push(bounds.maxY - y); + if (side === "South") d.push(y - bounds.minY); + if (side === "West") d.push(x - bounds.minX); + if (side === "East") d.push(bounds.maxX - x); + } + if (d.length === 0) return 0; + // In GEMM-like wavefronts, readiness is dominated by the slower upstream stream. + return Math.max(...d); +} + +function statusFromDelta(deltaValue, missing) { + if (missing) return "missing"; + if (!Number.isFinite(deltaValue)) return "missing"; + if (deltaValue === 0) return "on-time"; + return deltaValue < 0 ? 
"early" : "late"; +} + +function getModelOffset(modelItem, compModel) { + if (!modelItem) return null; + if (compModel === "distance") return modelItem.distanceOffset; + if (compModel === "fitted") return modelItem.fittedOffset; + return modelItem.hybridOffset; +} + +function getCompDeltaByModel(slot, compModel) { + if (compModel === "distance") return slot.deltaCompDistance; + if (compModel === "fitted") return slot.deltaCompFitted; + return slot.deltaCompHybrid; +} + +function getCompStatusByModel(slot, compModel) { + if (compModel === "distance") return slot.statusCompDistance; + if (compModel === "fitted") return slot.statusCompFitted; + return slot.statusCompHybrid; +} + +function summarizeModelBoundary(ii, phaseExplain, coreOffsets, modelKey) { + const boundarySamples = []; + const innerSamples = []; + for (const [coreKey, offsetInfo] of coreOffsets.entries()) { + const offset = offsetInfo[modelKey]; + if (!Number.isFinite(offset)) continue; + const meta = phaseExplain.coreMap.get(coreKey); + const weight = Math.max(1, Number(meta?.modeCount || 0)); + const sample = { value: Number(offset), weight }; + if (meta?.isBoundary) boundarySamples.push(sample); + else innerSamples.push(sample); + } + const boundary = weightedMedian(boundarySamples); + const inner = weightedMedian(innerSamples); + const gap = Number.isFinite(boundary) && Number.isFinite(inner) + ? signedDelta(boundary, inner, ii) + : null; + return { boundary, inner, gap }; +} + +function buildCompensationModels(view, phaseExplain, events) { + const ingressSides = inferIngressSidesFromTrace(events); + const xs = view.columns.map((c) => c.x); + const ys = view.columns.map((c) => c.y); + const bounds = { + minX: xs.length > 0 ? Math.min(...xs) : 0, + maxX: xs.length > 0 ? Math.max(...xs) : 0, + minY: ys.length > 0 ? Math.min(...ys) : 0, + maxY: ys.length > 0 ? 
Math.max(...ys) : 0, + }; + + const rawDistancePhaseSamples = []; + for (const c of view.columns) { + const dist = distanceToIngress(c.x, c.y, bounds, ingressSides); + const phase = view.ii > 0 ? normalizeSlot(dist, view.ii) : dist; + rawDistancePhaseSamples.push({ value: phase, weight: 1, coreKey: c.coreKey, rawDist: dist }); + } + const center = weightedMedian(rawDistancePhaseSamples); + const coreOffsets = new Map(); + for (const c of view.columns) { + const phaseMeta = phaseExplain.coreMap.get(c.coreKey) || {}; + const row = rawDistancePhaseSamples.find((v) => v.coreKey === c.coreKey); + const rawPhase = row ? row.value : 0; + const distanceOffset = view.ii > 0 + ? signedDelta(rawPhase, Number.isFinite(center) ? center : 0, view.ii) + : rawPhase - (Number.isFinite(center) ? center : 0); + const fittedOffset = Number.isFinite(phaseMeta.phaseOffset) ? Number(phaseMeta.phaseOffset) : null; + const fittedConfidence = Number.isFinite(phaseMeta.phaseConfidence) ? Number(phaseMeta.phaseConfidence) : 0; + const hybridOffset = (Number.isFinite(fittedOffset) && fittedConfidence >= 0.4) + ? fittedOffset + : distanceOffset; + coreOffsets.set(c.coreKey, { + distanceOffset, + fittedOffset, + hybridOffset, + fittedConfidence, + ingressDistance: row ? 
row.rawDist : 0, + }); + } + + return { + ingressSides, + coreOffsets, + models: { + distance: summarizeModelBoundary(view.ii, phaseExplain, coreOffsets, "distanceOffset"), + fitted: summarizeModelBoundary(view.ii, phaseExplain, coreOffsets, "fittedOffset"), + hybrid: summarizeModelBoundary(view.ii, phaseExplain, coreOffsets, "hybridOffset"), + }, + }; +} + +function alignSlotAtOrAfter(startTime, expectedSlot, ii) { + const t0 = Math.round(Number(startTime || 0)); + if (ii <= 0) return expectedSlot; + const slot = normalizeSlot(expectedSlot, ii); + let offset = slot - normalizeSlot(t0, ii); + if (offset < 0) offset += ii; + return t0 + offset; +} + +function buildTimelineLanes(view, visibleColumns, phaseExplain, compensation) { + const lanes = []; + let minT = Number.POSITIVE_INFINITY; + let maxT = Number.NEGATIVE_INFINITY; + let totalSlots = 0; + + for (const c of visibleColumns) { + const coreMeta = phaseExplain.coreMap.get(c.coreKey) || {}; + const expectedSlots = []; + const actualSlots = []; + const compMeta = compensation.coreOffsets.get(c.coreKey) || null; + for (const item of c.items) { + const samples = (Array.isArray(item.allSamples) && item.allSamples.length > 0) + ? item.allSamples + : [null]; + const occurrenceTotal = samples.length; + for (let sampleIdx = 0; sampleIdx < samples.length; sampleIdx += 1) { + const sample = samples[sampleIdx]; + const hasActual = sample && Number.isFinite(sample.time); + const actualTime = hasActual ? Math.round(sample.time) : null; + const deltaStrict = hasActual ? sample.delta : item.delta; + const statusStrict = hasActual ? sample.status : item.status; + const missing = !hasActual; + + let expectedTime = null; + if (hasActual && Number.isFinite(deltaStrict)) { + // Expand expected per actual occurrence to cover full trace length. 
+ expectedTime = Math.round(actualTime - deltaStrict); + } else if (Number.isFinite(item.firstTime) && Number.isFinite(item.delta)) { + expectedTime = Math.round(item.firstTime - item.delta); + } else if (view.ii > 0) { + expectedTime = alignSlotAtOrAfter(state.minTime, item.expectedSlot, view.ii); + } else { + expectedTime = item.expectedSlot; + } + + const deltaRebased = computeDeltaRebased(deltaStrict, coreMeta.phaseOffset, view.ii); + const deltaCompDistance = computeDeltaRebased(deltaStrict, compMeta?.distanceOffset, view.ii); + const deltaCompFitted = computeDeltaRebased(deltaStrict, compMeta?.fittedOffset, view.ii); + const deltaCompHybrid = computeDeltaRebased(deltaStrict, compMeta?.hybridOffset, view.ii); + const statusCompDistance = statusFromDelta(deltaCompDistance, missing); + const statusCompFitted = statusFromDelta(deltaCompFitted, missing); + const statusCompHybrid = statusFromDelta(deltaCompHybrid, missing); + const cellKey = `${c.coreKey}|${item.expectedSlot}`; + const slot = { + coreKey: c.coreKey, + x: c.x, + y: c.y, + expectedSlot: item.expectedSlot, + opId: item.id, + opcode: item.opcode || "", + status: statusStrict, + statusStrict, + expectedTime, + actualTime, + delta: deltaStrict, + deltaStrict, + deltaRebased, + deltaCompDistance, + deltaCompFitted, + deltaCompHybrid, + statusCompDistance, + statusCompFitted, + statusCompHybrid, + compDistanceOffset: compMeta?.distanceOffset ?? null, + compFittedOffset: compMeta?.fittedOffset ?? null, + compHybridOffset: compMeta?.hybridOffset ?? null, + firstDivergence: item.firstDivergence, + propagated: item.propagated, + cellKey, + sampleIdx, + sampleIndex: sampleIdx + 1, + occurrenceTotal, + samplePred: hasActual ? sample.pred : null, + sampleSource: hasActual ? 
String(sample.source || "Unknown") : null, + }; + expectedSlots.push(slot); + if (Number.isFinite(actualTime)) { + actualSlots.push(slot); + } + + if (Number.isFinite(expectedTime)) { + minT = Math.min(minT, expectedTime); + maxT = Math.max(maxT, expectedTime); + totalSlots += 1; + } + if (Number.isFinite(actualTime)) { + minT = Math.min(minT, actualTime); + maxT = Math.max(maxT, actualTime); + totalSlots += 1; + } + } + } + lanes.push({ + coreKey: c.coreKey, + x: c.x, + y: c.y, + modeDelta: c.modeDelta, + modeCount: c.modeCount, + statusCounts: c.statusCounts, + phaseOffset: coreMeta.phaseOffset ?? null, + phaseConfidence: coreMeta.phaseConfidence ?? 0, + compDistanceOffset: compMeta?.distanceOffset ?? null, + compFittedOffset: compMeta?.fittedOffset ?? null, + compHybridOffset: compMeta?.hybridOffset ?? null, + compFittedConfidence: compMeta?.fittedConfidence ?? 0, + boundaryLabel: coreMeta.boundaryLabel || "Inner", + isBoundary: Boolean(coreMeta.isBoundary), + expectedSlots, + actualSlots, + }); + } + + if (!Number.isFinite(minT) || !Number.isFinite(maxT)) { + minT = state.minTime; + maxT = state.maxTime; + } + minT = Math.min(minT, state.minTime); + maxT = Math.max(maxT, state.maxTime); + if (minT === maxT) maxT = minT + 1; + + return { + lanes, + timeMin: minT, + timeMax: maxT, + totalSlots, + }; +} + +function tickStepByRange(span) { + if (span <= 40) return 2; + if (span <= 120) return 5; + if (span <= 360) return 10; + if (span <= 900) return 25; + return 50; +} + +function renderTimelineSvg(_view, timeline) { + const wrap = controls.timingGrid; + if (!wrap) return; + wrap.innerHTML = ""; + + const baselineView = ["strict", "compensated", "split"].includes(state.timingBaselineView) + ? state.timingBaselineView + : "strict"; + const compModel = ["distance", "fitted", "hybrid"].includes(state.timingCompModel) + ? 
state.timingCompModel + : "hybrid"; + + const fullMin = timeline.timeMin; + const fullMax = timeline.timeMax; + const fullSpan = Math.max(1, fullMax - fullMin + 1); + const minWindow = 1; + const windowSize = clamp( + Math.round(Number(state.timingWindowSize || Math.min(120, fullSpan))), + minWindow, + fullSpan, + ); + const startMax = Math.max(fullMin, fullMax - windowSize + 1); + const windowStart = clamp( + Math.round(Number(state.timingWindowStart || fullMin)), + fullMin, + startMax, + ); + const windowEnd = windowStart + windowSize - 1; + state.timingWindowStart = windowStart; + state.timingWindowSize = windowSize; + + if (controls.timingWindowStart) { + controls.timingWindowStart.min = String(fullMin); + controls.timingWindowStart.max = String(startMax); + controls.timingWindowStart.value = String(windowStart); + controls.timingWindowStart.disabled = fullSpan <= 1; + } + if (controls.timingWindowSize) { + controls.timingWindowSize.min = String(minWindow); + controls.timingWindowSize.max = String(fullSpan); + controls.timingWindowSize.value = String(windowSize); + } + if (controls.timingWindowStartLabel) { + controls.timingWindowStartLabel.textContent = `T${windowStart}-T${windowEnd}`; + } + if (controls.timingWindowSizeLabel) { + controls.timingWindowSizeLabel.textContent = `${windowSize} cycles`; + } + + const zoomY = clamp(Number(state.timingZoomY || 1), 0.6, 4); + state.timingZoomX = 1; + state.timingZoomY = zoomY; + if (controls.timingZoomY) { + controls.timingZoomY.value = String(Math.round(zoomY * 100)); + } + if (controls.timingZoomYLabel) { + controls.timingZoomYLabel.textContent = `${zoomY.toFixed(2)}x`; + } + + const leftPad = 242; + const rightPad = 30; + const topPad = 30; + const bottomPad = 38; + const slotHeight = clamp(Math.round(8 * zoomY), 6, 34); + const subLaneGap = clamp(Math.round(5 * zoomY), 3, 22); + const laneGap = clamp(Math.round(10 * zoomY), 6, 46); + const splitView = baselineView === "split"; + const baseLaneRows = splitView ? 
3 : 2; + const availableCoreKeys = new Set(timeline.lanes.map((lane) => lane.coreKey)); + const selectedIoKeys = new Set([...(state.timingIoWaveExpandedCoreKeys || [])]); + const ioWaveExpandedKeys = state.timingIoWaveExpandAll + ? new Set([...availableCoreKeys]) + : new Set([...selectedIoKeys].filter((key) => availableCoreKeys.has(key))); + const rowStep = slotHeight + subLaneGap; + const laneData = []; + let yCursor = topPad; + for (let idx = 0; idx < timeline.lanes.length; idx += 1) { + const lane = timeline.lanes[idx]; + const hasIoRows = ioWaveExpandedKeys.has(lane.coreKey); + const laneRows = baseLaneRows + (hasIoRows ? 2 : 0); + const laneHeight = laneRows * slotHeight + (laneRows - 1) * subLaneGap; + laneData.push({ + ...lane, + idx, + hasIoRows, + laneRows, + yBase: yCursor, + yExpected: yCursor, + yStrict: yCursor + rowStep, + yComp: yCursor + rowStep * 2, + yIoIn: hasIoRows ? yCursor + rowStep * baseLaneRows : null, + yIoOut: hasIoRows ? yCursor + rowStep * (baseLaneRows + 1) : null, + }); + yCursor += laneHeight + laneGap; + } + const laneCount = Math.max(1, laneData.length); + const plotH = Math.max(1, yCursor - topPad - laneGap); + const wrapWidth = Math.max(860, Math.round(wrap.clientWidth || 0) - 2); + const plotW = Math.max(620, wrapWidth - leftPad - rightPad); + const width = leftPad + plotW + rightPad; + const height = topPad + plotH + bottomPad; + const labelFontSize = clamp(Math.round(slotHeight * 0.72), 7, 13); + const labelMinWidth = Math.max(14, labelFontSize * 2 + 2); + + const svgEl = d3.create("svg") + .attr("id", "timingTimelineSvg") + .attr("class", "timing-timeline-svg") + .attr("viewBox", `0 0 ${width} ${height}`) + .attr("width", width) + .attr("height", height); + + const xScale = d3.scaleLinear() + .domain([windowStart, windowEnd + 1]) + .range([leftPad, leftPad + plotW]); + state.timingViewport = { + fullMin, + fullMax, + fullSpan, + windowStart, + windowSize, + windowEnd, + leftPad, + plotW, + }; + + const ticks = []; + for (let 
t = windowStart; t <= windowEnd; t += 1) ticks.push(t); + + // Cycle boundaries (X): dashed vertical lines only; no other grid + const grid = svgEl.append("g").attr("class", "timeline-grid"); + for (const t of ticks) { + const x = xScale(t); + grid.append("line") + .attr("x1", x).attr("x2", x) + .attr("y1", topPad).attr("y2", topPad + plotH) + .attr("class", "timeline-cycle-sep"); + grid.append("text") + .attr("x", x + 2) + .attr("y", topPad + plotH + 16) + .attr("class", "timeline-tick") + .text(`T${t}`); + } + svgEl.append("line") + .attr("x1", leftPad).attr("x2", leftPad + plotW) + .attr("y1", topPad + plotH).attr("y2", topPad + plotH) + .attr("class", "timeline-axis"); + + // Core boundaries (Y): one dashed/dark line between each core for easier row matching + const coreSep = svgEl.append("g").attr("class", "timeline-core-seps"); + for (let idx = 1; idx < laneCount; idx += 1) { + const y = laneData[idx].yBase; + coreSep.append("line") + .attr("x1", leftPad) + .attr("x2", leftPad + plotW) + .attr("y1", y) + .attr("y2", y) + .attr("class", "timeline-core-sep"); + } + + // Lane labels (no inner sub-row grid lines) + const lanesG = svgEl.append("g").attr("class", "timeline-lanes"); + for (const lane of laneData) { + const modelOffset = compModel === "distance" + ? lane.compDistanceOffset + : (compModel === "fitted" ? lane.compFittedOffset : lane.compHybridOffset); + const phaseText = state.showPhaseExplain + ? ` SΔ=${lane.phaseOffset == null ? "N/A" : formatDelta(lane.phaseOffset)} CΔ=${formatDelta(modelOffset)}` + : ""; + lanesG.append("text") + .attr("x", 8) + .attr("y", lane.yExpected + slotHeight + 1) + .attr("class", [ + "timeline-core-label", + lane.isBoundary ? "boundary" : "", + lane.hasIoRows ? "io-expanded" : "", + state.timingFocusedCoreKey === lane.coreKey ? 
"focused" : "", + ].filter(Boolean).join(" ")) + .attr("data-core-key", lane.coreKey) + .attr("title", `Click: focus core (${lane.x},${lane.y}) | Double-click: toggle IO wave`) + .text(`(${lane.x},${lane.y}) ${lane.boundaryLabel}${phaseText}`); + lanesG.append("text") + .attr("x", leftPad - 64) + .attr("y", lane.yExpected + slotHeight - 1) + .attr("class", "timeline-lane-tag") + .text("E"); + lanesG.append("text") + .attr("x", leftPad - 64) + .attr("y", lane.yStrict + slotHeight - 1) + .attr("class", "timeline-lane-tag") + .text(splitView ? "S" : (baselineView === "strict" ? "A" : "C")); + if (splitView) { + lanesG.append("text") + .attr("x", leftPad - 64) + .attr("y", lane.yComp + slotHeight - 1) + .attr("class", "timeline-lane-tag") + .text("C"); + } + if (lane.hasIoRows) { + lanesG.append("text") + .attr("x", leftPad - 64) + .attr("y", lane.yIoIn + slotHeight - 1) + .attr("class", "timeline-lane-tag timeline-io-tag-in") + .text("IN"); + lanesG.append("text") + .attr("x", leftPad - 64) + .attr("y", lane.yIoOut + slotHeight - 1) + .attr("class", "timeline-lane-tag timeline-io-tag-out") + .text("OUT"); + } + } + + const slotG = svgEl.append("g").attr("class", "timeline-rects"); + const keepSlot = (slot) => { + if (!state.timingAnomalyOnly) return true; + const strictAnomaly = slot.statusStrict !== "on-time"; + const compAnomaly = getCompStatusByModel(slot, compModel) !== "on-time"; + if (baselineView === "strict") return strictAnomaly; + if (baselineView === "compensated") return compAnomaly; + return strictAnomaly || compAnomaly; + }; + const applyStackLayout = (items, keyOf, tieBreakOf) => { + const buckets = new Map(); + for (const item of items) { + const key = keyOf(item); + const arr = buckets.get(key) || []; + arr.push(item); + buckets.set(key, arr); + } + for (const group of buckets.values()) { + group.sort((a, b) => tieBreakOf(a) - tieBreakOf(b)); + const total = group.length; + if (total <= 1) { + group[0].stackIndex = 0; + group[0].stackTotal = 1; + 
continue; + } + for (let i = 0; i < group.length; i += 1) { + group[i].stackIndex = i; + group[i].stackTotal = total; + } + } + }; + const resolveStackGeometry = (baseY, baseH, stackIndex, stackTotal) => { + if (!Number.isFinite(stackTotal) || stackTotal <= 1) { + return { y: baseY, h: baseH }; + } + // Keep all stacked blocks visible within one cycle slot row. + const gap = 1; + const innerH = Math.max(2, Math.floor((baseH - gap * (stackTotal - 1)) / stackTotal)); + const y = baseY + stackIndex * (innerH + gap); + return { y, h: innerH }; + }; + const summarizeWaveValues = (values, maxItems = 2) => { + const arr = Array.isArray(values) ? values : []; + if (arr.length === 0) return ""; + const shown = arr.slice(0, maxItems).map((v) => shortText(v, 7)); + const remain = arr.length - shown.length; + if (remain > 0) shown.push(`+${remain}`); + return shown.join(","); + }; + const ioBusPath = (xLeft, xRight, yTop, yBottom) => { + const yMid = (yTop + yBottom) / 2; + const w = Math.max(1, xRight - xLeft); + const edge = Math.min(5, Math.max(1, Math.round(w * 0.22))); + return [ + `M${xLeft + edge},${yTop}`, + `L${xRight - edge},${yTop}`, + `L${xRight},${yMid}`, + `L${xRight - edge},${yBottom}`, + `L${xLeft + edge},${yBottom}`, + `L${xLeft},${yMid}`, + "Z", + ].join(" "); + }; + const drawIoWaveRow = (lane, yPos, direction) => { + if (!Number.isFinite(yPos)) return; + const byTime = state.coreIoWaveByTime.get(lane.coreKey); + if (!byTime) return; + for (let t = windowStart; t <= windowEnd; t += 1) { + const entry = byTime.get(t); + const values = direction === "in" ? entry?.inVals : entry?.outVals; + if (!Array.isArray(values) || values.length === 0) continue; + const xLeft = xScale(t); + const xRight = xScale(t + 1); + const w = Math.max(1, xRight - xLeft); + slotG.append("path") + .attr("d", ioBusPath(xLeft, xRight, yPos, yPos + slotHeight)) + .attr("class", `timeline-io-bus ${direction === "in" ? 
"timeline-io-bus-in" : "timeline-io-bus-out"}`) + .attr( + "title", + `${direction === "in" ? "Input" : "Output"} core=(${lane.x},${lane.y}) t=${t} values=${values.join(",")}`, + ); + if (w >= labelMinWidth + 6) { + slotG.append("text") + .attr("x", xLeft + w / 2) + .attr("y", yPos + slotHeight / 2) + .attr("text-anchor", "middle") + .attr("dominant-baseline", "middle") + .attr("class", `timeline-io-bus-label ${direction === "in" ? "timeline-io-bus-label-in" : "timeline-io-bus-label-out"}`) + .attr("font-size", labelFontSize) + .text(summarizeWaveValues(values)); + } + } + }; + const drawActualRow = (lane, yPos, rowMode) => { + const drawables = []; + for (const slot of lane.expectedSlots) { + if (!keepSlot(slot)) continue; + const rowStatus = rowMode === "strict" ? slot.statusStrict : getCompStatusByModel(slot, compModel); + let drawTime = null; + let cls = "actual-ok"; + if (rowStatus === "missing") { + drawTime = slot.expectedTime; + cls = "missing"; + } else if (Number.isFinite(slot.actualTime)) { + drawTime = slot.actualTime; + if (rowMode === "strict") { + cls = rowStatus === "on-time" ? "actual-ok" : "actual-bad"; + } else { + cls = rowStatus === "on-time" ? "actual-comp-ok" : "actual-comp-bad"; + } + } + if (!Number.isFinite(drawTime)) continue; + if (drawTime < windowStart || drawTime > windowEnd) continue; + drawables.push({ slot, rowStatus, drawTime, cls }); + } + applyStackLayout( + drawables, + (d) => `${lane.coreKey}|${rowMode}|${d.drawTime}`, + (d) => (d.slot.opId * 10000) + (d.slot.sampleIndex || 0), + ); + for (const d of drawables) { + const { slot, rowStatus, drawTime, cls, stackIndex = 0, stackTotal = 1 } = d; + const x0 = xScale(drawTime); + const x1 = xScale(drawTime + 1); + const w = Math.max(1, Math.floor(x1 - x0 - 1)); + const selected = state.timingSelectedCell === slot.cellKey ? 
"selected" : ""; + const geom = resolveStackGeometry(yPos, slotHeight, stackIndex, stackTotal); + slotG.append("rect") + .attr("x", x0 + 0.5) + .attr("y", geom.y) + .attr("width", w) + .attr("height", geom.h) + .attr("class", `timeline-rect ${cls} ${selected}`) + .attr("data-timing-cell", slot.cellKey) + .attr( + "title", + `${rowMode === "strict" ? "Strict" : `Comp(${compModel})`} #${slot.opId}[${slot.sampleIndex}/${slot.occurrenceTotal}] status=${rowStatus} t=${Number.isFinite(slot.actualTime) ? slot.actualTime : "N/A"} deltaS=${formatDelta(slot.deltaStrict)} deltaC=${formatDelta(getCompDeltaByModel(slot, compModel))}`, + ); + if (w >= labelMinWidth && geom.h >= 8) { + slotG.append("text") + .attr("x", x0 + 0.5 + w / 2) + .attr("y", geom.y + geom.h / 2) + .attr("text-anchor", "middle") + .attr("dominant-baseline", "middle") + .attr("class", "timeline-rect-label timeline-rect-label-actual") + .attr("font-size", labelFontSize) + .text(abbrevOpLabel(slot)); + } + + if (state.timingSelectedCell === slot.cellKey && Number.isFinite(slot.actualTime) && Number.isFinite(slot.expectedTime)) { + const xe = xScale(slot.expectedTime) + Math.max(1, Math.floor((xScale(slot.expectedTime + 1) - xScale(slot.expectedTime)) / 2)); + const xa = xScale(slot.actualTime) + Math.max(1, Math.floor((xScale(slot.actualTime + 1) - xScale(slot.actualTime)) / 2)); + const linkClass = rowMode === "strict" + ? (rowStatus === "on-time" ? "ok" : "bad") + : (rowStatus === "on-time" ? 
"comp-ok" : "comp-bad"); + slotG.append("line") + .attr("x1", xe).attr("y1", lane.yExpected + slotHeight) + .attr("x2", xa).attr("y2", geom.y) + .attr("class", `timeline-link ${linkClass}`); + } + } + }; + + for (const lane of laneData) { + const expectedDrawables = []; + for (const slot of lane.expectedSlots) { + if (!keepSlot(slot)) continue; + if (!Number.isFinite(slot.expectedTime)) continue; + if (slot.expectedTime < windowStart || slot.expectedTime > windowEnd) continue; + expectedDrawables.push({ slot, drawTime: slot.expectedTime }); + } + applyStackLayout( + expectedDrawables, + (d) => `${lane.coreKey}|expected|${d.drawTime}`, + (d) => (d.slot.opId * 10000) + (d.slot.sampleIndex || 0), + ); + for (const d of expectedDrawables) { + const { slot, stackIndex = 0, stackTotal = 1 } = d; + const x0 = xScale(slot.expectedTime); + const x1 = xScale(slot.expectedTime + 1); + const w = Math.max(1, Math.floor(x1 - x0 - 1)); + const selected = state.timingSelectedCell === slot.cellKey ? "selected" : ""; + const geom = resolveStackGeometry(lane.yExpected, slotHeight, stackIndex, stackTotal); + slotG.append("rect") + .attr("x", x0 + 0.5) + .attr("y", geom.y) + .attr("width", w) + .attr("height", geom.h) + .attr("class", `timeline-rect expected ${selected}`) + .attr("data-timing-cell", slot.cellKey) + .attr( + "title", + `Expected #${slot.opId}[${slot.sampleIndex}/${slot.occurrenceTotal}] (${slot.opcode || "N/A"}) t=${slot.expectedTime} deltaS=${formatDelta(slot.deltaStrict)} deltaC=${formatDelta(getCompDeltaByModel(slot, compModel))}`, + ); + if (w >= labelMinWidth && geom.h >= 8) { + slotG.append("text") + .attr("x", x0 + 0.5 + w / 2) + .attr("y", geom.y + geom.h / 2) + .attr("text-anchor", "middle") + .attr("dominant-baseline", "middle") + .attr("class", "timeline-rect-label timeline-rect-label-expected") + .attr("font-size", labelFontSize) + .text(abbrevOpLabel(slot)); + } + } + if (baselineView === "strict") { + drawActualRow(lane, lane.yStrict, "strict"); + } else 
if (baselineView === "compensated") { + drawActualRow(lane, lane.yStrict, "comp"); + } else { + drawActualRow(lane, lane.yStrict, "strict"); + drawActualRow(lane, lane.yComp, "comp"); + } + if (lane.hasIoRows) { + drawIoWaveRow(lane, lane.yIoIn, "in"); + drawIoWaveRow(lane, lane.yIoOut, "out"); + } + } + + // Legend + const legend = svgEl.append("g").attr("class", "timeline-legend").attr("transform", `translate(${leftPad},14)`); + const legendItems = baselineView === "strict" + ? [ + ["Expected slot", "timeline-legend-exp"], + ["Strict on-time", "timeline-legend-act-ok"], + ["Strict mismatch", "timeline-legend-act-bad"], + ["Missing", "timeline-legend-missing"], + ] + : (baselineView === "compensated" + ? [ + ["Expected slot", "timeline-legend-exp"], + [compModel === "hybrid" ? "Hybrid on-time" : `Comp(${compModel}) on-time`, "timeline-legend-comp-ok"], + [compModel === "hybrid" ? "Hybrid mismatch" : `Comp(${compModel}) mismatch`, "timeline-legend-comp-bad"], + ["Missing", "timeline-legend-missing"], + ] + : [ + ["Expected slot", "timeline-legend-exp"], + ["Strict on-time", "timeline-legend-act-ok"], + ["Strict mismatch", "timeline-legend-act-bad"], + [`Comp(${compModel}) on-time`, "timeline-legend-comp-ok"], + [`Comp(${compModel}) mismatch`, "timeline-legend-comp-bad"], + ["Missing", "timeline-legend-missing"], + ]); + if (laneData.some((lane) => lane.hasIoRows)) { + legendItems.push(["IN bus", "timeline-legend-io-in"]); + legendItems.push(["OUT bus", "timeline-legend-io-out"]); + } + const legendGap = 132; + legendItems.forEach((it, i) => { + const gx = i * legendGap; + legend.append("rect").attr("x", gx).attr("y", -4).attr("width", 10).attr("height", 8).attr("class", it[1]); + legend.append("text").attr("x", gx + 14).attr("y", 4).attr("class", "timeline-legend-text").text(it[0]); + }); + + wrap.appendChild(svgEl.node()); +} + +function timelineZoomAnchorTimeFromWheel(event) { + const vp = state.timingViewport; + if (!vp) { + return Number(state.timingWindowStart 
|| 0) + Number(state.timingWindowSize || 1) / 2; + } + const svgElement = document.getElementById("timingTimelineSvg"); + if (!svgElement) { + return vp.windowStart + vp.windowSize / 2; + } + const rect = svgElement.getBoundingClientRect(); + const localX = event.clientX - rect.left; + const ratio = clamp((localX - vp.leftPad) / Math.max(1, vp.plotW), 0, 0.999); + return vp.windowStart + ratio * vp.windowSize; +} + +function handleTimelineCtrlWheelZoom(event) { + if (!event.ctrlKey) return; + if (state.events.length === 0 || !state.programSpec) return; + event.preventDefault(); + + const vp = state.timingViewport || { + fullMin: state.minTime, + fullMax: state.maxTime, + fullSpan: Math.max(1, state.maxTime - state.minTime + 1), + }; + const fullMin = vp.fullMin; + const fullMax = vp.fullMax; + const fullSpan = Math.max(1, vp.fullSpan || (fullMax - fullMin + 1)); + const minWindow = 1; + const oldWindow = Math.max(1, Number(state.timingWindowSize || Math.min(120, fullSpan))); + const oldStart = Number(state.timingWindowStart || fullMin); + const zoomIn = event.deltaY < 0; + const anchorTime = timelineZoomAnchorTimeFromWheel(event); + const nextWindow = clamp( + Math.round(oldWindow * (zoomIn ? 0.88 : 1.14)), + minWindow, + fullSpan, + ); + const anchorRatio = clamp((anchorTime - oldStart) / oldWindow, 0, 1); + const startMax = Math.max(fullMin, fullMax - nextWindow + 1); + const nextStart = clamp( + Math.round(anchorTime - anchorRatio * nextWindow), + fullMin, + startMax, + ); + const factor = zoomIn ? 
1.08 : 1 / 1.08; + state.timingWindowStart = nextStart; + state.timingWindowSize = nextWindow; + state.timingZoomX = 1; + state.timingZoomY = clamp(Number(state.timingZoomY || 1) * factor, 0.6, 4); + renderTimingView(); +} + +function getTimelineSvgSize(svgElement) { + const vb = svgElement.getAttribute("viewBox"); + if (vb) { + const parts = vb.trim().split(/\s+/).map(Number); + if (parts.length === 4 && Number.isFinite(parts[2]) && Number.isFinite(parts[3])) { + return { width: parts[2], height: parts[3] }; + } + } + const width = Number(svgElement.getAttribute("width")) || svgElement.clientWidth || 1200; + const height = Number(svgElement.getAttribute("height")) || svgElement.clientHeight || 800; + return { width, height }; +} + +function parseMaxSide() { + const fallback = 4096; + if (!controls.timingExportMaxSide) return fallback; + const v = Math.round(Number(controls.timingExportMaxSide.value)); + if (!Number.isFinite(v)) return fallback; + return clamp(v, 512, 16000); +} + +function timelineExportCss() { + return ` +.timing-timeline-svg { background: #fffaf0; } +.timeline-axis { stroke: #8f846d; stroke-width: 1; } +.timeline-cycle-sep { stroke: #c4b89a; stroke-width: 1; stroke-dasharray: 3 2; } +.timeline-core-sep { stroke: #7a6f58; stroke-width: 1.2; stroke-dasharray: 4 3; } +.timeline-tick { fill: #7a6f58; font-size: 10px; font-family: ui-monospace, SFMono-Regular, Menlo, monospace; } +.timeline-core-label { fill: #5a5347; font-size: 11px; font-family: ui-monospace, SFMono-Regular, Menlo, monospace; } +.timeline-core-label.boundary { font-weight: 700; } +.timeline-core-label.focused { fill: #1f4eb5; font-weight: 700; text-decoration: underline; } +.timeline-core-label.io-expanded { fill: #6a2b96; text-decoration: underline; text-decoration-style: dashed; } +.timeline-lane-tag { fill: #7f7460; font-size: 10px; font-family: ui-monospace, SFMono-Regular, Menlo, monospace; } +.timeline-io-tag-in { fill: #2d6cdf; font-weight: 700; } +.timeline-io-tag-out { 
fill: #8f2ac7; font-weight: 700; } +.timeline-rect.expected { fill: #f4f4f4; stroke: #8f8f8f; stroke-width: 0.8; } +.timeline-rect.actual-ok { fill: #2a7f62; stroke: #1f604a; stroke-width: 0.7; } +.timeline-rect.actual-bad { fill: #d62828; stroke: #8f1717; stroke-width: 0.7; } +.timeline-rect.actual-comp-ok { fill: #2d6cdf; stroke: #1d4a97; stroke-width: 0.8; opacity: 0.84; } +.timeline-rect.actual-comp-bad { fill: #9b2ce0; stroke: #5d178a; stroke-width: 0.85; opacity: 0.9; } +.timeline-rect.missing { fill: #f4f4f4; stroke: #7a7a7a; stroke-width: 1.1; stroke-dasharray: 2 1; } +.timeline-rect.selected { stroke-width: 1.8; } +.timeline-io-bus { stroke-width: 0.9; } +.timeline-io-bus-in { fill: #deebff; stroke: #7da3ea; } +.timeline-io-bus-out { fill: #f1e1ff; stroke: #b589dd; } +.timeline-io-bus-label { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; pointer-events: none; font-size: 7px; } +.timeline-io-bus-label-in { fill: #214a9c; } +.timeline-io-bus-label-out { fill: #6d2094; } +.timeline-missing { stroke: #7a7a7a; stroke-width: 1.2; } +.timeline-link.ok { stroke: rgba(54, 132, 103, 0.45); stroke-width: 0.9; } +.timeline-link.bad { stroke: rgba(214, 40, 40, 0.72); stroke-width: 1.2; } +.timeline-link.comp-ok { stroke: rgba(45, 108, 223, 0.5); stroke-width: 0.9; stroke-dasharray: 2 1; } +.timeline-link.comp-bad { stroke: rgba(155, 44, 224, 0.78); stroke-width: 1.2; stroke-dasharray: 2 1; } +.timeline-legend-text { fill: #615a4f; font-size: 10px; font-family: ui-monospace, SFMono-Regular, Menlo, monospace; } +.timeline-legend-exp { fill: #fff; stroke: #8c8c8c; } +.timeline-legend-act-ok { fill: #2a7f62; } +.timeline-legend-act-bad { fill: #d62828; } +.timeline-legend-missing { fill: #f4f4f4; stroke: #7a7a7a; stroke-dasharray: 2 1; } +.timeline-legend-comp-ok { fill: #2d6cdf; } +.timeline-legend-comp-bad { fill: #9b2ce0; } +.timeline-legend-io-in { fill: #deebff; stroke: #7da3ea; } +.timeline-legend-io-out { fill: #f1e1ff; stroke: #b589dd; } 
+.timeline-rect-label { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; font-size: 7px; } +.timeline-rect-label-expected { fill: #444; } +.timeline-rect-label-actual { fill: #fff; }`; +} + +function exportTimelinePng() { + const svgElement = document.getElementById("timingTimelineSvg"); + if (!svgElement) return; + const size = getTimelineSvgSize(svgElement); + const maxSide = parseMaxSide(); + const scale = Math.min(1, maxSide / Math.max(size.width, size.height)); + const outW = Math.max(1, Math.round(size.width * scale)); + const outH = Math.max(1, Math.round(size.height * scale)); + + const serializer = new XMLSerializer(); + const clone = svgElement.cloneNode(true); + const styleEl = document.createElementNS("http://www.w3.org/2000/svg", "style"); + styleEl.textContent = timelineExportCss(); + clone.insertBefore(styleEl, clone.firstChild); + let source = serializer.serializeToString(clone); + if (!source.includes("xmlns=\"http://www.w3.org/2000/svg\"")) { + source = source.replace(" { + const canvas = document.createElement("canvas"); + canvas.width = outW; + canvas.height = outH; + const ctx = canvas.getContext("2d"); + if (!ctx) { + URL.revokeObjectURL(url); + return; + } + ctx.fillStyle = "#fffdf7"; + ctx.fillRect(0, 0, outW, outH); + ctx.drawImage(img, 0, 0, outW, outH); + canvas.toBlob((blob) => { + if (!blob) { + URL.revokeObjectURL(url); + return; + } + const dlUrl = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = dlUrl; + a.download = "timeline.png"; + a.click(); + URL.revokeObjectURL(dlUrl); + URL.revokeObjectURL(url); + }, "image/png"); + }; + img.onerror = () => { + URL.revokeObjectURL(url); + }; + img.src = url; +} + +function splitCellKey(cellKey) { + const pivot = cellKey.lastIndexOf("|"); + if (pivot <= 0) return { coreKey: "", slot: 0 }; + return { + coreKey: cellKey.slice(0, pivot), + slot: Number(cellKey.slice(pivot + 1)), + }; +} + +function buildCoreIoWaveByTime(events) { + const byCore = new 
Map(); + const ensure = (coreKey, time) => { + if (!byCore.has(coreKey)) byCore.set(coreKey, new Map()); + const byTime = byCore.get(coreKey); + if (!byTime.has(time)) byTime.set(time, { inVals: [], outVals: [] }); + return byTime.get(time); + }; + for (const e of events) { + if (e.msg !== "DataFlow") continue; + const time = Math.round(Number(e.Time)); + if (!Number.isFinite(time)) continue; + const value = formatDataAsDecimal(e.Data); + if (e.Behavior === "FeedIn" || e.Behavior === "Recv") { + const dst = parseEndpoint(e.Behavior === "FeedIn" ? e.To : e.Dst); + if (dst?.kind !== "tilePort") continue; + const cell = ensure(tileKey(dst.x, dst.y), time); + if (value != null) cell.inVals.push(value); + continue; + } + if (e.Behavior === "Send" || e.Behavior === "Collect") { + const src = parseEndpoint(e.Behavior === "Collect" ? e.From : e.Src); + if (src?.kind !== "tilePort") continue; + const cell = ensure(tileKey(src.x, src.y), time); + if (value != null) cell.outVals.push(value); + } + } + return byCore; +} + +function refreshCoreFocusControl(columns) { + if (!controls.timingCoreFocus) return; + const options = [ + { value: "", label: "All cores" }, + ...columns.map((c) => ({ value: c.coreKey, label: `(${c.x},${c.y})` })), + ]; + controls.timingCoreFocus.innerHTML = options + .map((opt) => ``) + .join(""); + const hasFocused = columns.some((c) => c.coreKey === state.timingFocusedCoreKey); + if (!hasFocused) state.timingFocusedCoreKey = null; + controls.timingCoreFocus.value = state.timingFocusedCoreKey || ""; +} + +function refreshIoWaveCoreControl(columns) { + if (!controls.timingIoWaveCore || !controls.timingIoWaveAll) return; + const validKeys = new Set(columns.map((c) => c.coreKey)); + const expanded = new Set( + [...(state.timingIoWaveExpandedCoreKeys || [])].filter((key) => validKeys.has(key)), + ); + state.timingIoWaveExpandedCoreKeys = expanded; + + const options = columns.map((c) => ({ value: c.coreKey, label: `(${c.x},${c.y})` })); + 
controls.timingIoWaveCore.innerHTML = options + .map((opt) => ``) + .join(""); + const shouldSelectAll = Boolean(state.timingIoWaveExpandAll && columns.length > 0); + if (state.timingIoWaveExpandAll && columns.length === 0) { + state.timingIoWaveExpandAll = false; + } + const selectedKeys = shouldSelectAll + ? new Set(columns.map((c) => c.coreKey)) + : expanded; + for (const opt of controls.timingIoWaveCore.options) { + opt.selected = selectedKeys.has(opt.value); + } + controls.timingIoWaveAll.checked = shouldSelectAll; +} + +function renderTimingDrilldown(view, heatmap, phaseExplain, compensation) { + if (!controls.timingDrilldown) return; + if (!state.timingSelectedCell || !heatmap.cells.has(state.timingSelectedCell)) { + controls.timingDrilldown.innerHTML = + "
Click a timeline mark to inspect operation-level details.
"; + return; + } + const selected = heatmap.cells.get(state.timingSelectedCell); + const { slot } = splitCellKey(state.timingSelectedCell); + const items = view.cellMap.get(state.timingSelectedCell) || []; + const corePhase = phaseExplain.coreMap.get(selected.coreKey); + const coreComp = compensation.coreOffsets.get(selected.coreKey) || null; + const compOffset = getModelOffset(coreComp, state.timingCompModel); + const corePhaseText = corePhase?.phaseOffset == null ? "N/A" : formatDelta(corePhase.phaseOffset); + const coreCompText = compOffset == null ? "N/A" : formatDelta(compOffset); + const confPct = `${Math.round((corePhase?.phaseConfidence || 0) * 100)}%`; + const summary = [ + `core=(${selected.x},${selected.y})`, + `edge=${corePhase?.boundaryLabel || "N/A"}`, + `slot=s${Number.isFinite(slot) ? slot : "N/A"}`, + `ops=${selected.opCount}`, + `anomaly=${selected.anomalyCount}`, + `strictPhase=${corePhaseText}`, + `comp(${state.timingCompModel})=${coreCompText}`, + `conf=${confPct}`, + ].join(" | "); + + if (items.length === 0) { + controls.timingDrilldown.innerHTML = + `
${escapeHtml(summary)}
No expected operations in this cell.
`; + return; + } + + let html = `
${escapeHtml(summary)}
`; + html += "
"; + for (const item of items) { + const rowCls = [ + "timing-drill-row", + item.status, + item.firstDivergence ? "first-divergence" : "", + item.propagated ? "propagated" : "", + ].filter(Boolean).join(" "); + const opLabel = `#${item.id} ${item.opcode || "N/A"}`; + const deltaComp = computeDeltaRebased(item.delta, compOffset, view.ii); + const statusComp = statusFromDelta(deltaComp, item.status === "missing"); + const compLabel = state.timingCompModel === "hybrid" ? "hybrid" : `comp(${state.timingCompModel})`; + const allSamples = Array.isArray(item.allSamples) ? item.allSamples : []; + const sourceCounts = new Map(); + for (const s of allSamples) { + const src = String(s?.source || "Unknown"); + sourceCounts.set(src, (sourceCounts.get(src) || 0) + 1); + } + const sourceSummary = sourceCounts.size > 0 + ? [...sourceCounts.entries()].map(([k, v]) => `${k}*${v}`).join(",") + : "N/A"; + const sampleRange = allSamples.length > 0 + ? `${allSamples[0].time}..${allSamples[allSamples.length - 1].time}` + : "N/A"; + const samplePreview = allSamples.length > 0 + ? allSamples.slice(0, 4).map((s, idx) => `${idx + 1}:${s.time}:${s.source || "Unknown"}`).join(",") + : "N/A"; + const fields = [ + `statusComp=${statusComp}`, + `deltaComp(${compLabel})=${formatDelta(deltaComp)}`, + `exp=s${item.expectedSlot}`, + `act=${item.actualSlot == null ? "N/A" : `s${item.actualSlot}`}`, + `statusStrict=${item.status} (reference)`, + `deltaStrict=${formatDelta(item.delta)} (reference)`, + `deltaPhaseRebased=${formatDelta(computeDeltaRebased(item.delta, corePhase?.phaseOffset, view.ii))}`, + `time=${item.firstTime == null ? "N/A" : item.firstTime}`, + `samples=${item.sampleCount}`, + `sourceSummary=${sourceSummary}`, + `sampleRange=${sampleRange}`, + `samplePreview=${samplePreview}`, + `div=${item.firstDivergence ? "yes" : "no"}`, + ].join(" | "); + html += `
${escapeHtml(opLabel)}${escapeHtml(fields)}
`; + } + html += "
"; + controls.timingDrilldown.innerHTML = html; +} + +function renderFocusedCoreMini(view, timeline) { + if (!controls.timingCoreMini) return; + const focusedKey = state.timingFocusedCoreKey; + if (!focusedKey) { + controls.timingCoreMini.innerHTML = + "
Click a Y-axis core label or use the core selector to focus one core.
"; + return; + } + const core = view.columns.find((c) => c.coreKey === focusedKey); + const lane = timeline.lanes.find((l) => l.coreKey === focusedKey); + if (!core || !lane) { + controls.timingCoreMini.innerHTML = + "
Focused core is not visible under current filters.
"; + return; + } + const sourceCounts = new Map(); + for (const item of core.items) { + const samples = Array.isArray(item.allSamples) ? item.allSamples : []; + for (const s of samples) { + const src = String(s?.source || "Unknown"); + sourceCounts.set(src, (sourceCounts.get(src) || 0) + 1); + } + } + const sourceText = sourceCounts.size > 0 + ? [...sourceCounts.entries()].map(([k, v]) => `${k}*${v}`).join(" | ") + : "N/A"; + const windowStart = Number(state.timingWindowStart || timeline.timeMin); + const windowEnd = windowStart + Number(state.timingWindowSize || 1) - 1; + const compModel = ["distance", "fitted", "hybrid"].includes(state.timingCompModel) + ? state.timingCompModel + : "hybrid"; + const rows = lane.expectedSlots + .filter((slot) => { + const inExp = Number.isFinite(slot.expectedTime) && slot.expectedTime >= windowStart && slot.expectedTime <= windowEnd; + const inAct = Number.isFinite(slot.actualTime) && slot.actualTime >= windowStart && slot.actualTime <= windowEnd; + return inExp || inAct; + }) + .sort((a, b) => { + const ta = Number.isFinite(a.actualTime) ? a.actualTime : a.expectedTime; + const tb = Number.isFinite(b.actualTime) ? b.actualTime : b.expectedTime; + if (ta !== tb) return ta - tb; + if (a.opId !== b.opId) return a.opId - b.opId; + return (a.sampleIndex || 0) - (b.sampleIndex || 0); + }) + .slice(0, 28); + const listHtml = rows.length > 0 + ? rows.map((slot) => { + const occ = `[${slot.sampleIndex}/${slot.occurrenceTotal}]`; + const src = slot.sampleSource || "N/A"; + const strict = slot.statusStrict; + const comp = getCompStatusByModel(slot, compModel); + const line = `#${slot.opId}${occ} ${slot.opcode || "N/A"} expT=${slot.expectedTime ?? "N/A"} actT=${slot.actualTime ?? "N/A"} strict=${strict} comp=${comp} src=${src}`; + return `
${escapeHtml(line)}
`; + }).join("") + : "
No blocks from this core in current window.
"; + + controls.timingCoreMini.innerHTML = [ + `
focused-core=(${core.x},${core.y}) | window=T${windowStart}..T${windowEnd} | sources=${escapeHtml(sourceText)}
`, + `
${listHtml}
`, + ].join(""); +} + +function parseProgramYaml(text) { + if (!window.jsyaml) { + throw new Error("js-yaml is unavailable in current page."); + } + const parsed = window.jsyaml.load(text); + const cfg = parsed?.array_config; + if (!cfg || !Array.isArray(cfg.cores)) { + throw new Error("Program YAML must contain array_config.cores."); + } + + const ii = Math.max(0, Math.round(Number(cfg.compiled_ii || 0))); + const arrayColumns = Math.round(Number(cfg.columns)); + const arrayRows = Math.round(Number(cfg.rows)); + const hasArraySize = Number.isFinite(arrayColumns) && Number.isFinite(arrayRows) && arrayColumns > 0 && arrayRows > 0; + const expectedOps = []; + const coreSet = new Map(); + + for (const core of cfg.cores) { + const x = Number(core.column); + const y = Number(core.row); + if (!Number.isFinite(x) || !Number.isFinite(y)) continue; + const coreKey = tileKey(x, y); + if (!coreSet.has(coreKey)) coreSet.set(coreKey, { coreKey, x, y }); + + const entries = Array.isArray(core.entries) ? core.entries : []; + for (const entry of entries) { + const groups = Array.isArray(entry.instructions) ? entry.instructions : []; + for (const ig of groups) { + const fallbackSlot = normalizeSlot(ig.index_per_ii || 0, ii); + const ops = Array.isArray(ig.operations) ? ig.operations : []; + for (const op of ops) { + const id = Number(op.id); + if (!Number.isFinite(id)) continue; + const rawTimeStep = Number(op.time_step); + const hasTimeStep = Number.isFinite(rawTimeStep); + const expectedSlot = normalizeSlot(hasTimeStep ? rawTimeStep : fallbackSlot, ii); + expectedOps.push({ + coreKey, + x, + y, + id: Math.round(id), + opcode: String(op.opcode || ""), + expectedSlot, + rawTimeStep: hasTimeStep ? Math.round(rawTimeStep) : null, + }); + } + } + } + } + + const columns = [...coreSet.values()].sort(sortCore); + const maxSlot = expectedOps.reduce((acc, op) => Math.max(acc, op.expectedSlot), 0); + const slots = ii > 0 + ? 
Array.from({ length: ii }, (_v, idx) => idx) + : Array.from({ length: maxSlot + 1 }, (_v, idx) => idx); + + return { + ii, + expectedOps, + columns, + slots, + arrayColumns: hasArraySize ? arrayColumns : null, + arrayRows: hasArraySize ? arrayRows : null, + }; +} + +function buildActualByCoreAndId(events, ii) { + const actualByCore = new Map(); + for (const e of events) { + const isInst = e.msg === "Inst"; + const isMemoryDirect = e.msg === "Memory" + && (String(e.Behavior || "") === "LoadDirect" || String(e.Behavior || "") === "StoreDirect"); + if (!isInst && !isMemoryDirect) continue; + if (!Number.isFinite(Number(e.Time)) + || !Number.isFinite(Number(e.ID)) + || !Number.isFinite(Number(e.X)) + || !Number.isFinite(Number(e.Y))) { + continue; + } + const coreKey = tileKey(Number(e.X), Number(e.Y)); + if (!actualByCore.has(coreKey)) actualByCore.set(coreKey, new Map()); + const byId = actualByCore.get(coreKey); + const id = Math.round(Number(e.ID)); + if (!byId.has(id)) byId.set(id, []); + byId.get(id).push({ + time: Math.round(Number(e.Time)), + slot: normalizeSlot(e.Time, ii), + pred: e.Pred, + source: isInst ? "Inst" : String(e.Behavior || "MemoryDirect"), + }); + } + for (const byId of actualByCore.values()) { + for (const samples of byId.values()) { + samples.sort((a, b) => a.time - b.time); + } + } + return actualByCore; +} + +function buildStrictTimingView(programSpec, events) { + const actualByCoreAndId = buildActualByCoreAndId(events, programSpec.ii); + const cellMap = new Map(); + const byCore = new Map(); + + for (const op of programSpec.expectedOps) { + const actuals = actualByCoreAndId.get(op.coreKey)?.get(op.id) || []; + const first = actuals.length > 0 ? actuals[0] : null; + const delta = first ? signedDelta(first.slot, op.expectedSlot, programSpec.ii) : null; + const status = !first ? "missing" : (delta === 0 ? "on-time" : (delta < 0 ? 
"early" : "late")); + const allSamples = actuals.map((sample, sampleIdx) => { + const sampleDelta = signedDelta(sample.slot, op.expectedSlot, programSpec.ii); + const sampleStatus = sampleDelta === 0 ? "on-time" : (sampleDelta < 0 ? "early" : "late"); + return { + ...sample, + sampleIdx, + delta: sampleDelta, + status: sampleStatus, + }; + }); + const compareItem = { + ...op, + actualSlot: first ? first.slot : null, + firstTime: first ? first.time : null, + sampleCount: actuals.length, + delta, + status, + allSamples, + firstDivergence: false, + propagated: false, + }; + + if (!byCore.has(op.coreKey)) byCore.set(op.coreKey, []); + byCore.get(op.coreKey).push(compareItem); + + const cellKey = `${op.coreKey}|${op.expectedSlot}`; + if (!cellMap.has(cellKey)) cellMap.set(cellKey, []); + cellMap.get(cellKey).push(compareItem); + } + + const columns = programSpec.columns.map((c) => { + const items = byCore.get(c.coreKey) || []; + items.sort((a, b) => { + const ta = Number.isFinite(a.firstTime) ? a.firstTime : Number.POSITIVE_INFINITY; + const tb = Number.isFinite(b.firstTime) ? 
b.firstTime : Number.POSITIVE_INFINITY; + if (ta !== tb) return ta - tb; + if (a.expectedSlot !== b.expectedSlot) return a.expectedSlot - b.expectedSlot; + return a.id - b.id; + }); + + let hasDivergence = false; + let lastDelta = null; + for (const item of items) { + if (item.status === "on-time") continue; + if (item.status === "missing") { + if (!hasDivergence) { + item.firstDivergence = true; + hasDivergence = true; + } else { + item.propagated = true; + } + continue; + } + if (!hasDivergence || item.delta !== lastDelta) { + item.firstDivergence = true; + hasDivergence = true; + } else { + item.propagated = true; + } + lastDelta = item.delta; + } + + const deltaCounts = new Map(); + const statusCounts = { onTime: 0, early: 0, late: 0, missing: 0 }; + for (const item of items) { + if (item.status === "on-time") statusCounts.onTime += 1; + if (item.status === "early") statusCounts.early += 1; + if (item.status === "late") statusCounts.late += 1; + if (item.status === "missing") statusCounts.missing += 1; + if (item.status === "early" || item.status === "late") { + const k = String(item.delta); + deltaCounts.set(k, (deltaCounts.get(k) || 0) + 1); + } + } + let modeDelta = 0; + let modeCount = 0; + for (const [k, v] of deltaCounts.entries()) { + if (v > modeCount) { + modeCount = v; + modeDelta = Number(k); + } + } + + return { + ...c, + items, + modeDelta, + modeCount, + statusCounts, + earlyLateCount: statusCounts.early + statusCounts.late, + phaseOffset: modeCount > 0 ? modeDelta : null, + phaseConfidence: (statusCounts.early + statusCounts.late) > 0 + ? 
modeCount / (statusCounts.early + statusCounts.late) + : 0, }; + }); + + for (const items of cellMap.values()) { + items.sort((a, b) => a.id - b.id); } - return { kind: "unknown", raw: name }; + + return { + ii: programSpec.ii, + slots: programSpec.slots, + columns, + cellMap, + }; } -function endpointPoint(ep) { - if (!ep) { - return null; - } - if (ep.kind === "tilePort") { - const r = tileRect(ep.x, ep.y); - if (ep.port === "North") return { x: r.x + r.w / 2, y: r.y, tile: tileKey(ep.x, ep.y) }; - if (ep.port === "South") return { x: r.x + r.w / 2, y: r.y + r.h, tile: tileKey(ep.x, ep.y) }; - if (ep.port === "West") return { x: r.x, y: r.y + r.h / 2, tile: tileKey(ep.x, ep.y) }; - if (ep.port === "East") return { x: r.x + r.w, y: r.y + r.h / 2, tile: tileKey(ep.x, ep.y) }; - } - if (ep.kind === "driver") { - const side = ep.side; - const idx = ep.idx; - if (side === "North" && idx <= state.maxX) { - const r = tileRect(idx, state.maxY); - return { x: r.x + r.w / 2, y: r.y - layout.driverOffset }; +function renderTimingView() { + if (!controls.timingGrid || !controls.timingSummary) return; + if (!state.programSpec) { + controls.timingSummary.textContent = "Load program YAML to enable strict timing comparison."; + controls.timingGrid.innerHTML = ""; + if (controls.timingCoreFocus) controls.timingCoreFocus.innerHTML = ""; + if (controls.timingIoWaveCore) controls.timingIoWaveCore.innerHTML = ""; + if (controls.timingIoWaveAll) controls.timingIoWaveAll.checked = false; + if (controls.timingDrilldown) { + controls.timingDrilldown.innerHTML = + "
Load YAML and trace, then click a timeline mark for details.
"; } - if (side === "South" && idx <= state.maxX) { - const r = tileRect(idx, 0); - return { x: r.x + r.w / 2, y: r.y + r.h + layout.driverOffset }; + if (controls.timingCoreMini) { + controls.timingCoreMini.innerHTML = + "
Focus one core to inspect local trace details.
"; } - if (side === "West" && idx <= state.maxY) { - const r = tileRect(0, idx); - return { x: r.x - layout.driverOffset, y: r.y + r.h / 2 }; + return; + } + if (state.events.length === 0) { + controls.timingSummary.textContent = "Load trace log to populate timing comparison."; + controls.timingGrid.innerHTML = ""; + if (controls.timingCoreFocus) controls.timingCoreFocus.innerHTML = ""; + if (controls.timingIoWaveCore) controls.timingIoWaveCore.innerHTML = ""; + if (controls.timingIoWaveAll) controls.timingIoWaveAll.checked = false; + if (controls.timingDrilldown) { + controls.timingDrilldown.innerHTML = + "
Load YAML and trace, then click a timeline mark for details.
"; } - if (side === "East" && idx <= state.maxY) { - const r = tileRect(state.maxX, idx); - return { x: r.x + r.w + layout.driverOffset, y: r.y + r.h / 2 }; + if (controls.timingCoreMini) { + controls.timingCoreMini.innerHTML = + "
Focus one core to inspect local trace details.
"; } + return; } - return null; -} -function inferBounds(events) { - let maxX = 0; - let maxY = 0; - for (const e of events) { - if (Number.isInteger(e.X)) maxX = Math.max(maxX, e.X); - if (Number.isInteger(e.Y)) maxY = Math.max(maxY, e.Y); - for (const f of ["Src", "Dst", "From", "To"]) { - if (!e[f]) continue; - const ep = parseEndpoint(e[f]); - if (ep && ep.kind === "tilePort") { - maxX = Math.max(maxX, ep.x); - maxY = Math.max(maxY, ep.y); - } + const view = buildStrictTimingView(state.programSpec, state.events); + state.timingRows = view.columns; + state.timingColumns = view.slots; + state.timingReady = true; + refreshCoreFocusControl(view.columns); + refreshIoWaveCoreControl(view.columns); + const heatmap = buildTimingHeatmap(view); + const phaseExplain = buildPhaseExplain(view); + const compensation = buildCompensationModels(view, phaseExplain, state.events); + + const totals = { onTime: 0, early: 0, late: 0, missing: 0 }; + for (const c of view.columns) { + totals.onTime += c.statusCounts.onTime; + totals.early += c.statusCounts.early; + totals.late += c.statusCounts.late; + totals.missing += c.statusCounts.missing; + } + const filterText = state.timingAnomalyOnly ? "filter=anomaly-only" : "filter=all"; + const boundaryText = state.timingBoundaryOnly ? "scope=boundary-only" : "scope=all-cores"; + const focusedCoreText = state.timingFocusedCoreKey ? `focus=${state.timingFocusedCoreKey}` : "focus=all-cores"; + const phaseText = state.showPhaseExplain + ? `phase(boundary=${formatDelta(phaseExplain.boundaryPhase)} inner=${formatDelta(phaseExplain.innerPhase)} gap=${formatDelta(phaseExplain.phaseGap)})` + : "phase(hidden)"; + const compModel = ["distance", "fitted", "hybrid"].includes(state.timingCompModel) + ? 
state.timingCompModel + : "hybrid"; + const modelSummary = compensation.models[compModel]; + const compTotals = { onTime: 0, early: 0, late: 0, missing: 0 }; + for (const c of view.columns) { + const compMeta = compensation.coreOffsets.get(c.coreKey) || null; + const compOffset = getModelOffset(compMeta, compModel); + for (const item of c.items) { + const deltaComp = computeDeltaRebased(item.delta, compOffset, view.ii); + const statusComp = statusFromDelta(deltaComp, item.status === "missing"); + if (statusComp === "on-time") compTotals.onTime += 1; + if (statusComp === "early") compTotals.early += 1; + if (statusComp === "late") compTotals.late += 1; + if (statusComp === "missing") compTotals.missing += 1; } } - return { maxX, maxY }; -} + controls.timingSummary.textContent = + `strict baseline | ii=${view.ii || "N/A"} | on-time=${totals.onTime} early=${totals.early} late=${totals.late} missing=${totals.missing} | comp(${compModel}) on-time=${compTotals.onTime} early=${compTotals.early} late=${compTotals.late} missing=${compTotals.missing} gap=${formatDelta(modelSummary?.gap)} | view=${state.timingBaselineView} | ${filterText} | ${boundaryText} | ${focusedCoreText} | ingress=${compensation.ingressSides.join("+")} | ${phaseText}`; -function parseJsonLines(text) { - const lines = text.split(/\r?\n/).map((s) => s.trim()).filter(Boolean); - const rows = []; - for (const line of lines) { - try { - const obj = JSON.parse(line); - if (obj && typeof obj.Time === "number" && Number.isFinite(obj.Time)) { - obj.Time = Math.round(obj.Time); - rows.push(obj); + let visibleColumns = view.columns; + if (state.timingBoundaryOnly) { + visibleColumns = visibleColumns.filter((c) => phaseExplain.coreMap.get(c.coreKey)?.isBoundary); + } + if (state.timingFocusedCoreKey) { + visibleColumns = visibleColumns.filter((c) => c.coreKey === state.timingFocusedCoreKey); + } + const visibleCoreSet = new Set(visibleColumns.map((c) => c.coreKey)); + + if (state.timingSelectedCell && 
!heatmap.cells.has(state.timingSelectedCell)) { + state.timingSelectedCell = null; + } + if (state.timingSelectedCell) { + const selectedCoreKey = splitCellKey(state.timingSelectedCell).coreKey; + if (!visibleCoreSet.has(selectedCoreKey)) { + state.timingSelectedCell = null; + } + } + if (!state.timingSelectedCell) { + for (const c of visibleColumns) { + for (const slot of view.slots) { + const cell = heatmap.cells.get(`${c.coreKey}|${slot}`); + if (cell && (!state.timingAnomalyOnly || cell.hasAnomaly)) { + state.timingSelectedCell = cell.cellKey; + break; + } } - } catch (_) { - // Ignore malformed lines. + if (state.timingSelectedCell) break; } } - return rows; -} -function indexByTime(events) { - const byTime = new Map(); - let minTime = Number.POSITIVE_INFINITY; - let maxTime = Number.NEGATIVE_INFINITY; - for (const e of events) { - const tKey = Math.round(Number(e.Time)); - if (!byTime.has(tKey)) byTime.set(tKey, []); - byTime.get(tKey).push(e); - minTime = Math.min(minTime, tKey); - maxTime = Math.max(maxTime, tKey); - } - if (!Number.isFinite(minTime) || !Number.isFinite(maxTime)) { - minTime = 0; - maxTime = 0; + const timeline = buildTimelineLanes(view, visibleColumns, phaseExplain, compensation); + const compModelForMismatch = ["distance", "fitted", "hybrid"].includes(state.timingCompModel) + ? state.timingCompModel + : "hybrid"; + let firstHybridMismatchTime = null; + for (const lane of timeline.lanes) { + for (const slot of lane.expectedSlots) { + if (getCompStatusByModel(slot, compModelForMismatch) !== "on-time") { + const t = Number.isFinite(slot.actualTime) ? 
slot.actualTime : slot.expectedTime; + if (Number.isFinite(t) && (firstHybridMismatchTime == null || t < firstHybridMismatchTime)) { + firstHybridMismatchTime = t; + } + } + } } - return { byTime, minTime, maxTime }; + state.firstHybridMismatchTime = firstHybridMismatchTime; + + renderTimelineSvg(view, timeline); + renderTimingDrilldown(view, heatmap, phaseExplain, compensation); + renderFocusedCoreMini(view, timeline); } function summarizeEvent(e) { @@ -193,13 +2428,58 @@ function summarizeEvent(e) { if (e.msg === "Memory") { return `Memory ${e.Behavior} tile=(${e.X},${e.Y}) value=${e.Value} addr=${e.Addr}`; } + if (e.msg === "Backpressure") { + return `Backpressure tile=(${e.X},${e.Y}) dir=${e.DstDir ?? "N/A"} reason=${e.Reason ?? "N/A"} op=${e.OpCode ?? "N/A"} id=${e.ID ?? "N/A"}`; + } return JSON.stringify(e); } +function applyMeshZoomTransform(transform) { + meshZoomTransform = transform || d3.zoomIdentity; + if (sceneRoot) sceneRoot.attr("transform", meshZoomTransform.toString()); +} + +function bindMeshZoom() { + if (!meshZoomBehavior) { + meshZoomBehavior = d3.zoom() + .scaleExtent([0.4, 8]) + .on("zoom", (event) => { + applyMeshZoomTransform(event.transform); + }); + } + meshZoomBehavior + .extent([[0, 0], [layout.width, layout.height]]) + .translateExtent([ + [-layout.width * 1.5, -layout.height * 1.5], + [layout.width * 2.5, layout.height * 2.5], + ]); + svg.call(meshZoomBehavior); + svg.call(meshZoomBehavior.transform, meshZoomTransform); +} + +function renderMeshLegend() { + if (!controls.meshLegend) return; + const legendItems = [ + ["Send", colors.Send], + ["Recv", colors.Recv], + ["FeedIn", colors.FeedIn], + ["Collect", colors.Collect], + ["Backpressure path", colors.Backpressure], + ["Inst", colors.Inst], + ["Memory", colors.Memory], + ]; + controls.meshLegend.innerHTML = legendItems.map( + ([name, color]) => + `${name}`, + ).join(""); +} + function drawStaticScene() { svg.selectAll("*").remove(); - staticLayer = svg.append("g"); - dynamicLayer = 
svg.append("g"); + sceneRoot = svg.append("g").attr("class", "mesh-scene-root"); + applyMeshZoomTransform(meshZoomTransform); + staticLayer = sceneRoot.append("g"); + dynamicLayer = sceneRoot.append("g"); const bg = staticLayer.append("rect"); bg @@ -230,6 +2510,19 @@ function drawStaticScene() { .attr("height", layout.tileSize) .attr("rx", 10); + tileGroup + .selectAll(".tile-report-heat") + .data(tiles) + .join("rect") + .attr("class", (d) => `tile-report-heat tile-report-heat-${d.x}-${d.y}`) + .attr("x", (d) => tileRect(d.x, d.y).x) + .attr("y", (d) => tileRect(d.x, d.y).y) + .attr("width", layout.tileSize) + .attr("height", layout.tileSize) + .attr("rx", 10) + .attr("opacity", 0) + .style("display", "none"); + tileGroup .selectAll("text") .data(tiles) @@ -269,35 +2562,12 @@ function drawStaticScene() { .attr("x", (d) => endpointPoint({ kind: "driver", side: d.side, idx: d.idx }).x + 12) .attr("y", (d) => endpointPoint({ kind: "driver", side: d.side, idx: d.idx }).y + 4) .text((d) => `${d.side[0]}${d.idx}`); - - const legend = staticLayer.append("g").attr("transform", "translate(28, 34)"); - const legendItems = [ - ["Send", colors.Send], - ["Recv", colors.Recv], - ["FeedIn", colors.FeedIn], - ["Collect", colors.Collect], - ["Inst", colors.Inst], - ["Memory", colors.Memory], - ]; - legend - .selectAll("circle") - .data(legendItems) - .join("circle") - .attr("cx", (_d, i) => i * 112) - .attr("cy", 0) - .attr("r", 5) - .attr("fill", (d) => d[1]); - legend - .selectAll("text") - .data(legendItems) - .join("text") - .attr("class", "legend-text") - .attr("x", (_d, i) => i * 112 + 9) - .attr("y", 4) - .text((d) => d[0]); + applyReportHeatOverlay(); + renderMeshLegend(); + bindMeshZoom(); } -function drawLink(type, srcPoint, dstPoint) { +function buildLinkPath(srcPoint, dstPoint) { const path = d3.path(); path.moveTo(srcPoint.x, srcPoint.y); const dx = dstPoint.x - srcPoint.x; @@ -311,14 +2581,198 @@ function drawLink(type, srcPoint, dstPoint) { dstPoint.x, dstPoint.y, 
); + return path.toString(); +} + +function resolveDataFlowEndpoints(event) { + if (!event || event.msg !== "DataFlow") return null; + let src = null; + let dst = null; + if (event.Behavior === "FeedIn") { + src = parseEndpoint(event.From); + dst = parseEndpoint(event.To); + } else if (event.Behavior === "Collect") { + src = parseEndpoint(event.From); + dst = parseEndpoint(event.To || event.Dst); + } else { + src = parseEndpoint(event.Src); + dst = parseEndpoint(event.Dst); + } + const srcPoint = endpointPoint(src); + const dstPoint = endpointPoint(dst); + return { + type: event.Behavior, + src, + dst, + srcPoint, + dstPoint, + }; +} + +function collectBackpressureOverlay(events) { + const blockedTileKeys = new Set(); + const blockedLinks = []; + const incomingByDstTile = new Map(); + + const addIncomingEdge = (edge) => { + if (!edge?.dstTileKey) return; + if (!incomingByDstTile.has(edge.dstTileKey)) incomingByDstTile.set(edge.dstTileKey, []); + incomingByDstTile.get(edge.dstTileKey).push(edge); + }; + + for (const e of events) { + if (e.msg === "Backpressure" && Number.isFinite(Number(e.X)) && Number.isFinite(Number(e.Y))) { + const x = Math.round(Number(e.X)); + const y = Math.round(Number(e.Y)); + const tile = tileKey(x, y); + blockedTileKeys.add(tile); + + const outPort = normalizePortName(e.DstDir || e.Dir || e.Port || ""); + if (outPort) { + const src = tilePortEndpoint(x, y, outPort); + const dst = neighborEndpointFromTilePort(x, y, outPort); + const srcPoint = endpointPoint(src); + const dstPoint = endpointPoint(dst); + if (srcPoint && dstPoint) { + blockedLinks.push({ + key: `${src.raw}->${dst.raw}`, + srcPoint, + dstPoint, + title: `blocked wire: ${src.raw} -> ${dst.raw}`, + }); + } + } + continue; + } + + const flow = resolveDataFlowEndpoints(e); + if (!flow?.srcPoint || !flow?.dstPoint) continue; + if (!flow.srcPoint.tile || !flow.dstPoint.tile) continue; + addIncomingEdge({ + key: `${flow.src?.raw || flow.srcPoint.tile}->${flow.dst?.raw || 
flow.dstPoint.tile}`, + srcTileKey: flow.srcPoint.tile, + dstTileKey: flow.dstPoint.tile, + srcPoint: flow.srcPoint, + dstPoint: flow.dstPoint, + behavior: flow.type, + }); + } + + const propagatedTileKeys = new Set(blockedTileKeys); + const propagatedLinks = []; + const visitedEdges = new Set(); + const queue = [...blockedTileKeys]; + while (queue.length > 0) { + const tile = queue.shift(); + const incoming = incomingByDstTile.get(tile) || []; + for (const edge of incoming) { + if (visitedEdges.has(edge.key)) continue; + visitedEdges.add(edge.key); + propagatedLinks.push(edge); + if (edge.srcTileKey && !propagatedTileKeys.has(edge.srcTileKey)) { + propagatedTileKeys.add(edge.srcTileKey); + queue.push(edge.srcTileKey); + } + } + } + + return { + tileKeys: propagatedTileKeys, + blockedLinks, + propagatedLinks, + }; +} + +function drawBackpressureOverlay(overlay) { + if (!overlay) return; + const drawnPathKeys = new Set(); + for (const link of overlay.propagatedLinks || []) { + if (!link?.srcPoint || !link?.dstPoint) continue; + if (drawnPathKeys.has(link.key)) continue; + drawnPathKeys.add(link.key); + dynamicLayer.append("path") + .attr("class", "bp-path-link") + .attr("d", buildLinkPath(link.srcPoint, link.dstPoint)) + .append("title") + .text(`backpressure path (${link.behavior || "DataFlow"})`); + } + for (const link of overlay.blockedLinks || []) { + if (!link?.srcPoint || !link?.dstPoint) continue; + if (drawnPathKeys.has(link.key)) continue; + drawnPathKeys.add(link.key); + dynamicLayer.append("path") + .attr("class", "bp-path-link") + .attr("d", buildLinkPath(link.srcPoint, link.dstPoint)) + .append("title") + .text(link.title || "blocked wire"); + } + for (const key of overlay.tileKeys || []) { + const [x, y] = String(key).split(",").map(Number); + if (!Number.isFinite(x) || !Number.isFinite(y)) continue; + const r = tileRect(x, y); + dynamicLayer.append("rect") + .attr("class", "bp-tile-outline") + .attr("x", r.x + 1.5) + .attr("y", r.y + 1.5) + 
.attr("width", Math.max(2, r.w - 3)) + .attr("height", Math.max(2, r.h - 3)) + .attr("rx", 9) + .append("title") + .text(`backpressure tile (${x},${y})`); + } +} - dynamicLayer +function drawLink(type, srcPoint, dstPoint, payload = null) { + const pathData = buildLinkPath(srcPoint, dstPoint); + const dx = dstPoint.x - srcPoint.x; + const dy = dstPoint.y - srcPoint.y; + + const link = dynamicLayer .append("path") .attr("class", "event-link") - .attr("d", path.toString()) + .attr("d", pathData) .attr("stroke", colors[type] || "#555") .attr("stroke-opacity", 0.78); + const dataText = payload?.dataText == null ? "" : String(payload.dataText); + if (payload?.drawDataLabel && dataText) { + const shortData = shortText(dataText, 12); + let anchorX = srcPoint.x + dx * 0.58; + let anchorY = srcPoint.y + dy * 0.58; + try { + const node = link.node(); + if (node) { + const total = node.getTotalLength(); + if (Number.isFinite(total) && total > 0) { + const p = node.getPointAtLength(total * 0.58); + anchorX = p.x; + anchorY = p.y; + } + } + } catch (_) { + // Fall back to linear interpolation point when path metrics unavailable. + } + const tag = dynamicLayer.append("g") + .attr("class", "flow-data-tag") + .attr("transform", `translate(${anchorX},${anchorY})`); + const text = tag.append("text") + .attr("class", "flow-data-text") + .attr("text-anchor", "middle") + .attr("dominant-baseline", "middle") + .text(shortData); + const box = text.node()?.getBBox(); + if (box) { + tag.insert("rect", "text") + .attr("class", "flow-data-bg") + .attr("x", box.x - 3) + .attr("y", box.y - 1) + .attr("width", box.width + 6) + .attr("height", box.height + 2) + .attr("rx", 4); + } + tag.append("title").text(`data=${dataText}`); + } + const pulse = dynamicLayer .append("circle") .attr("class", "pulse") @@ -346,61 +2800,165 @@ function applyTileActivity(activeTiles) { staticLayer.selectAll(".tile-label").style("display", state.showLabels ? 
null : "none"); } +function shortText(value, maxLen = 16) { + const s = String(value ?? "").trim(); + if (!s) return ""; + return s.length <= maxLen ? s : `${s.slice(0, maxLen - 1)}~`; +} + +function summarizeTokens(tokens, prefix, maxItems = 3) { + if (!Array.isArray(tokens) || tokens.length === 0) return null; + const counts = new Map(); + for (const token of tokens) { + const key = String(token || "N/A"); + counts.set(key, (counts.get(key) || 0) + 1); + } + const sorted = [...counts.entries()] + .sort((a, b) => { + if (b[1] !== a[1]) return b[1] - a[1]; + return a[0].localeCompare(b[0]); + }); + const picked = sorted.slice(0, maxItems).map(([k, v]) => (v > 1 ? `${k}*${v}` : k)); + const remain = sorted.length - maxItems; + const suffix = remain > 0 ? `,+${remain}` : ""; + return `${prefix}:${picked.join(",")}${suffix}`; +} + +function summarizeData(values, prefix, maxItems = 4) { + if (!Array.isArray(values) || values.length === 0) return null; + const picked = values.slice(0, maxItems).map((v) => shortText(v, 8)); + const suffix = values.length > maxItems ? ",..." : ""; + return `${prefix}:${picked.join(",")}${suffix}`; +} + function drawTileBadges(timeEvents) { - const instCounts = new Map(); - const memCounts = new Map(); + const byTile = new Map(); + const ensure = (key) => { + if (!byTile.has(key)) { + byTile.set(key, { + op: [], + mem: [], + txData: [], + rxData: [], + details: [], + }); + } + return byTile.get(key); + }; for (const e of timeEvents) { - if (e.msg === "Inst" && state.showInst) { - const k = tileKey(e.X, e.Y); - instCounts.set(k, (instCounts.get(k) || 0) + 1); + if (e.msg === "Inst" && state.showInst && Number.isFinite(Number(e.X)) && Number.isFinite(Number(e.Y))) { + const k = tileKey(Number(e.X), Number(e.Y)); + const rec = ensure(k); + const op = shortText(e.OpCode || "Inst", 10); + rec.op.push(op || "Inst"); + rec.details.push(`Inst#${e.ID ?? "?"} ${e.OpCode ?? "N/A"} pred=${e.Pred ?? 
"N/A"}`); + continue; } - if (e.msg === "Memory" && state.showMemory) { - const k = tileKey(e.X, e.Y); - memCounts.set(k, (memCounts.get(k) || 0) + 1); + + if (e.msg === "Memory" && state.showMemory && Number.isFinite(Number(e.X)) && Number.isFinite(Number(e.Y))) { + const k = tileKey(Number(e.X), Number(e.Y)); + const rec = ensure(k); + const behavior = String(e.Behavior || "Memory"); + const memTag = behavior === "LoadDirect" + ? `LD(${shortText(e.Value, 6)})` + : (behavior === "StoreDirect" + ? `ST(${shortText(e.Value, 6)})` + : shortText(behavior, 12)); + rec.mem.push(memTag); + rec.details.push(`Memory ${behavior} value=${e.Value ?? "N/A"} addr=${e.Addr ?? "N/A"}`); + continue; + } + + if (e.msg === "DataFlow" && state.showDataFlow) { + const dataValue = e.Data; + if (e.Behavior === "Send") { + const src = parseEndpoint(e.Src); + if (src?.kind === "tilePort") { + const rec = ensure(tileKey(src.x, src.y)); + rec.txData.push(dataValue); + rec.details.push(`TX ${dataValue} ${e.Src} -> ${e.Dst}`); + } + } else if (e.Behavior === "Recv") { + const dst = parseEndpoint(e.Dst); + if (dst?.kind === "tilePort") { + const rec = ensure(tileKey(dst.x, dst.y)); + rec.rxData.push(dataValue); + rec.details.push(`RX ${dataValue} ${e.Src} -> ${e.Dst}`); + } + } else if (e.Behavior === "FeedIn") { + const dst = parseEndpoint(e.To); + if (dst?.kind === "tilePort") { + const rec = ensure(tileKey(dst.x, dst.y)); + rec.rxData.push(dataValue); + rec.details.push(`FeedIn ${dataValue} ${e.From} -> ${e.To}`); + } + } else if (e.Behavior === "Collect") { + const src = parseEndpoint(e.From); + if (src?.kind === "tilePort") { + const rec = ensure(tileKey(src.x, src.y)); + rec.txData.push(dataValue); + rec.details.push(`Collect ${dataValue} ${e.From} -> ${e.To || e.Dst || "Driver"}`); + } + } } } - for (const [k, count] of instCounts.entries()) { - const [x, y] = k.split(",").map(Number); - const r = tileRect(x, y); - dynamicLayer - .append("circle") - .attr("class", "inst-badge") - 
.attr("cx", r.x + 16) - .attr("cy", r.y + 16) - .attr("r", 10) - .attr("fill", colors.Inst) - .attr("opacity", 0.9); - dynamicLayer - .append("text") - .attr("x", r.x + 12) - .attr("y", r.y + 20) - .attr("fill", "#fff") - .attr("font-size", 11) - .text(`${count}`); - } - - for (const [k, count] of memCounts.entries()) { + for (const [k, rec] of byTile.entries()) { const [x, y] = k.split(",").map(Number); const r = tileRect(x, y); - dynamicLayer - .append("rect") - .attr("class", "memory-badge") - .attr("x", r.x + r.w - 23) - .attr("y", r.y + r.h - 23) - .attr("width", 16) - .attr("height", 16) - .attr("rx", 3) - .attr("fill", colors.Memory) - .attr("opacity", 0.9); - dynamicLayer - .append("text") - .attr("x", r.x + r.w - 20) - .attr("y", r.y + r.h - 11) - .attr("fill", "#fff") - .attr("font-size", 11) - .text(`${count}`); + const lineHeight = clamp(Math.round(layout.tileSize * 0.12), 9, 13); + const fontSize = clamp(Math.round(layout.tileSize * 0.1), 7, 11); + const innerWidth = Math.max(20, r.w - 8); + const approxCharWidth = fontSize * 0.6; + const maxCharsPerLine = Math.max(4, Math.floor(innerWidth / approxCharWidth)); + const textTop = r.y + 24; + const maxTextHeight = Math.max(8, r.h - 30); + const maxLines = Math.max(1, Math.floor(maxTextHeight / lineHeight)); + + const lines = []; + const lineOps = [summarizeTokens(rec.op, "OP", 2), summarizeTokens(rec.mem, "MEM", 1)] + .filter(Boolean) + .join(" | "); + const lineFlow = [summarizeData(rec.rxData, "RX", 2), summarizeData(rec.txData, "TX", 2)] + .filter(Boolean) + .join(" | "); + // Prioritize flow values so data is still visible when space is tight. 
+ if (lineFlow) lines.push(lineFlow); + if (lineOps) lines.push(lineOps); + if (lines.length === 0) continue; + + const shown = lines.slice(0, maxLines).map((line) => shortText(line, maxCharsPerLine)); + if (lines.length > maxLines) { + shown[maxLines - 1] = `${shortText(shown[maxLines - 1], Math.max(4, maxCharsPerLine - 3))}...`; + } + const bgHeight = shown.length * lineHeight + 8; + const g = dynamicLayer.append("g") + .attr("class", "tile-overlay") + .attr("transform", `translate(${r.x + 4},${textTop})`); + g.append("rect") + .attr("class", "tile-overlay-card") + .attr("width", innerWidth) + .attr("height", bgHeight) + .attr("rx", 4); + const text = g.append("text") + .attr("class", "tile-overlay-text") + .style("font-size", `${fontSize}px`) + .attr("x", 4) + .attr("y", lineHeight - 1); + shown.forEach((line, idx) => { + text.append("tspan") + .attr("x", 4) + .attr("dy", idx === 0 ? 0 : lineHeight) + .text(line); + }); + g.append("title").text( + [ + `tile=(${x},${y})`, + ...lines, + ...rec.details.slice(0, 12), + ].join("\n"), + ); } } @@ -421,32 +2979,34 @@ function renderCycleDetails(events, t) { } function renderTime(t) { - state.currentTime = t; - controls.timeLabel.textContent = `T=${t}`; - controls.timeSlider.value = String(t); + const cycle = clamp(normalizeCycleTime(t, state.currentTime), state.minTime, state.maxTime); + state.currentTime = cycle; + controls.timeLabel.textContent = `T=${cycle}`; + controls.timeSlider.value = String(cycle); dynamicLayer.selectAll("*").remove(); - const events = state.byTime.get(t) || []; + const events = state.byTime.get(cycle) || []; const activeTiles = new Set(); + const linkLabelSeen = new Set(); + const bpOverlay = collectBackpressureOverlay(events); + for (const tile of bpOverlay.tileKeys || []) { + activeTiles.add(tile); + } for (const e of events) { if (e.msg === "DataFlow" && state.showDataFlow) { - let src = null; - let dst = null; - let type = e.Behavior; - if (e.Behavior === "FeedIn") { - src = 
parseEndpoint(e.From); - dst = parseEndpoint(e.To); - } else if (e.Behavior === "Collect") { - src = parseEndpoint(e.From); - } else { - src = parseEndpoint(e.Src); - dst = parseEndpoint(e.Dst); - } - const srcPoint = endpointPoint(src); - const dstPoint = endpointPoint(dst); + const flow = resolveDataFlowEndpoints(e); + const src = flow?.src; + const dst = flow?.dst; + const type = flow?.type || e.Behavior; + const srcPoint = flow?.srcPoint; + const dstPoint = flow?.dstPoint; if (srcPoint && dstPoint) { - drawLink(type, srcPoint, dstPoint); + const dataText = e.Data == null ? "" : String(e.Data); + const labelKey = `${src?.raw || srcPoint.tile || "?"}|${dst?.raw || dstPoint.tile || "?"}|${dataText}`; + const drawDataLabel = dataText && !linkLabelSeen.has(labelKey); + if (drawDataLabel) linkLabelSeen.add(labelKey); + drawLink(type, srcPoint, dstPoint, { dataText, drawDataLabel }); } else if (srcPoint) { dynamicLayer .append("circle") @@ -474,45 +3034,76 @@ function renderTime(t) { applyTileActivity(activeTiles); drawTileBadges(events); - renderCycleDetails(events, t); + drawBackpressureOverlay(bpOverlay); + // Keep link arrows above tile overlay cards. + dynamicLayer.selectAll(".event-link").raise(); + dynamicLayer.selectAll(".bp-path-link").raise(); + dynamicLayer.selectAll(".bp-tile-outline").raise(); + // Keep transfer data labels/pulses above tile cards for readability. 
+ dynamicLayer.selectAll(".flow-data-tag").raise(); + dynamicLayer.selectAll(".pulse").raise(); + renderCycleDetails(events, cycle); } function stopPlayback() { if (state.timer) { - clearInterval(state.timer); + clearTimeout(state.timer); state.timer = null; } controls.playBtn.textContent = "Play"; } +function playbackTick() { + if (!state.timer) return; + const next = nextIndexedTime(state.currentTime, +1); + if (next <= state.currentTime) { + stopPlayback(); + return; + } + renderTime(next); + state.timer = setTimeout(playbackTick, state.speedMs); +} + function playOrPause() { if (state.timer) { stopPlayback(); return; } + if (state.currentTime >= state.maxTime) renderTime(state.maxTime); controls.playBtn.textContent = "Pause"; - state.timer = setInterval(() => { - if (state.currentTime >= state.maxTime) { - stopPlayback(); - return; - } - renderTime(state.currentTime + 1); - }, state.speedMs); + state.timer = setTimeout(playbackTick, state.speedMs); } function initControls() { controls.playBtn.addEventListener("click", playOrPause); controls.stepBackBtn.addEventListener("click", () => { + if (state.stepLock) return; + state.stepLock = true; stopPlayback(); - renderTime(Math.max(state.minTime, state.currentTime - 1)); + try { + renderTime(nextIndexedTime(state.currentTime, -1)); + } finally { + state.stepLock = false; + } }); controls.stepFwdBtn.addEventListener("click", () => { + if (state.stepLock) return; + state.stepLock = true; stopPlayback(); - renderTime(Math.min(state.maxTime, state.currentTime + 1)); + try { + renderTime(nextIndexedTime(state.currentTime, +1)); + } finally { + state.stepLock = false; + } }); controls.timeSlider.addEventListener("input", (e) => { + const wasPlaying = Boolean(state.timer); stopPlayback(); - renderTime(Number(e.target.value)); + const nextTime = Number(e.target.value); + renderTime(nextTime); + if (wasPlaying) { + playOrPause(); + } }); controls.speedSelect.addEventListener("change", (e) => { state.speedMs = 
Number(e.target.value); @@ -537,23 +3128,198 @@ function initControls() { state.showLabels = Boolean(e.target.checked); renderTime(state.currentTime); }); + if (controls.timingAnomalyOnly) { + controls.timingAnomalyOnly.checked = state.timingAnomalyOnly; + controls.timingAnomalyOnly.addEventListener("change", (e) => { + state.timingAnomalyOnly = Boolean(e.target.checked); + renderTimingView(); + }); + } + if (controls.timingShowPhaseExplain) { + controls.timingShowPhaseExplain.checked = state.showPhaseExplain; + controls.timingShowPhaseExplain.addEventListener("change", (e) => { + state.showPhaseExplain = Boolean(e.target.checked); + renderTimingView(); + }); + } + if (controls.timingBoundaryOnly) { + controls.timingBoundaryOnly.checked = state.timingBoundaryOnly; + controls.timingBoundaryOnly.addEventListener("change", (e) => { + state.timingBoundaryOnly = Boolean(e.target.checked); + state.timingSelectedCell = null; + renderTimingView(); + }); + } + if (controls.timingCoreFocus) { + controls.timingCoreFocus.addEventListener("change", (e) => { + const value = String(e.target.value || ""); + state.timingFocusedCoreKey = value || null; + state.timingSelectedCell = null; + renderTimingView(); + }); + } + if (controls.timingIoWaveAll) { + controls.timingIoWaveAll.addEventListener("change", (e) => { + const checked = Boolean(e.target.checked); + state.timingIoWaveExpandAll = checked; + if (checked) { + state.timingIoWaveExpandedCoreKeys = new Set((state.timingRows || []).map((c) => c.coreKey)); + } + renderTimingView(); + }); + } + if (controls.timingIoWaveCore) { + controls.timingIoWaveCore.addEventListener("change", (e) => { + const selectedKeys = new Set( + [...e.target.selectedOptions] + .map((opt) => String(opt.value || "")) + .filter(Boolean), + ); + state.timingIoWaveExpandedCoreKeys = selectedKeys; + const total = (state.timingRows || []).length; + state.timingIoWaveExpandAll = total > 0 && selectedKeys.size >= total; + renderTimingView(); + }); + } + if 
(controls.timingBaselineView) { + controls.timingBaselineView.value = state.timingBaselineView; + controls.timingBaselineView.addEventListener("change", (e) => { + state.timingBaselineView = String(e.target.value || "strict"); + renderTimingView(); + }); + } + if (controls.timingCompModel) { + controls.timingCompModel.value = state.timingCompModel; + controls.timingCompModel.addEventListener("change", (e) => { + state.timingCompModel = String(e.target.value || "hybrid"); + renderTimingView(); + }); + } + if (controls.timingExportPng) { + controls.timingExportPng.addEventListener("click", exportTimelinePng); + } + if (controls.timingJumpFirstMismatch) { + controls.timingJumpFirstMismatch.addEventListener("click", () => { + if (state.firstHybridMismatchTime == null || !Number.isFinite(state.firstHybridMismatchTime)) return; + const half = Math.floor((Number(state.timingWindowSize) || 60) / 2); + state.timingWindowStart = Math.max(0, state.firstHybridMismatchTime - half); + renderTimingView(); + }); + } + if (controls.timingWindowStart) { + controls.timingWindowStart.addEventListener("input", (e) => { + state.timingWindowStart = Number(e.target.value); + renderTimingView(); + }); + } + if (controls.timingWindowSize) { + controls.timingWindowSize.addEventListener("input", (e) => { + state.timingWindowSize = Number(e.target.value); + renderTimingView(); + }); + } + if (controls.timingZoomY) { + controls.timingZoomY.addEventListener("input", (e) => { + state.timingZoomY = clamp(Number(e.target.value) / 100, 0.6, 4); + renderTimingView(); + }); + } + if (controls.timingResetZoom) { + controls.timingResetZoom.addEventListener("click", () => { + state.timingZoomX = 1; + state.timingZoomY = 1; + const fullMin = state.timingViewport?.fullMin ?? state.minTime; + const fullMax = state.timingViewport?.fullMax ?? 
state.maxTime; + const fullSpan = Math.max(1, fullMax - fullMin + 1); + state.timingWindowSize = Math.min(120, fullSpan); + state.timingWindowStart = fullMin; + renderTimingView(); + }); + } + if (controls.timingGrid) { + controls.timingGrid.addEventListener("wheel", handleTimelineCtrlWheelZoom, { passive: false }); + controls.timingGrid.addEventListener("click", (e) => { + const label = e.target.closest("[data-core-key]"); + if (label) { + if (timingCoreLabelClickTimer) clearTimeout(timingCoreLabelClickTimer); + const key = label.getAttribute("data-core-key"); + timingCoreLabelClickTimer = setTimeout(() => { + timingCoreLabelClickTimer = null; + state.timingFocusedCoreKey = state.timingFocusedCoreKey === key ? null : key; + state.timingSelectedCell = null; + renderTimingView(); + }, 220); + return; + } + const btn = e.target.closest("[data-timing-cell]"); + if (!btn) return; + state.timingSelectedCell = btn.getAttribute("data-timing-cell"); + renderTimingView(); + }); + controls.timingGrid.addEventListener("dblclick", (e) => { + const label = e.target.closest("[data-core-key]"); + if (!label) return; + if (timingCoreLabelClickTimer) { + clearTimeout(timingCoreLabelClickTimer); + timingCoreLabelClickTimer = null; + } + const key = label.getAttribute("data-core-key"); + if (!key) return; + const expanded = new Set(state.timingIoWaveExpandedCoreKeys || []); + if (expanded.has(key)) { + expanded.delete(key); + } else { + expanded.add(key); + } + state.timingIoWaveExpandedCoreKeys = expanded; + const total = (state.timingRows || []).length; + state.timingIoWaveExpandAll = total > 0 && expanded.size >= total; + renderTimingView(); + }); + } controls.fileInput.addEventListener("change", async (e) => { const file = e.target.files?.[0]; if (!file) return; const text = await file.text(); loadTrace(text); }); + controls.yamlInput.addEventListener("change", async (e) => { + const file = e.target.files?.[0]; + if (!file) return; + const text = await file.text(); + 
loadProgramYaml(text); + }); + if (controls.reportInput) { + controls.reportInput.addEventListener("change", async (e) => { + const file = e.target.files?.[0]; + if (!file) return; + const text = await file.text(); + loadReport(text); + }); + } } function loadTrace(text) { stopPlayback(); const events = parseJsonLines(text); state.events = events; - const bounds = inferBounds(events); + state.coreIoWaveByTime = buildCoreIoWaveByTime(events); + state.timingSelectedCell = null; + state.timingFocusedCoreKey = null; + state.timingIoWaveExpandAll = false; + state.timingIoWaveExpandedCoreKeys = new Set(); + state.timingWindowStart = 0; + state.timingWindowSize = 120; + state.timingZoomX = 1; + state.timingZoomY = 1; + state.timingViewport = null; + meshZoomTransform = d3.zoomIdentity; + const bounds = resolveMeshBounds(events); state.maxX = bounds.maxX; state.maxY = bounds.maxY; const index = indexByTime(events); state.byTime = index.byTime; + state.timeKeys = index.sortedTimes; state.minTime = index.minTime; state.maxTime = index.maxTime; @@ -561,12 +3327,84 @@ function loadTrace(text) { controls.timeSlider.max = String(state.maxTime); controls.timeSlider.value = String(state.minTime); + applyAdaptiveLayout(); drawStaticScene(); renderTime(state.minTime); + renderReportView(); + renderTimingView(); +} + +function loadProgramYaml(text) { + try { + state.programSpec = parseProgramYaml(text); + state.yamlGridBounds = boundsFromProgramSpec(state.programSpec); + state.timingSelectedCell = null; + state.timingFocusedCoreKey = null; + state.timingIoWaveExpandAll = false; + state.timingIoWaveExpandedCoreKeys = new Set(); + state.timingWindowStart = 0; + state.timingZoomX = 1; + state.timingZoomY = 1; + state.timingViewport = null; + meshZoomTransform = d3.zoomIdentity; + const bounds = state.yamlGridBounds || inferBounds(state.events); + state.maxX = bounds.maxX; + state.maxY = bounds.maxY; + applyAdaptiveLayout(); + drawStaticScene(); + if (state.events.length > 0) { + 
renderTime(clamp(state.currentTime, state.minTime, state.maxTime)); + } + renderReportView(); + renderTimingView(); + } catch (err) { + state.programSpec = null; + state.yamlGridBounds = null; + state.timingReady = false; + state.timingFocusedCoreKey = null; + state.timingIoWaveExpandAll = false; + state.timingIoWaveExpandedCoreKeys = new Set(); + controls.timingSummary.textContent = `Program YAML parse error: ${err.message}`; + controls.timingGrid.innerHTML = ""; + if (controls.timingCoreFocus) { + controls.timingCoreFocus.innerHTML = ""; + controls.timingCoreFocus.value = ""; + } + if (controls.timingIoWaveCore) { + controls.timingIoWaveCore.innerHTML = ""; + } + if (controls.timingIoWaveAll) { + controls.timingIoWaveAll.checked = false; + } + if (controls.timingDrilldown) { + controls.timingDrilldown.innerHTML = + "
Program YAML parse failed. Fix YAML and reload.
"; + } + if (controls.timingCoreMini) { + controls.timingCoreMini.innerHTML = + "
Focus one core to inspect local trace details.
"; + } + renderReportView(); + } +} + +let resizeTimer = null; + +function handleResize() { + if (state.events.length === 0 && !state.programSpec) return; + if (resizeTimer) clearTimeout(resizeTimer); + resizeTimer = setTimeout(() => { + applyAdaptiveLayout(); + drawStaticScene(); + if (state.events.length > 0) renderTime(state.currentTime); + }, 120); } async function boot() { initControls(); + applyAdaptiveLayout(); + renderReportView(); + window.addEventListener("resize", handleResize); // Default behavior: load ../gemm.json.log when served from repo root. try { @@ -578,6 +3416,15 @@ async function boot() { controls.statsLine.textContent = "Default log not loaded. Use the file picker."; controls.eventDump.textContent = ""; } + + try { + const yamlResp = await fetch("../gemm.yaml"); + if (!yamlResp.ok) throw new Error(`HTTP ${yamlResp.status}`); + const yamlText = await yamlResp.text(); + loadProgramYaml(yamlText); + } catch (_) { + renderTimingView(); + } } boot(); diff --git a/tool/viz/index.html b/tool/viz/index.html index cd8528b..2a8caa9 100644 --- a/tool/viz/index.html +++ b/tool/viz/index.html @@ -3,20 +3,31 @@ - CGRA GEMM Log Viewer + CGRA Log Viewer +
-

CGRA GEMM Log Viewer

+

CGRA Log Viewer

Timeline visualization for JSONL execution traces

- - +
+ + +
+
+ + +
+
+ + +
@@ -45,7 +56,8 @@

CGRA GEMM Log Viewer

- +
+
@@ -53,6 +65,99 @@

Cycle Details


       
+ +
+

Report Overview

+

+
+
+
+ +
+

Strict Timing Offset View

+

+
+ + + + + + + + + + + +
+
+ + + +
+
+ on-time + early + late + missing + propagated +
+
+
+
+ +
+
+
diff --git a/tool/viz/styles.css b/tool/viz/styles.css index 0f2adf1..246ef65 100644 --- a/tool/viz/styles.css +++ b/tool/viz/styles.css @@ -30,6 +30,7 @@ body { radial-gradient(circle at 10% 90%, #8ecae644, transparent 38%), var(--bg); font-family: "Avenir Next", "Segoe UI", "Helvetica Neue", sans-serif; + scrollbar-gutter: stable both-edges; } .layout { @@ -47,6 +48,7 @@ body { border-radius: 14px; padding: 0.9rem 1rem; box-shadow: var(--shadow); + min-width: 0; } .topbar { @@ -75,10 +77,17 @@ h2 { .file-load { display: flex; align-items: center; - gap: 0.5rem; + gap: 0.8rem; + flex-wrap: wrap; font-size: 0.92rem; } +.file-load-item { + display: flex; + align-items: center; + gap: 0.4rem; +} + .controls .row { display: flex; gap: 0.6rem; @@ -118,8 +127,38 @@ button { padding: 0.2rem; } +.mesh-legend-panel { + display: flex; + flex-wrap: wrap; + gap: 0.45rem 0.8rem; + align-items: center; + margin: 0.1rem 0.15rem 0.5rem; + padding: 0.35rem 0.45rem; + border: 1px solid #d7caaa; + border-radius: 10px; + background: #fbf5e7; +} + +.mesh-legend-item { + display: inline-flex; + align-items: center; + gap: 0.35rem; + font-size: 0.82rem; + color: #544b3f; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; +} + +.mesh-legend-dot { + width: 10px; + height: 10px; + border-radius: 999px; + border: 1px solid rgba(48, 48, 48, 0.35); + display: inline-block; +} + #canvas { width: 100%; + height: auto; min-height: 480px; display: block; } @@ -131,7 +170,7 @@ button { } #eventDump { - max-height: 260px; + height: 260px; overflow: auto; background: #fbf7ea; border: 1px solid var(--line); @@ -140,6 +179,100 @@ button { margin-top: 0.6rem; font-size: 0.82rem; line-height: 1.45; + white-space: pre-wrap; + word-break: break-word; + overflow-wrap: anywhere; +} + +.report-warning { + margin: 0 0 0.45rem; + color: #756f63; + font-size: 0.86rem; +} + +.report-warning.warn { + color: #905b13; +} + +.report-warning.error { + color: #9d1f1f; +} + +.report-summary { + display: 
grid; + grid-template-columns: repeat(auto-fit, minmax(130px, 1fr)); + gap: 0.45rem; + margin-bottom: 0.55rem; +} + +.report-card { + border: 1px solid #ddcfb2; + border-radius: 8px; + background: #fbf7ea; + padding: 0.42rem 0.5rem; +} + +.report-card-k { + font-size: 0.73rem; + color: #7a6f58; + text-transform: uppercase; + letter-spacing: 0.02em; +} + +.report-card-v { + margin-top: 0.1rem; + font-size: 1rem; + color: #3b352d; + font-weight: 600; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; +} + +.report-hot-tiles { + border: 1px solid #ddcfb2; + border-radius: 10px; + background: #fffaf0; + overflow: auto; + max-height: 280px; +} + +.report-hot-title { + position: sticky; + top: 0; + z-index: 1; + padding: 0.42rem 0.55rem; + border-bottom: 1px solid #ddcfb2; + background: #f4ecd9; + color: #5c5348; + font-size: 0.82rem; + font-weight: 600; +} + +.report-hot-table { + width: 100%; + border-collapse: collapse; +} + +.report-hot-table th, +.report-hot-table td { + border-bottom: 1px solid #eadfc7; + padding: 0.28rem 0.4rem; + text-align: left; + font-size: 0.79rem; + color: #4f483c; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; +} + +.report-hot-table thead th { + position: sticky; + top: 29px; + background: #f8f0de; + z-index: 1; +} + +.report-empty { + padding: 0.55rem; + color: #817662; + font-size: 0.84rem; } .tile { @@ -148,6 +281,11 @@ button { stroke-width: 1.4px; } +.tile-report-heat { + fill: #d62828; + pointer-events: none; +} + .tile.active { fill: var(--tile-active); } @@ -158,6 +296,21 @@ button { pointer-events: none; } +.tile-overlay-card, +.tile-overlay-bg { + fill: rgba(255, 253, 247, 0.9); + stroke: rgba(122, 109, 84, 0.5); + stroke-width: 0.75px; +} + +.tile-overlay-text { + fill: #3a352d; + font-size: 9.5px; + font-weight: 600; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + pointer-events: none; +} + .legend-text, .driver-label { fill: #4d4d4d; @@ -170,6 +323,41 @@ button { stroke-linecap: 
round; } +.bp-path-link { + fill: none; + stroke: #b91c1c; + stroke-width: 4.2px; + stroke-linecap: round; + stroke-linejoin: round; + stroke-opacity: 0.95; + filter: drop-shadow(0 0 1px rgba(185, 28, 28, 0.35)); + pointer-events: none; +} + +.bp-tile-outline { + fill: none; + stroke: #b91c1c; + stroke-width: 3px; + pointer-events: none; +} + +.flow-data-tag { + pointer-events: none; +} + +.flow-data-bg { + fill: rgba(255, 251, 240, 0.92); + stroke: rgba(122, 109, 84, 0.48); + stroke-width: 0.7px; +} + +.flow-data-text { + fill: #3b3427; + font-size: 9px; + font-weight: 700; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; +} + .pulse { r: 5; } @@ -180,8 +368,678 @@ button { stroke-width: 0.5px; } +.timing-summary { + margin: 0 0 0.45rem; + color: var(--muted); + font-size: 0.9rem; +} + +.timing-toolbar { + display: flex; + align-items: center; + justify-content: flex-start; + flex-wrap: wrap; + gap: 0.7rem; + margin: 0 0 0.45rem; +} + +.timing-toggle { + display: inline-flex; + align-items: center; + gap: 0.35rem; + font-size: 0.84rem; + color: #5b5b5b; +} + +.timing-select select { + min-width: 130px; + padding: 0.18rem 0.3rem; + border: 1px solid #cbbda2; + border-radius: 6px; + background: #fffdf7; + color: #514a3f; +} + +#timingIoWaveCore[multiple] { + min-width: 140px; + min-height: 92px; +} + +.timing-export-setting input[type="number"] { + width: 86px; + padding: 0.2rem 0.35rem; +} + +.timing-legend { + display: flex; + gap: 0.45rem; + align-items: center; + flex-wrap: wrap; + margin-bottom: 0.55rem; +} + +.legend-chip { + border: 1px solid #bfb8a6; + border-radius: 999px; + padding: 0.12rem 0.5rem; + font-size: 0.77rem; +} + +.legend-chip.on-time { + background: #d9f5df; +} + +.legend-chip.early { + background: #ffd9d9; +} + +.legend-chip.late { + background: #ffc8c8; +} + +.legend-chip.missing { + background: #ececec; +} + +.legend-chip.propagated { + background: #f5e4e4; + opacity: 0.75; +} + +.timing-grid-wrap { + overflow: auto; + border: 
1px solid var(--line); + border-radius: 10px; + background: #fbf7ea; +} + +.timing-grid, +.timing-heatmap { + width: max-content; + min-width: 100%; + border-collapse: collapse; +} + +.timing-grid th, +.timing-grid td, +.timing-heatmap th, +.timing-heatmap td { + border-bottom: 1px solid #d8ccb0; + border-right: 1px solid #e4d8be; + vertical-align: top; + padding: 0.35rem 0.4rem; +} + +.timing-grid thead th, +.timing-heatmap thead th { + position: sticky; + top: 0; + background: #f4ecd9; + z-index: 1; + font-size: 0.78rem; + white-space: nowrap; +} + +.core-col { + min-width: 176px; + color: #544f43; + font-size: 0.8rem; + background: #f8f1df; + text-align: left; +} + +.slot-head { + min-width: 84px; + text-align: left; +} + +.timing-grid thead .core-col, +.timing-heatmap thead .core-col { + left: 0; + z-index: 3; +} + +.timing-grid tbody .core-col, +.timing-heatmap tbody .core-col { + position: sticky; + left: 0; + z-index: 2; +} + +.core-col .core-meta { + color: #7a6b55; + font-size: 0.72rem; + margin-top: 0.15rem; + font-weight: 500; +} + +.timing-heat-cell { + padding: 0.2rem 0.25rem; +} + +.timing-heat-btn { + width: 100%; + min-width: 64px; + border-radius: 8px; + border: 1px solid #c8baa0; + padding: 0.2rem 0.26rem; + display: flex; + flex-direction: column; + align-items: flex-start; + gap: 0.08rem; + cursor: pointer; + background: #fff; + transition: transform 120ms ease, box-shadow 120ms ease; +} + +.timing-heat-btn:hover { + transform: translateY(-1px); + box-shadow: 0 1px 0 rgba(0, 0, 0, 0.06), 0 2px 8px rgba(0, 0, 0, 0.08); +} + +.timing-heat-btn.selected { + box-shadow: 0 0 0 2px rgba(73, 83, 102, 0.35) inset; +} + +.timing-heat-btn.first-divergence { + border-width: 2px; +} + +.timing-heat-btn.boundary-core { + box-shadow: 0 0 0 1px rgba(66, 66, 66, 0.18); +} + +.timing-heat-btn.muted { + opacity: 0.25; +} + +.timing-heat-btn .heat-main { + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-size: 0.76rem; + font-weight: 700; + 
line-height: 1; +} + +.timing-heat-btn .heat-sub { + font-size: 0.67rem; + color: #5f5f5f; + line-height: 1.1; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; +} + +.timing-heat-btn.status-on-time { + background: rgba(217, 245, 223, var(--heat-alpha, 0.2)); + border-color: #8bbd95; +} + +.timing-heat-btn.status-early { + background: rgba(255, 217, 217, var(--heat-alpha, 0.5)); + border-color: #d88b8b; +} + +.timing-heat-btn.status-late { + background: rgba(255, 200, 200, var(--heat-alpha, 0.56)); + border-color: #d77a7a; +} + +.timing-heat-btn.status-missing { + background: rgba(236, 236, 236, var(--heat-alpha, 0.62)); + border-color: #9a9696; +} + +.timing-drilldown { + margin-top: 0.6rem; + border: 1px solid #dacfb5; + border-radius: 10px; + background: #fffaf0; + max-height: 280px; + overflow: auto; +} + +.timing-drill-head { + position: sticky; + top: 0; + z-index: 1; + background: #f4ecd9; + border-bottom: 1px solid #dacfb5; + color: #5f5548; + font-size: 0.78rem; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + padding: 0.45rem 0.55rem; + white-space: nowrap; + overflow: auto; +} + +.timing-drill-list { + display: flex; + flex-direction: column; +} + +.timing-drill-row { + display: grid; + grid-template-columns: minmax(120px, 200px) 1fr; + gap: 0.5rem; + align-items: start; + padding: 0.33rem 0.55rem; + border-bottom: 1px solid #eee2c8; + font-size: 0.74rem; +} + +.timing-drill-row .drill-op { + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-weight: 700; + color: #3f3f3f; +} + +.timing-drill-row .drill-meta { + color: #676255; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; +} + +.timing-drill-row.on-time { + background: #eefaf0; +} + +.timing-drill-row.early { + background: #fff1f1; +} + +.timing-drill-row.late { + background: #ffeaea; +} + +.timing-drill-row.missing { + background: #f3f3f3; +} + +.timing-drill-row.propagated { + opacity: 0.72; +} + +.timing-drill-empty { + color: #7d7668; + 
font-size: 0.82rem; + padding: 0.6rem; +} + +.timing-core-mini { + margin-top: 0.55rem; + border: 1px solid #dacfb5; + border-radius: 10px; + background: #fffaf0; + max-height: 210px; + overflow: auto; +} + +.timing-core-mini-head { + position: sticky; + top: 0; + z-index: 1; + background: #f4ecd9; + border-bottom: 1px solid #dacfb5; + color: #5f5548; + font-size: 0.78rem; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + padding: 0.4rem 0.55rem; + white-space: nowrap; + overflow: auto; +} + +.timing-core-mini-list { + display: flex; + flex-direction: column; +} + +.timing-core-mini-row { + padding: 0.28rem 0.55rem; + border-bottom: 1px solid #eee2c8; + font-size: 0.76rem; + color: #5a5448; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; +} + +.timing-core-mini-empty { + color: #7d7668; + font-size: 0.82rem; + padding: 0.6rem; +} + +.timing-window { + display: flex; + flex-wrap: wrap; + gap: 0.8rem; + margin: 0 0 0.55rem; +} + +.timing-window-below { + margin: 0.45rem 0 0.55rem; +} + +.timing-window-item { + gap: 0.45rem; +} + +.timing-window-item input[type="range"] { + width: min(360px, 42vw); +} + +#timingResetZoom { + height: 28px; + align-self: center; +} + +.timing-timeline-svg { + display: block; + background: #fffaf0; +} + +.timeline-axis { + stroke: #8f846d; + stroke-width: 1; +} + +.timeline-cycle-sep { + stroke: #c4b89a; + stroke-width: 1; + stroke-dasharray: 3 2; +} + +.timeline-core-sep { + stroke: #7a6f58; + stroke-width: 1.2; + stroke-dasharray: 4 3; +} + +.timeline-tick { + fill: #7a6f58; + font-size: 10px; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; +} + +.timeline-core-label { + fill: #5a5347; + font-size: 11px; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + cursor: pointer; + user-select: none; +} + +.timeline-core-label.boundary { + font-weight: 700; +} + +.timeline-core-label:hover { + fill: #2a4f9a; +} + +.timeline-core-label.focused { + fill: #1f4eb5; + font-weight: 700; + 
text-decoration: underline; +} + +.timeline-core-label.io-expanded { + fill: #6a2b96; + text-decoration: underline; + text-decoration-style: dashed; +} + +.timeline-lane-tag { + fill: #7f7460; + font-size: 10px; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; +} + +.timeline-io-tag-in { + fill: #2d6cdf; + font-weight: 700; +} + +.timeline-io-tag-out { + fill: #8f2ac7; + font-weight: 700; +} + +.timeline-rect { + rx: 1.6; + ry: 1.6; + shape-rendering: crispEdges; +} + +.timeline-rect.expected { + fill: #f4f4f4; + stroke: #8f8f8f; + stroke-width: 0.8; +} + +.timeline-rect.actual-ok { + fill: #2a7f62; + stroke: #1f604a; + stroke-width: 0.7; +} + +.timeline-rect.actual-bad { + fill: #d62828; + stroke: #8f1717; + stroke-width: 0.7; +} + +.timeline-rect.actual-comp-ok { + fill: #2d6cdf; + stroke: #1d4a97; + stroke-width: 0.8; + opacity: 0.84; +} + +.timeline-rect.actual-comp-bad { + fill: #9b2ce0; + stroke: #5d178a; + stroke-width: 0.85; + opacity: 0.9; +} + +.timeline-rect.missing { + fill: #f4f4f4; + stroke: #7a7a7a; + stroke-width: 1.1; + stroke-dasharray: 2 1; +} + +.timeline-rect.selected { + stroke-width: 1.8; + filter: drop-shadow(0 0 1px rgba(0, 0, 0, 0.28)); +} + +.timeline-io-bus { + stroke-width: 0.9; +} + +.timeline-io-bus-in { + fill: #deebff; + stroke: #7da3ea; +} + +.timeline-io-bus-out { + fill: #f1e1ff; + stroke: #b589dd; +} + +.timeline-io-bus-label { + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + pointer-events: none; + font-size: 7px; +} + +.timeline-io-bus-label-in { + fill: #214a9c; +} + +.timeline-io-bus-label-out { + fill: #6d2094; +} + +.timeline-rect-label { + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + pointer-events: none; + font-size: 7px; +} + +.timeline-rect-label-expected { + fill: #444; +} + +.timeline-rect-label-actual { + fill: #fff; +} + +.timeline-missing { + stroke: #7a7a7a; + stroke-width: 1.2; +} + +.timeline-link.ok { + stroke: rgba(54, 132, 103, 0.45); + stroke-width: 0.9; +} + 
+.timeline-link.bad { + stroke: rgba(214, 40, 40, 0.72); + stroke-width: 1.2; +} + +.timeline-link.comp-ok { + stroke: rgba(45, 108, 223, 0.5); + stroke-width: 0.9; + stroke-dasharray: 2 1; +} + +.timeline-link.comp-bad { + stroke: rgba(155, 44, 224, 0.78); + stroke-width: 1.2; + stroke-dasharray: 2 1; +} + +.timeline-legend-text { + fill: #615a4f; + font-size: 10px; +} + +.timeline-legend-exp { + fill: #fff; + stroke: #8c8c8c; +} + +.timeline-legend-act-ok { + fill: #2a7f62; +} + +.timeline-legend-act-bad { + fill: #d62828; +} + +.timeline-legend-comp-ok { + fill: #2d6cdf; +} + +.timeline-legend-comp-bad { + fill: #9b2ce0; +} + +.timeline-legend-io-in { + fill: #deebff; + stroke: #7da3ea; +} + +.timeline-legend-io-out { + fill: #f1e1ff; + stroke: #b589dd; +} + +.timeline-legend-missing { + fill: #f4f4f4; + stroke: #7a7a7a; + stroke-dasharray: 2 1; +} + +.timing-cell { + min-height: 42px; + display: flex; + flex-wrap: wrap; + gap: 0.25rem; +} + +.timing-op { + border: 1px solid #c8baa0; + border-radius: 6px; + padding: 0.1rem 0.3rem; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-size: 0.73rem; + cursor: default; + background: #fff; +} + +.timing-op.on-time { + background: #d9f5df; + border-color: #8bbd95; +} + +.timing-op.early { + background: #ffd9d9; + border-color: #d88b8b; +} + +.timing-op.late { + background: #ffc8c8; + border-color: #d77a7a; +} + +.timing-op.missing { + background: #ececec; + border-color: #bcbcbc; +} + +.timing-op.first-divergence { + box-shadow: 0 0 0 2px rgba(96, 41, 41, 0.25) inset; + font-weight: 600; +} + +.timing-op.propagated { + opacity: 0.55; +} + +.timing-empty { + color: #8b8577; + font-size: 0.73rem; +} + @media (max-width: 860px) { #canvas { min-height: 360px; } + + .core-col { + min-width: 150px; + } + + .slot-head { + min-width: 74px; + } + + .timing-heat-btn { + min-width: 54px; + padding: 0.18rem 0.22rem; + } + + .timing-drill-row { + grid-template-columns: 1fr; + gap: 0.2rem; + } + + 
.timing-window-item input[type="range"] { + width: min(280px, 60vw); + } + + .timeline-core-label { + font-size: 10px; + } } diff --git a/verify/VERIFY_OVERVIEW.md b/verify/VERIFY_OVERVIEW.md new file mode 100644 index 0000000..177632c --- /dev/null +++ b/verify/VERIFY_OVERVIEW.md @@ -0,0 +1,113 @@ +# Verify 包说明 + +本文档说明 Zeonica 中 `verify` 文件夹的职责、组成和用法。 + +--- + +## 一句话定位 + +**verify** 是 Zeonica 的**快速验证层**:在跑 cycle-accurate 仿真之前,用**静态检查 + 无时序的功能仿真**先检查 kernel 的**结构、时序约束和计算语义**,并生成可读的验证报告。不建模周期、网络延迟和 backpressure,只关心“对不对”,不关心“多快”。 + +--- + +## 三块功能 + +### 1. Lint(静态检查)— `lint.go` + +**作用**:只看 kernel YAML 和架构信息,不做执行,快速发现映射/调度错误。 + +- **STRUCT** + - 坐标格式是否合法(如 `"(x,y)"`) + - PE 坐标是否在 CGRA 范围内(如 4×4 下不能有 (5,5)) + - 同一 (PE, timestep) 内是否有**端口写冲突**(同一端口被多条指令写) +- **TIMING** + - 跨 PE 数据依赖的**时序是否满足** + - 规则:生产者写 → 消费者读,中间至少要经过 `距离 × HopLatency` 个周期 + - 支持 modulo scheduling(II > 0):用 D∈{0,1} 的迭代距离模型,减少误报 + +**输入**:`programs map[string]core.Program`(从 kernel YAML 加载)+ `arch *ArchInfo`(行列、mesh、HopLatency 等)。 +**输出**:`[]Issue`,每个 issue 带类型(STRUCT/TIMING)、PE、timestep、op 索引、消息和 Details。 + +--- + +### 2. Functional Simulator(功能仿真)— `funcsim.go` + +**作用**:按**数据依赖**执行 kernel,不建模周期、网络延迟和 backpressure,只验证**计算语义**是否正确。 + +- 执行顺序:按 timestep 拓扑执行,某条 op 的**所有源操作数就绪**才执行 +- 每个 PE 维护:寄存器、本地 memory、端口上的“数据是否到位” +- 数据带 **predicate**(valid/invalid),运算会传播 predicate(如二元 op 的 pred = pred0 AND pred1) +- 支持 35+ opcode(与 `core/emu.go` 语义对齐的 subset:算术、逻辑、内存、比较、PHI、控制等) +- **不做**:周期推进、网络延迟、SendBuf 满时的阻塞 + +**输入**:同样 `programs` + `arch`,还可 `PreloadMemory(x, y, value, addr)` 预填内存。 +**输出**:执行完后用 `GetRegisterValue(x, y, regIndex)`、`GetMemoryValue(x, y, addr)` 等查结果;若执行卡死或出错,`Run(maxSteps)` 返回 error。 + +**典型用法**:在跑 Akita cycle-accurate 仿真前,先用 funcsim 跑一遍,看结果是否和预期一致,用来区分是**编译器/映射问题**还是**仿真器/时序问题**。 + +--- + +### 3. 
Report(报告)— `report.go` + +**作用**:把 Lint + FuncSim 的结果整理成一份**可读的验证报告**(控制台或文件)。 + +- 先跑 Lint,再跑 FuncSim +- 报告内容包含: + - 加载了多少个 PE program + - Lint:多少 STRUCT / TIMING issue,每条的具体信息 + - FuncSim:是否成功完成、若有错则报错信息 + - 总结:PE 数量、issue 统计、仿真成功/失败 + - **Recommendation**:若有 TIMING 违例,会提示调整 timestep、调度或缓冲策略;若都通过,提示 kernel 可以进仿真 + +**入口**:`GenerateReport(programs, arch, maxSteps)` 返回 `*VerificationReport`,再调用 `WriteReport(w)` 或 `SaveReportToFile(filename)`。 + +--- + +## 和 cycle-accurate simulator 的关系 + +| 对比项 | verify(Lint + FuncSim) | core + runtimecfg + config(真实仿真) | +|--------------|------------------------------|----------------------------------------| +| 目的 | 快速验证“对不对” | 周期精确的“怎么执行、多快” | +| 时序 | 无周期、无网络延迟 | 有周期、有 backpressure、有延迟 | +| 执行驱动 | 数据依赖就绪即执行 | 引擎 tick、端口收发、调度策略 | +| 速度 | 很快(毫秒级) | 慢(秒级或更长) | +| 典型用法 | 改完 kernel/映射先跑一遍 | 确认无误后再跑完整仿真 | + +**总结**:verify 是仿真前的“守门员”,先保证结构和语义没问题,再上真仿真。 + +--- + +## 输入输出小结 + +- **输入** + - Kernel YAML(per-PE program,和 `core.LoadProgramFileFromYAML` 一致) + - 架构参数:`ArchInfo{Rows, Columns, Topology, HopLatency, MemCapacity, CtrlMemItems}`(verify 里自己定义,和 `runtimecfg` 的 arch spec 是分开的) + - 可选:funcsim 的 `PreloadMemory`、`maxSteps` +- **输出** + - Lint:`[]Issue` + - FuncSim:各 PE 的寄存器/内存查询接口 + `Run()` 的 error + - Report:文本报告(stdout 或文件),包含 Lint 汇总、FuncSim 结果和 Recommendation + +--- + +## 文件结构 + +| 文件 | 作用 | +|------|------| +| `verify.go` | 类型定义(Issue、ArchInfo、PEState、FunctionalSimulator)、NewFunctionalSimulator、PreloadMemory、GetRegisterValue/GetMemoryValue、GenerateReport 等对外 API | +| `lint.go` | `RunLint`、坐标校验、端口冲突、TIMING 约束检查(含 modulo 支持) | +| `funcsim.go` | `Run`、`canExecuteOp`、`executeOp`、各 opcode 的语义实现(与 core 对齐) | +| `report.go` | `VerificationReport`、`GenerateReport`、`WriteReport`、`SaveReportToFile` | +| `verify_test.go` | 单元测试 | +| `histogram_integration_test*.go` | 用真实 histogram kernel 做集成测试 | +| `cmd/verify-axpy/main.go` 等 | 按 kernel(axpy、histogram、fir、gemv)封装的 CLI,读 YAML + 调 `GenerateReport(...).SaveReportToFile(...)` | + +--- + +## 总结 + 
+- **Lint**:静态看“坐标、端口冲突、跨 PE 时序”是否合法。 +- **FuncSim**:不跑周期,只按数据流执行,看“算出来的数”对不对。 +- **Report**:把前两步结果打成一份报告,并给出是否适合进仿真的建议。 + +整体上,**verify 包就是在做“不跑完整仿真也能检查 kernel 对不对”的快速验证流水线**。 diff --git a/verify/cmd/verify-axpy/main.go b/verify/cmd/verify-axpy/main.go index 38f6f58..a00b6a2 100644 --- a/verify/cmd/verify-axpy/main.go +++ b/verify/cmd/verify-axpy/main.go @@ -32,8 +32,15 @@ func main() { report := verify.GenerateReport(programs, arch, 1000) report.WriteReport(os.Stdout) - if len(report.LintIssues) > 0 { - log.Fatalf("AXPY verification failed with %d lint issues", len(report.LintIssues)) + if report.BlockingLintIssueCount() > 0 { + log.Fatalf( + "AXPY verification failed with %d blocking lint issues (%d warnings)", + report.BlockingLintIssueCount(), + report.WarningLintIssueCount(), + ) + } + if report.WarningLintIssueCount() > 0 { + log.Printf("AXPY verification has %d non-blocking warnings", report.WarningLintIssueCount()) } if !report.SimulationOK { log.Fatalf("AXPY simulation failed: %v", report.SimulationErr) diff --git a/verify/cmd/verify-fir/main.go b/verify/cmd/verify-fir/main.go index 342b010..7689ae9 100644 --- a/verify/cmd/verify-fir/main.go +++ b/verify/cmd/verify-fir/main.go @@ -30,8 +30,15 @@ func main() { report := verify.GenerateReport(programs, arch, 1000) report.WriteReport(os.Stdout) - if len(report.LintIssues) > 0 { - log.Fatalf("FIR verification failed with %d lint issues", len(report.LintIssues)) + if report.BlockingLintIssueCount() > 0 { + log.Fatalf( + "FIR verification failed with %d blocking lint issues (%d warnings)", + report.BlockingLintIssueCount(), + report.WarningLintIssueCount(), + ) + } + if report.WarningLintIssueCount() > 0 { + log.Printf("FIR verification has %d non-blocking warnings", report.WarningLintIssueCount()) } if !report.SimulationOK { log.Fatalf("FIR simulation failed: %v", report.SimulationErr) diff --git a/verify/cmd/verify-gemv/main.go b/verify/cmd/verify-gemv/main.go new file mode 100644 index 0000000..a4fc5de --- 
/dev/null +++ b/verify/cmd/verify-gemv/main.go @@ -0,0 +1,84 @@ +package main + +import ( + "log" + "os" + "strconv" + + "github.com/sarchlab/zeonica/core" + "github.com/sarchlab/zeonica/verify" +) + +// main runs lint and functional simulation on gemv kernel +func main() { + programPath := os.Getenv("ZEONICA_PROGRAM_YAML") + if programPath == "" { + programPath = "test/testbench/gemv/tmp-generated-instructions.yaml" + } + programs := core.LoadProgramFileFromYAML(programPath) + if len(programs) == 0 { + log.Fatalf("Failed to load gemv program from %s", programPath) + } + lintOpts := verify.DefaultLintOptions() + lintOpts.EnablePrologueAwarePredicate = getEnvBool( + "VERIFY_PRED_PROLOGUE_AWARE", + lintOpts.EnablePrologueAwarePredicate, + ) + lintOpts.PredicateWarmupPassCap = getEnvInt( + "VERIFY_PRED_WARMUP_CAP", + lintOpts.PredicateWarmupPassCap, + ) + lintOpts.PredicateSteadyStatePasses = getEnvInt( + "VERIFY_PRED_STEADY_PASSES", + lintOpts.PredicateSteadyStatePasses, + ) + + arch := &verify.ArchInfo{ + Rows: 4, + Columns: 4, + Topology: "mesh", + HopLatency: 1, + MemCapacity: 2048, + CtrlMemItems: 20, + } + + report := verify.GenerateReport(programs, arch, 1000, lintOpts) + report.WriteReport(os.Stdout) + if report.BlockingLintIssueCount() > 0 { + log.Fatalf( + "GEMV verification failed with %d blocking lint issues (%d warnings)", + report.BlockingLintIssueCount(), + report.WarningLintIssueCount(), + ) + } + if report.WarningLintIssueCount() > 0 { + log.Printf("GEMV verification has %d non-blocking warnings", report.WarningLintIssueCount()) + } + if !report.SimulationOK { + log.Fatalf("GEMV simulation failed: %v", report.SimulationErr) + } +} + +func getEnvInt(name string, fallback int) int { + raw := os.Getenv(name) + if raw == "" { + return fallback + } + v, err := strconv.Atoi(raw) + if err != nil { + return fallback + } + return v +} + +func getEnvBool(name string, fallback bool) bool { + raw := os.Getenv(name) + if raw == "" { + return fallback + } + v, err 
:= strconv.ParseBool(raw) + if err != nil { + return fallback + } + return v +} diff --git a/verify/cmd/verify-histogram/main.go b/verify/cmd/verify-histogram/main.go index 829867f..0e4d322 100644 --- a/verify/cmd/verify-histogram/main.go +++ b/verify/cmd/verify-histogram/main.go @@ -31,8 +31,18 @@ func main() { report := verify.GenerateReport(programs, arch, 100) report.WriteReport(os.Stdout) - if len(report.LintIssues) > 0 { - log.Fatalf("Histogram verification failed with %d lint issues", len(report.LintIssues)) + if report.BlockingLintIssueCount() > 0 { + log.Fatalf( + "Histogram verification failed with %d blocking lint issues (%d warnings)", + report.BlockingLintIssueCount(), + report.WarningLintIssueCount(), + ) + } + if report.WarningLintIssueCount() > 0 { + log.Printf( + "Histogram verification has %d non-blocking warnings", + report.WarningLintIssueCount(), + ) } if !report.SimulationOK { log.Fatalf("Histogram simulation failed: %v", report.SimulationErr) diff --git a/verify/cmd/verify-kernelfusion/main.go b/verify/cmd/verify-kernelfusion/main.go new file mode 100644 index 0000000..e1e0a3c --- /dev/null +++ b/verify/cmd/verify-kernelfusion/main.go @@ -0,0 +1,150 @@ +package main + +import ( + "fmt" + "os" + "sort" + "strconv" + + "github.com/sarchlab/zeonica/core" + "github.com/sarchlab/zeonica/verify" +) + +//nolint:gocyclo,funlen +func main() { + programPath := os.Getenv("ZEONICA_PROGRAM_YAML") + if programPath == "" { + programPath = "test/testbench/kernelfusion/tmp-generated-instructions.yaml" + } + + rows := getEnvInt("VERIFY_ROWS", 8) + cols := getEnvInt("VERIFY_COLS", 8) + maxPrint := getEnvInt("VERIFY_MAX_PRINT", 50) + lintOpts := verify.DefaultLintOptions() + lintOpts.EnablePrologueAwarePredicate = getEnvBool( + "VERIFY_PRED_PROLOGUE_AWARE", + lintOpts.EnablePrologueAwarePredicate, + ) + lintOpts.PredicateWarmupPassCap = getEnvInt( + "VERIFY_PRED_WARMUP_CAP", + lintOpts.PredicateWarmupPassCap, + ) + lintOpts.PredicateSteadyStatePasses = 
getEnvInt( + "VERIFY_PRED_STEADY_PASSES", + lintOpts.PredicateSteadyStatePasses, + ) + + programs := core.LoadProgramFileFromYAML(programPath) + if len(programs) == 0 { + fmt.Printf("failed to load program: %s\n", programPath) + os.Exit(2) + } + + arch := &verify.ArchInfo{ + Rows: rows, + Columns: cols, + Topology: "mesh", + HopLatency: 1, + MemCapacity: 2048, + CtrlMemItems: 20, + } + + issues := verify.RunLint(programs, arch, lintOpts) + structCnt := 0 + timingCnt := 0 + predicateCnt := 0 + for _, it := range issues { + switch it.Type { + case verify.IssueStruct: + structCnt++ + case verify.IssueTiming: + timingCnt++ + case verify.IssuePredicate: + predicateCnt++ + } + } + + fmt.Printf("program: %s\n", programPath) + fmt.Printf("arch: %dx%d\n", cols, rows) + fmt.Printf( + "predicate_lint: prologueAware=%t warmupCap=%d steadyPasses=%d\n", + lintOpts.EnablePrologueAwarePredicate, + lintOpts.PredicateWarmupPassCap, + lintOpts.PredicateSteadyStatePasses, + ) + fmt.Printf( + "total issues: %d (STRUCT=%d, TIMING=%d, PREDICATE=%d)\n", + len(issues), + structCnt, + timingCnt, + predicateCnt, + ) + report := &verify.VerificationReport{LintIssues: issues} + fmt.Printf( + "severity: blocking=%d, warning=%d\n", + report.BlockingLintIssueCount(), + report.WarningLintIssueCount(), + ) + + if len(issues) == 0 { + fmt.Println("verify lint passed: no dependency/timing issues found") + return + } + + sort.Slice(issues, func(i, j int) bool { + if issues[i].Type != issues[j].Type { + return issues[i].Type < issues[j].Type + } + if issues[i].PEY != issues[j].PEY { + return issues[i].PEY < issues[j].PEY + } + if issues[i].PEX != issues[j].PEX { + return issues[i].PEX < issues[j].PEX + } + if issues[i].Time != issues[j].Time { + return issues[i].Time < issues[j].Time + } + return issues[i].OpID < issues[j].OpID + }) + + limit := len(issues) + if maxPrint > 0 && limit > maxPrint { + limit = maxPrint + } + fmt.Printf("\nshowing first %d issue(s):\n", limit) + for i := 0; i < limit; i++ { 
+ it := issues[i] + fmt.Printf( + "[%03d] %-6s PE(%d,%d) t=%d op=%d | %s\n", + i+1, it.Type, it.PEX, it.PEY, it.Time, it.OpID, it.Message, + ) + } + + if limit < len(issues) { + fmt.Printf("... %d more issue(s) not shown\n", len(issues)-limit) + } +} + +func getEnvInt(name string, fallback int) int { + raw := os.Getenv(name) + if raw == "" { + return fallback + } + v, err := strconv.Atoi(raw) + if err != nil { + return fallback + } + return v +} + +func getEnvBool(name string, fallback bool) bool { + raw := os.Getenv(name) + if raw == "" { + return fallback + } + v, err := strconv.ParseBool(raw) + if err != nil { + return fallback + } + return v +} diff --git a/verify/lint.go b/verify/lint.go index 09cc504..281b1a2 100644 --- a/verify/lint.go +++ b/verify/lint.go @@ -3,6 +3,8 @@ package verify import ( "fmt" + "sort" + "strconv" "strings" "github.com/sarchlab/zeonica/core" @@ -12,11 +14,16 @@ import ( // It validates structure (STRUCT) and simple timing constraints (TIMING). // For kernels with modulo scheduling (ii > 0), it uses a D∈{0,1} iteration // distance model to reduce false positives on loop-carried dependencies. +// Optional lint options can be provided to tune predicate analysis behavior. // Returns a list of issues found, or empty list if no issues. // //nolint:gocyclo -func RunLint(programs map[string]core.Program, arch *ArchInfo) []Issue { +func RunLint(programs map[string]core.Program, arch *ArchInfo, opts ...LintOptions) []Issue { var issues []Issue + lintOpts := DefaultLintOptions() + if len(opts) > 0 { + lintOpts = normalizeLintOptions(opts[0]) + } // Extract CompiledII from the first program that has it // (All programs should have the same II since they're from the same kernel) @@ -107,6 +114,8 @@ func RunLint(programs map[string]core.Program, arch *ArchInfo) []Issue { // TIMING: Build dependency graph and check latencies with modulo scheduling support issues = append(issues, checkTimingConstraints(programs, arch, ii)...) 
+ // PREDICATE: Check PHI/PHI_START/GRANT predicate consistency risks. + issues = append(issues, checkPredicateConstraints(programs, arch, ii, lintOpts)...) return issues } @@ -266,6 +275,428 @@ func checkTimingConstraints(programs map[string]core.Program, arch *ArchInfo, ii return issues } +type predMask uint8 + +const ( + predCanTrue predMask = 1 << iota + predCanFalse +) + +const ( + predTrueMask predMask = predCanTrue + predFalseMask predMask = predCanFalse + predUnknownMask predMask = predCanTrue | predCanFalse +) + +func predHasTrue(v predMask) bool { + return v&predCanTrue != 0 +} + +func predHasFalse(v predMask) bool { + return v&predCanFalse != 0 +} + +func predAnd(values ...predMask) predMask { + canTrue := true + canFalse := false + for _, v := range values { + canTrue = canTrue && predHasTrue(v) + canFalse = canFalse || predHasFalse(v) + } + + var out predMask + if canTrue { + out |= predCanTrue + } + if canFalse { + out |= predCanFalse + } + if out == 0 { + return predUnknownMask + } + return out +} + +func predOr(a, b predMask) predMask { + out := predMask(0) + if predHasTrue(a) || predHasTrue(b) { + out |= predCanTrue + } + if predHasFalse(a) || predHasFalse(b) { + out |= predCanFalse + } + if out == 0 { + return predUnknownMask + } + return out +} + +func parseRegisterIndex(impl string) (int, bool) { + if !strings.HasPrefix(impl, "$") { + return 0, false + } + idx, err := strconv.Atoi(strings.TrimPrefix(impl, "$")) + if err != nil { + return 0, false + } + return idx, true +} + +func parseImmediateInt(impl string) (int64, bool) { + if !strings.HasPrefix(impl, "#") { + return 0, false + } + v := strings.TrimPrefix(impl, "#") + num, err := strconv.ParseInt(v, 0, 64) + if err == nil { + return num, true + } + u, err := strconv.ParseUint(v, 0, 64) + if err != nil { + return 0, false + } + return int64(u), true +} + +func operandPredMask(operand core.Operand, regPred map[int]predMask) predMask { + if idx, ok := parseRegisterIndex(operand.Impl); ok { + if 
p, exists := regPred[idx]; exists { + return p + } + return predUnknownMask + } + if strings.HasPrefix(operand.Impl, "#") { + return predTrueMask + } + if isPortOperand(operand.Impl) { + return predUnknownMask + } + return predUnknownMask +} + +func predicateGateMask(operand core.Operand) predMask { + v, ok := parseImmediateInt(operand.Impl) + if !ok { + return predUnknownMask + } + if v == 0 { + return predFalseMask + } + return predTrueMask +} + +func writeRegisterDsts(op core.Operation, regPred map[int]predMask, pred predMask) { + for _, dst := range op.DstOperands.Operands { + if idx, ok := parseRegisterIndex(dst.Impl); ok { + regPred[idx] = pred + } + } +} + +func andFromSrcOperands(op core.Operation, regPred map[int]predMask) predMask { + if len(op.SrcOperands.Operands) == 0 { + return predUnknownMask + } + preds := make([]predMask, 0, len(op.SrcOperands.Operands)) + for _, src := range op.SrcOperands.Operands { + preds = append(preds, operandPredMask(src, regPred)) + } + return predAnd(preds...) 
+} + +type predicateStage string + +const ( + predicateStageWarmup predicateStage = "warmup" + predicateStageSteady predicateStage = "steady" +) + +type predicateRiskStat struct { + x int + y int + t int + opID int + message string + opcode string + + totalHits int + definiteHits int + stageHits map[predicateStage]int +} + +func newPredicateRiskStat(x, y, t, opID int, message, opcode string) *predicateRiskStat { + return &predicateRiskStat{ + x: x, + y: y, + t: t, + opID: opID, + message: message, + opcode: opcode, + stageHits: map[predicateStage]int{ + predicateStageWarmup: 0, + predicateStageSteady: 0, + }, + } +} + +func (p *predicateRiskStat) mark(stage predicateStage, definite bool) { + p.totalHits++ + p.stageHits[stage]++ + if definite { + p.definiteHits++ + } +} + +func (p *predicateRiskStat) certainty() string { + if p.totalHits > 0 && p.definiteHits == p.totalHits { + return "definite" + } + return "possible" +} + +func computePredicatePassWindows(maxInvalid, ii int, opts LintOptions) (int, int) { + if !opts.EnablePrologueAwarePredicate { + passCount := maxInvalid + 1 + if passCount < 1 { + passCount = 1 + } + if passCount > 4 { + // Keep lint bounded in legacy mode. 
+ passCount = 4 + } + return passCount, 0 + } + + warmupPasses := maxInvalid + 1 + if ii > 0 && ii+1 > warmupPasses { + warmupPasses = ii + 1 + } + if warmupPasses < 1 { + warmupPasses = 1 + } + if warmupPasses > opts.PredicateWarmupPassCap { + warmupPasses = opts.PredicateWarmupPassCap + } + + steadyPasses := opts.PredicateSteadyStatePasses + if steadyPasses < 0 { + steadyPasses = 0 + } + return warmupPasses, steadyPasses +} + +func recordPredicateRisk( + stats map[string]*predicateRiskStat, + key string, + x, y, t, opID int, + message, opcode string, + stage predicateStage, + definite bool, +) { + s, exists := stats[key] + if !exists { + s = newPredicateRiskStat(x, y, t, opID, message, opcode) + stats[key] = s + } + s.mark(stage, definite) +} + +//nolint:gocyclo +func checkPredicateConstraints( + programs map[string]core.Program, + arch *ArchInfo, + ii int, + opts LintOptions, +) []Issue { + var issues []Issue + + type opCursor struct { + timeIdx int + op core.Operation + invalidRem int + } + + for coordStr, prog := range programs { + x, y, err := parseCoordinate(coordStr) + if err != nil || x < 0 || x >= arch.Columns || y < 0 || y >= arch.Rows { + continue + } + + regPred := make(map[int]predMask) + phiStartSeen := make(map[int]bool) + grantOnceSeen := make(map[int]bool) + riskStats := make(map[string]*predicateRiskStat) + + ops := make([]*opCursor, 0) + maxInvalid := 0 + for _, entry := range prog.EntryBlocks { + for t, ig := range entry.InstructionGroups { + for _, op := range ig.Operations { + if op.InvalidIterations > maxInvalid { + maxInvalid = op.InvalidIterations + } + ops = append(ops, &opCursor{ + timeIdx: t, + op: op, + invalidRem: op.InvalidIterations, + }) + } + } + } + + warmupPasses, steadyPasses := computePredicatePassWindows(maxInvalid, ii, opts) + totalPasses := warmupPasses + steadyPasses + if totalPasses < 1 { + totalPasses = 1 + } + + for pass := 0; pass < totalPasses; pass++ { + stage := predicateStageWarmup + if pass >= warmupPasses { + 
stage = predicateStageSteady + } + for _, item := range ops { + if item.invalidRem > 0 { + item.invalidRem-- + continue + } + + op := item.op + opName := strings.ToUpper(op.OpCode) + + switch opName { + case "PHI_START": + if len(op.SrcOperands.Operands) < 2 { + writeRegisterDsts(op, regPred, predUnknownMask) + continue + } + src1 := operandPredMask(op.SrcOperands.Operands[0], regPred) + src2 := operandPredMask(op.SrcOperands.Operands[1], regPred) + + if !phiStartSeen[op.ID] { + if predHasFalse(src1) { + recordPredicateRisk( + riskStats, + fmt.Sprintf("phi_start_first:%d", op.ID), + x, y, item.timeIdx, op.ID, + fmt.Sprintf("PHI_START id=%d first source may have pred=false on first execution", op.ID), + opName, + stage, + src1 == predFalseMask, + ) + } + phiStartSeen[op.ID] = true + writeRegisterDsts(op, regPred, src1) + } else { + if predHasTrue(src1) && predHasTrue(src2) { + recordPredicateRisk( + riskStats, + fmt.Sprintf("phi_start_both_true:%d", op.ID), + x, y, item.timeIdx, op.ID, + fmt.Sprintf("PHI_START id=%d may see both source predicates true", op.ID), + opName, + stage, + src1 == predTrueMask && src2 == predTrueMask, + ) + } + writeRegisterDsts(op, regPred, predOr(src1, src2)) + } + case "PHI": + if len(op.SrcOperands.Operands) < 2 { + writeRegisterDsts(op, regPred, predUnknownMask) + continue + } + src1 := operandPredMask(op.SrcOperands.Operands[0], regPred) + src2 := operandPredMask(op.SrcOperands.Operands[1], regPred) + if predHasTrue(src1) && predHasTrue(src2) { + recordPredicateRisk( + riskStats, + fmt.Sprintf("phi_both_true:%d", op.ID), + x, y, item.timeIdx, op.ID, + fmt.Sprintf("PHI id=%d may have both source predicates true", op.ID), + opName, + stage, + src1 == predTrueMask && src2 == predTrueMask, + ) + } + writeRegisterDsts(op, regPred, predOr(src1, src2)) + case "GRANT_PREDICATE", "GPRED": + if len(op.SrcOperands.Operands) < 2 { + writeRegisterDsts(op, regPred, predUnknownMask) + continue + } + srcPred := 
operandPredMask(op.SrcOperands.Operands[0], regPred) + predPred := operandPredMask(op.SrcOperands.Operands[1], regPred) + gate := predicateGateMask(op.SrcOperands.Operands[1]) + writeRegisterDsts(op, regPred, predAnd(srcPred, predPred, gate)) + case "GRANT_ONCE": + if len(op.SrcOperands.Operands) == 0 { + writeRegisterDsts(op, regPred, predUnknownMask) + continue + } + srcPred := operandPredMask(op.SrcOperands.Operands[0], regPred) + if !grantOnceSeen[op.ID] { + grantOnceSeen[op.ID] = true + writeRegisterDsts(op, regPred, srcPred) + } else { + writeRegisterDsts(op, regPred, predFalseMask) + } + case "MOV", "DATA_MOV", "CTRL_MOV", "SEXT", "ZEXT", "CAST_FPTOSI", "NOT", "LOAD": + if len(op.SrcOperands.Operands) == 0 { + writeRegisterDsts(op, regPred, predUnknownMask) + continue + } + writeRegisterDsts(op, regPred, operandPredMask(op.SrcOperands.Operands[0], regPred)) + case "CONSTANT": + writeRegisterDsts(op, regPred, predTrueMask) + case "PHI_CONST": + if len(op.SrcOperands.Operands) < 2 { + writeRegisterDsts(op, regPred, predUnknownMask) + continue + } + src1 := operandPredMask(op.SrcOperands.Operands[0], regPred) + src2 := operandPredMask(op.SrcOperands.Operands[1], regPred) + writeRegisterDsts(op, regPred, predOr(src1, src2)) + case "ADD", "SUB", "MUL", "DIV", "FADD", "FSUB", "FMUL", "FDIV", + "OR", "XOR", "AND", "SHL", "LLS", "LRS", "GEP", "MUL_ADD", + "ICMP_EQ", "ICMP_SLT", "ICMP_SGT", "ICMP_SGE", "ICMP_SLE", "ICMP_SNE", "LT_EX": + writeRegisterDsts(op, regPred, andFromSrcOperands(op, regPred)) + default: + // Unknown opcode to lint: keep analysis conservative. 
+ writeRegisterDsts(op, regPred, predUnknownMask) + } + } + } + + keys := make([]string, 0, len(riskStats)) + for key := range riskStats { + keys = append(keys, key) + } + sort.Strings(keys) + for _, key := range keys { + stat := riskStats[key] + issues = append(issues, Issue{ + Type: IssuePredicate, + PEX: stat.x, + PEY: stat.y, + Time: stat.t, + OpID: stat.opID, + Message: stat.message, + Details: map[string]interface{}{ + "certainty": stat.certainty(), + "opcode": stat.opcode, + "warmup_hits": stat.stageHits[predicateStageWarmup], + "steady_hits": stat.stageHits[predicateStageSteady], + "total_hits": stat.totalHits, + "definite_hits": stat.definiteHits, + }, + }) + } + } + + return issues +} + // isPortOperand checks if an operand is a port (direction name) func isPortOperand(impl string) bool { dirNames := map[string]bool{ diff --git a/verify/report.go b/verify/report.go index b6ffd4b..1f8050f 100644 --- a/verify/report.go +++ b/verify/report.go @@ -11,18 +11,79 @@ import ( // VerificationReport represents a complete verification report type VerificationReport struct { - ProgramCount int - LintIssues []Issue - StructIssues []Issue - TimingIssues []Issue - SimulationErr error - SimulationOK bool - Arch *ArchInfo - Programs map[string]core.Program + ProgramCount int + LintIssues []Issue + StructIssues []Issue + TimingIssues []Issue + PredicateIssues []Issue + SimulationErr error + SimulationOK bool + Arch *ArchInfo + Programs map[string]core.Program } -// GenerateReport runs both lint and functional simulation, returns a report -func GenerateReport(programs map[string]core.Program, arch *ArchInfo, maxSimSteps int) *VerificationReport { +func predicateIssueIsPossible(issue Issue) bool { + if issue.Type != IssuePredicate { + return false + } + if issue.Details == nil { + return false + } + certainty, ok := issue.Details["certainty"] + if !ok { + return false + } + certaintyStr, ok := certainty.(string) + if !ok { + return false + } + return 
strings.EqualFold(strings.TrimSpace(certaintyStr), "possible") +} + +// BlockingLintIssues returns lint issues that should fail verification. +// Current policy: +// - STRUCT/TIMING issues are always blocking. +// - PREDICATE issues are blocking unless tagged as certainty=possible. +func (r *VerificationReport) BlockingLintIssues() []Issue { + blocking := make([]Issue, 0, len(r.LintIssues)) + for _, issue := range r.LintIssues { + if issue.Type == IssuePredicate && predicateIssueIsPossible(issue) { + continue + } + blocking = append(blocking, issue) + } + return blocking +} + +// WarningLintIssues returns non-blocking lint issues (currently predicate possible). +func (r *VerificationReport) WarningLintIssues() []Issue { + warnings := make([]Issue, 0) + for _, issue := range r.LintIssues { + if issue.Type == IssuePredicate && predicateIssueIsPossible(issue) { + warnings = append(warnings, issue) + } + } + return warnings +} + +// BlockingLintIssueCount returns number of blocking lint issues. +func (r *VerificationReport) BlockingLintIssueCount() int { + return len(r.BlockingLintIssues()) +} + +// WarningLintIssueCount returns number of warning lint issues. +func (r *VerificationReport) WarningLintIssueCount() int { + return len(r.WarningLintIssues()) +} + +// GenerateReport runs both lint and functional simulation, returns a report. +// Optional lint options can be provided to tune predicate analysis. +func GenerateReport( + programs map[string]core.Program, + arch *ArchInfo, + maxSimSteps int, + opts ...LintOptions, +) *VerificationReport { report := &VerificationReport{ ProgramCount: len(programs), Arch: arch, @@ -30,13 +91,18 @@ func GenerateReport(programs map[string]core.Program, arch *ArchInfo, maxSimStep } // Run lint - report.LintIssues = RunLint(programs, arch) + report.LintIssues = RunLint(programs, arch, opts...) 
// Categorize issues for _, issue := range report.LintIssues { - if issue.Type == IssueStruct { + switch issue.Type { + case IssueStruct: report.StructIssues = append(report.StructIssues, issue) - } else { + case IssueTiming: + report.TimingIssues = append(report.TimingIssues, issue) + case IssuePredicate: + report.PredicateIssues = append(report.PredicateIssues, issue) + default: report.TimingIssues = append(report.TimingIssues, issue) } } @@ -74,6 +140,12 @@ func (r *VerificationReport) WriteReport(w io.Writer) { fmt.Fprintln(w, "✓ No lint issues found!") } else { fmt.Fprintf(w, "⚠ Found %d lint issues:\n\n", len(r.LintIssues)) + fmt.Fprintf( + w, + " Blocking: %d, Warning: %d\n", + r.BlockingLintIssueCount(), + r.WarningLintIssueCount(), + ) if len(r.StructIssues) > 0 { fmt.Fprintf(w, "\nSTRUCT ISSUES (%d):\n", len(r.StructIssues)) @@ -108,6 +180,31 @@ func (r *VerificationReport) WriteReport(w io.Writer) { fmt.Fprintln(w) } } + + if len(r.PredicateIssues) > 0 { + fmt.Fprintf(w, "\nPREDICATE ISSUES (%d):\n", len(r.PredicateIssues)) + fmt.Fprintln(w, dash) + for i, issue := range r.PredicateIssues { + fmt.Fprintf(w, " Issue %d: [PE(%d,%d) t=%d op=%d]\n", + i+1, issue.PEX, issue.PEY, issue.Time, issue.OpID) + fmt.Fprintf(w, " Message: %s\n", issue.Message) + if issue.Details != nil { + if opCode, ok := issue.Details["opcode"]; ok { + fmt.Fprintf(w, " OpCode: %v\n", opCode) + } + if certainty, ok := issue.Details["certainty"]; ok { + fmt.Fprintf(w, " Certainty: %v\n", certainty) + } + if warmupHits, ok := issue.Details["warmup_hits"]; ok { + fmt.Fprintf(w, " Warmup hits: %v\n", warmupHits) + } + if steadyHits, ok := issue.Details["steady_hits"]; ok { + fmt.Fprintf(w, " Steady hits: %v\n", steadyHits) + } + } + fmt.Fprintln(w) + } + } } // STAGE 2: FUNCTIONAL SIMULATION @@ -127,8 +224,14 @@ func (r *VerificationReport) WriteReport(w io.Writer) { fmt.Fprintln(w, separator) fmt.Fprintf(w, "Program Structure: %d PEs deployed\n", r.ProgramCount) - fmt.Fprintf(w, "Lint 
Result: %d issues detected (%d STRUCT, %d TIMING)\n", - len(r.LintIssues), len(r.StructIssues), len(r.TimingIssues)) + fmt.Fprintf(w, "Lint Result: %d issues detected (%d STRUCT, %d TIMING, %d PREDICATE)\n", + len(r.LintIssues), len(r.StructIssues), len(r.TimingIssues), len(r.PredicateIssues)) + fmt.Fprintf( + w, + "Lint Severity: %d blocking, %d warning\n", + r.BlockingLintIssueCount(), + r.WarningLintIssueCount(), + ) simStatus := "SUCCESS" if !r.SimulationOK { simStatus = "FAILED: " + r.SimulationErr.Error() @@ -139,13 +242,14 @@ func (r *VerificationReport) WriteReport(w io.Writer) { fmt.Fprintln(w, "RECOMMENDATION") fmt.Fprintln(w, separator) - if len(r.TimingIssues) > 0 { - fmt.Fprintln(w, "⚠ TIMING VIOLATIONS DETECTED") - fmt.Fprintln(w, "This kernel has cross-PE communication constraints") - fmt.Fprintln(w, "that are not satisfied. Consider:") - fmt.Fprintln(w, " 1. Adjusting operation timesteps to allow latency") - fmt.Fprintln(w, " 2. Modifying the scheduling to respect network delays") - fmt.Fprintln(w, " 3. 
Using buffering or pipelining strategies") + if r.BlockingLintIssueCount() > 0 { + fmt.Fprintln(w, "⚠ BLOCKING LINT ISSUES DETECTED") + fmt.Fprintln(w, "This kernel still has structural/timing/definite-predicate issues.") + fmt.Fprintln(w, "Fix blocking issues before trusting simulation results.") + } else if r.WarningLintIssueCount() > 0 { + fmt.Fprintln(w, "⚠ PREDICATE RISKS DETECTED") + fmt.Fprintln(w, "Only non-blocking predicate warnings are present (certainty=possible).") + fmt.Fprintln(w, "You may still review PHI/PHI_START/GPRED flows for robustness.") } else { fmt.Fprintln(w, "✓ KERNEL PASSED ALL CHECKS") fmt.Fprintln(w, "The kernel is ready for simulation.") diff --git a/verify/verify.go b/verify/verify.go index d42ea0e..16e4d7b 100644 --- a/verify/verify.go +++ b/verify/verify.go @@ -137,11 +137,13 @@ const ( IssueStruct IssueType = "STRUCT" // Mapping/structure error (illegal PE, port conflict) // IssueTiming indicates a dependency/timing lint issue. IssueTiming IssueType = "TIMING" // Dependency/timing error (insufficient latency) + // IssuePredicate indicates predicate-consistency risk in control/dataflow ops. + IssuePredicate IssueType = "PREDICATE" // Predicate risk (PHI/PHI_START/GPRED interactions) ) // Issue represents a single lint issue type Issue struct { - Type IssueType // STRUCT or TIMING + Type IssueType // STRUCT, TIMING, or PREDICATE PEX int // PE X coordinate (-1 if not applicable) PEY int // PE Y coordinate (-1 if not applicable) Time int // Timestep (-1 if not applicable) @@ -160,6 +162,41 @@ type ArchInfo struct { CtrlMemItems int // Control memory entries per PE } +const ( + defaultPredicateWarmupPassCap = 8 + defaultPredicateSteadyPassCount = 2 +) + +// LintOptions controls static lint behavior. +type LintOptions struct { + // EnablePrologueAwarePredicate enables bounded warmup+steady analysis for predicate checks. 
+ EnablePrologueAwarePredicate bool + // PredicateWarmupPassCap bounds warmup passes used to consume invalid_iterations/prologue. + PredicateWarmupPassCap int + // PredicateSteadyStatePasses controls extra passes after warmup to inspect steady-state risks. + PredicateSteadyStatePasses int +} + +// DefaultLintOptions returns the default lint configuration. +func DefaultLintOptions() LintOptions { + return LintOptions{ + EnablePrologueAwarePredicate: true, + PredicateWarmupPassCap: defaultPredicateWarmupPassCap, + PredicateSteadyStatePasses: defaultPredicateSteadyPassCount, + } +} + +func normalizeLintOptions(opts LintOptions) LintOptions { + out := opts + if out.PredicateWarmupPassCap <= 0 { + out.PredicateWarmupPassCap = defaultPredicateWarmupPassCap + } + if out.PredicateSteadyStatePasses < 0 { + out.PredicateSteadyStatePasses = 0 + } + return out +} + // PEState captures the runtime state of a single PE (for functional simulator) type PEState struct { Registers map[int]core.Data // Register file: register index → Data diff --git a/verify/verify_test.go b/verify/verify_test.go index 76d446c..9201f2f 100644 --- a/verify/verify_test.go +++ b/verify/verify_test.go @@ -2,6 +2,7 @@ package verify import ( + "strings" "testing" "github.com/sarchlab/zeonica/core" @@ -264,3 +265,353 @@ func TestFunctionalSimulatorMemory(t *testing.T) { t.Errorf("Expected $1 = 42 (from memory), got %d", val1) } } + +func TestRunLintPredicatePhiStartFirstSourceRisk(t *testing.T) { + arch := &ArchInfo{ + Rows: 2, + Columns: 2, + Topology: "mesh", + HopLatency: 1, + MemCapacity: 1024, + CtrlMemItems: 256, + } + + // Force $0 predicate=false, then PHI_START reads $0 as first source. 
+ prog := core.Program{ + EntryBlocks: []core.EntryBlock{ + { + InstructionGroups: []core.InstructionGroup{ + { + Operations: []core.Operation{ + { + OpCode: "GRANT_PREDICATE", + SrcOperands: core.OperandList{ + Operands: []core.Operand{ + {Impl: "#1", Color: "RED"}, + {Impl: "#0", Color: "RED"}, + }, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{ + {Impl: "$0", Color: "RED"}, + }, + }, + }, + }, + }, + { + Operations: []core.Operation{ + { + OpCode: "PHI_START", + ID: 145, + SrcOperands: core.OperandList{ + Operands: []core.Operand{ + {Impl: "$0", Color: "RED"}, + {Impl: "$1", Color: "RED"}, + }, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{ + {Impl: "$2", Color: "RED"}, + }, + }, + }, + }, + }, + }, + }, + }, + } + + issues := RunLint(map[string]core.Program{"(0, 0)": prog}, arch) + + found := false + for _, issue := range issues { + if issue.Type == IssuePredicate && strings.Contains(issue.Message, "PHI_START") { + found = true + break + } + } + if !found { + t.Fatalf("expected a PREDICATE issue for PHI_START first-source risk, got %v", issues) + } +} + +func TestRunLintPredicatePhiBothTrueRisk(t *testing.T) { + arch := &ArchInfo{ + Rows: 2, + Columns: 2, + Topology: "mesh", + HopLatency: 1, + MemCapacity: 1024, + CtrlMemItems: 256, + } + + // $0 and $1 are both definitely true before PHI. 
+ prog := core.Program{ + EntryBlocks: []core.EntryBlock{ + { + InstructionGroups: []core.InstructionGroup{ + { + Operations: []core.Operation{ + { + OpCode: "MOV", + SrcOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "#1", Color: "RED"}}, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "$0", Color: "RED"}}, + }, + }, + }, + }, + { + Operations: []core.Operation{ + { + OpCode: "MOV", + SrcOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "#2", Color: "RED"}}, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "$1", Color: "RED"}}, + }, + }, + }, + }, + { + Operations: []core.Operation{ + { + OpCode: "PHI", + ID: 111, + SrcOperands: core.OperandList{ + Operands: []core.Operand{ + {Impl: "$0", Color: "RED"}, + {Impl: "$1", Color: "RED"}, + }, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "$2", Color: "RED"}}, + }, + }, + }, + }, + }, + }, + }, + } + + issues := RunLint(map[string]core.Program{"(0, 0)": prog}, arch) + + found := false + for _, issue := range issues { + if issue.Type == IssuePredicate && strings.Contains(issue.Message, "PHI") { + found = true + break + } + } + if !found { + t.Fatalf("expected a PREDICATE issue for PHI both-true risk, got %v", issues) + } +} + +func TestVerificationReportBlockingAndWarningCounts(t *testing.T) { + report := &VerificationReport{ + LintIssues: []Issue{ + {Type: IssueStruct, Message: "struct issue"}, + {Type: IssueTiming, Message: "timing issue"}, + { + Type: IssuePredicate, + Message: "predicate possible", + Details: map[string]interface{}{"certainty": "possible"}, + }, + { + Type: IssuePredicate, + Message: "predicate definite", + Details: map[string]interface{}{"certainty": "definite"}, + }, + { + Type: IssuePredicate, + Message: "predicate without certainty", + }, + }, + } + + if got := report.WarningLintIssueCount(); got != 1 { + t.Fatalf("expected 1 warning issue, got %d", got) + } + if got := 
report.BlockingLintIssueCount(); got != 4 { + t.Fatalf("expected 4 blocking issues, got %d", got) + } +} + +func TestRunLintPredicatePhiStartInvalidIterationsPrologueSafe(t *testing.T) { + arch := &ArchInfo{ + Rows: 2, + Columns: 2, + Topology: "mesh", + HopLatency: 1, + MemCapacity: 1024, + CtrlMemItems: 256, + } + + // The first PHI_START execution should see $0=true. + // A later GPRED overwrites $0 predicate to false, but only after one invalid iteration. + prog := core.Program{ + EntryBlocks: []core.EntryBlock{ + { + InstructionGroups: []core.InstructionGroup{ + { + Operations: []core.Operation{ + { + OpCode: "MOV", + SrcOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "#1", Color: "RED"}}, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "$0", Color: "RED"}}, + }, + }, + }, + }, + { + Operations: []core.Operation{ + { + OpCode: "GRANT_PREDICATE", + SrcOperands: core.OperandList{ + Operands: []core.Operand{ + {Impl: "#1", Color: "RED"}, + {Impl: "#0", Color: "RED"}, + }, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "$2", Color: "RED"}}, + }, + }, + }, + }, + { + Operations: []core.Operation{ + { + OpCode: "GRANT_PREDICATE", + InvalidIterations: 1, + SrcOperands: core.OperandList{ + Operands: []core.Operand{ + {Impl: "#1", Color: "RED"}, + {Impl: "#0", Color: "RED"}, + }, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "$0", Color: "RED"}}, + }, + }, + }, + }, + { + Operations: []core.Operation{ + { + OpCode: "PHI_START", + ID: 210, + SrcOperands: core.OperandList{ + Operands: []core.Operand{ + {Impl: "$0", Color: "RED"}, + {Impl: "$2", Color: "RED"}, + }, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "$3", Color: "RED"}}, + }, + }, + }, + }, + }, + }, + }, + } + + issues := RunLint(map[string]core.Program{"(0, 0)": prog}, arch) + for _, issue := range issues { + if issue.Type == IssuePredicate && strings.Contains(issue.Message, "first source") { + 
t.Fatalf("unexpected PHI_START first-source issue with prologue protection: %+v", issue) + } + } +} + +func TestRunLintPredicatePhiSteadyStateAfterWarmupStillDetected(t *testing.T) { + arch := &ArchInfo{ + Rows: 2, + Columns: 2, + Topology: "mesh", + HopLatency: 1, + MemCapacity: 1024, + CtrlMemItems: 256, + } + + // All three ops become executable after warmup; PHI should still be flagged. + prog := core.Program{ + EntryBlocks: []core.EntryBlock{ + { + InstructionGroups: []core.InstructionGroup{ + { + Operations: []core.Operation{ + { + OpCode: "MOV", + InvalidIterations: 1, + SrcOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "#1", Color: "RED"}}, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "$0", Color: "RED"}}, + }, + }, + }, + }, + { + Operations: []core.Operation{ + { + OpCode: "MOV", + InvalidIterations: 1, + SrcOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "#2", Color: "RED"}}, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "$1", Color: "RED"}}, + }, + }, + }, + }, + { + Operations: []core.Operation{ + { + OpCode: "PHI", + ID: 211, + InvalidIterations: 1, + SrcOperands: core.OperandList{ + Operands: []core.Operand{ + {Impl: "$0", Color: "RED"}, + {Impl: "$1", Color: "RED"}, + }, + }, + DstOperands: core.OperandList{ + Operands: []core.Operand{{Impl: "$2", Color: "RED"}}, + }, + }, + }, + }, + }, + }, + }, + } + + issues := RunLint(map[string]core.Program{"(0, 0)": prog}, arch) + foundDefinitePhi := false + for _, issue := range issues { + if issue.Type == IssuePredicate && strings.Contains(issue.Message, "PHI id=211") { + if certainty, ok := issue.Details["certainty"].(string); ok && certainty == "definite" { + foundDefinitePhi = true + } + } + } + if !foundDefinitePhi { + t.Fatalf("expected definite PHI predicate issue after warmup, got: %+v", issues) + } +}