diff --git a/app/core/clients/proxy.py b/app/core/clients/proxy.py index a60eeab2e..6a8a66cdf 100644 --- a/app/core/clients/proxy.py +++ b/app/core/clients/proxy.py @@ -944,18 +944,34 @@ def _is_native_codex_originator(originator: str | None) -> bool: return stripped in _NATIVE_CODEX_ORIGINATORS +def _payload_uses_image_generation_tool(payload: Mapping[str, JsonValue]) -> bool: + tools = payload.get("tools") + if not isinstance(tools, list): + return False + for tool in tools: + if not isinstance(tool, dict): + continue + tool_type = tool.get("type") + if tool_type == "image_generation": + return True + return False + + def _resolve_stream_transport( *, transport: str, transport_override: str | None, model: str | None, headers: Mapping[str, str], + has_image_generation_tool: bool = False, ) -> str: configured = _configured_stream_transport(transport=transport, transport_override=transport_override) if configured == "websocket": return "websocket" if configured == "http": return "http" + if has_image_generation_tool: + return "http" if _has_native_codex_transport_headers(headers): return "websocket" @@ -1598,6 +1614,7 @@ async def stream_responses( transport_override=upstream_stream_transport_override, model=payload.model, headers=headers, + has_image_generation_tool=_payload_uses_image_generation_tool(payload_dict), ) if transport == "websocket": upstream_headers = _build_upstream_websocket_headers(headers, access_token, account_id) diff --git a/app/core/config/settings.py b/app/core/config/settings.py index a80fc7ffc..c02d639dd 100644 --- a/app/core/config/settings.py +++ b/app/core/config/settings.py @@ -135,7 +135,11 @@ class Settings(BaseSettings): compact_request_budget_seconds: float = Field(default=75.0, gt=0) stream_idle_timeout_seconds: float = 300.0 proxy_downstream_websocket_idle_timeout_seconds: float = Field(default=120.0, gt=0) - max_sse_event_bytes: int = Field(default=2 * 1024 * 1024, gt=0) + # Applies to both upstream SSE event buffering and 
upstream websocket message + # frames. Keep the default aligned with the common 16 MiB websocket ceiling so + # large built-in tool payloads (for example image_generation outputs) do not + # fail locally with a 1009 before upstream completion. + max_sse_event_bytes: int = Field(default=16 * 1024 * 1024, gt=0) auth_base_url: str = "https://auth.openai.com" oauth_client_id: str = "app_EMoamEEZ73f0CkXaXp7hrann" oauth_originator: str = "codex_chatgpt_desktop" diff --git a/app/core/openai/requests.py b/app/core/openai/requests.py index 3f6bd7ba7..7cfef32b2 100644 --- a/app/core/openai/requests.py +++ b/app/core/openai/requests.py @@ -69,7 +69,7 @@ def normalize_tool_choice(choice: JsonValue | None) -> JsonValue | None: return choice -def validate_tool_types(tools: list[JsonValue]) -> list[JsonValue]: +def validate_tool_types(tools: list[JsonValue], *, allow_builtin_tools: bool = False) -> list[JsonValue]: normalized_tools: list[JsonValue] = [] for tool in tools: if not is_json_mapping(tool): @@ -83,7 +83,7 @@ def validate_tool_types(tools: list[JsonValue]) -> list[JsonValue]: tool = dict(tool_mapping) tool["type"] = normalized_type tool_type = normalized_type - if tool_type in UNSUPPORTED_TOOL_TYPES: + if not allow_builtin_tools and tool_type in UNSUPPORTED_TOOL_TYPES: raise ValueError(f"Unsupported tool type: {tool_type}") normalized_tools.append(tool) return normalized_tools @@ -379,7 +379,7 @@ def _normalize_previous_response_id(cls, value: str | None) -> str | None: @field_validator("tools") @classmethod def _validate_tools(cls, value: list[JsonValue]) -> list[JsonValue]: - return validate_tool_types(value) + return validate_tool_types(value, allow_builtin_tools=True) @field_validator("tool_choice") @classmethod @@ -511,6 +511,9 @@ def _sort_keys_recursive(value: JsonValue) -> JsonValue: def _strip_compact_unsupported_fields(payload: MutableJsonObject) -> MutableJsonObject: payload = _strip_unsupported_fields(payload) payload.pop("store", None) + 
payload.pop("tools", None) + payload.pop("tool_choice", None) + payload.pop("parallel_tool_calls", None) return payload diff --git a/app/core/openai/v1_requests.py b/app/core/openai/v1_requests.py index 19974290f..a430ac638 100644 --- a/app/core/openai/v1_requests.py +++ b/app/core/openai/v1_requests.py @@ -54,7 +54,7 @@ def _ensure_store_false(cls, value: bool | None) -> bool | None: @field_validator("tools") @classmethod def _validate_tools(cls, value: list[JsonValue]) -> list[JsonValue]: - return validate_tool_types(value) + return validate_tool_types(value, allow_builtin_tools=True) @model_validator(mode="after") def _validate_input(self) -> "V1ResponsesRequest": diff --git a/app/modules/proxy/load_balancer.py b/app/modules/proxy/load_balancer.py index c2cabd0a1..291e96362 100644 --- a/app/modules/proxy/load_balancer.py +++ b/app/modules/proxy/load_balancer.py @@ -170,10 +170,11 @@ async def load_selection_inputs() -> _SelectionInputs: runtime=self._runtime, ) - result = select_account( + result = _select_account_preferring_budget_safe( states, prefer_earlier_reset=prefer_earlier_reset_accounts, routing_strategy=routing_strategy, + budget_threshold_pct=budget_threshold_pct, ) selected_account_map = account_map @@ -645,10 +646,11 @@ async def _select_with_stickiness( sticky_repo: StickySessionsRepository | None, ) -> SelectionResult: if not sticky_key or not sticky_repo: - return select_account( + return _select_account_preferring_budget_safe( states, prefer_earlier_reset=prefer_earlier_reset_accounts, routing_strategy=routing_strategy, + budget_threshold_pct=budget_threshold_pct, ) if sticky_kind is None: raise ValueError("sticky_kind is required when sticky_key is provided") @@ -670,14 +672,16 @@ async def _select_with_stickiness( if existing: pinned = next((state for state in states if state.account_id == existing), None) if pinned is not None: - # Check if pinned account has insufficient budget (< 5% remaining) - # or rate limit is far away (reset_at more 
than 10 minutes away) + # Proactively rebind session affinity for prompt-cache and + # codex sessions once the pinned account is already above the + # configured budget threshold. That preserves continuity below + # the threshold while avoiding obvious short-window failures + # once the session is skating on the edge of exhaustion. now = time.time() - budget_exhausted = ( - sticky_kind == StickySessionKind.PROMPT_CACHE + budget_pressured = ( + sticky_kind in (StickySessionKind.PROMPT_CACHE, StickySessionKind.CODEX_SESSION) and pinned.status != AccountStatus.RATE_LIMITED - and pinned.used_percent is not None - and pinned.used_percent > budget_threshold_pct + and _state_above_budget_threshold(pinned, budget_threshold_pct) ) rate_limit_far_away = ( sticky_kind == StickySessionKind.PROMPT_CACHE @@ -685,7 +689,7 @@ async def _select_with_stickiness( and pinned.reset_at is not None and pinned.reset_at - now >= 600 # 10 minutes ) - if not (budget_exhausted or rate_limit_far_away): + if not (budget_pressured or rate_limit_far_away): pinned_result = select_account( [pinned], prefer_earlier_reset=prefer_earlier_reset_accounts, @@ -702,19 +706,17 @@ async def _select_with_stickiness( # is above the budget threshold, reallocating just # wastes DB writes and destroys prompt-cache locality # (thrashing). 
- if budget_exhausted: - pool_best = select_account( + if budget_pressured: + pool_best = _select_account_preferring_budget_safe( states, prefer_earlier_reset=prefer_earlier_reset_accounts, routing_strategy=routing_strategy, deterministic_probe=True, + budget_threshold_pct=budget_threshold_pct, ) pool_also_exhausted = pool_best.account is not None and ( pool_best.account.account_id == pinned.account_id - or ( - pool_best.account.used_percent is not None - and pool_best.account.used_percent > budget_threshold_pct - ) + or _state_above_budget_threshold(pool_best.account, budget_threshold_pct) ) if pool_also_exhausted: pinned_result = select_account( @@ -769,10 +771,11 @@ async def _select_with_stickiness( else: await sticky_repo.delete(sticky_key, kind=sticky_kind) - chosen = select_account( + chosen = _select_account_preferring_budget_safe( states, prefer_earlier_reset=prefer_earlier_reset_accounts, routing_strategy=routing_strategy, + budget_threshold_pct=budget_threshold_pct, ) if persist_fallback and chosen.account is not None and chosen.account.account_id in account_map: await sticky_repo.upsert(sticky_key, chosen.account.account_id, kind=sticky_kind) @@ -1295,6 +1298,43 @@ def _additional_usage_is_exhausted(entry: AdditionalUsageHistory) -> bool: return float(entry.used_percent) >= 100.0 +def _state_above_budget_threshold(state: AccountState, budget_threshold_pct: float) -> bool: + return any( + used_percent is not None and used_percent > budget_threshold_pct + for used_percent in (state.used_percent, state.secondary_used_percent) + ) + + +def _select_account_preferring_budget_safe( + states: Iterable[AccountState], + *, + prefer_earlier_reset: bool, + routing_strategy: RoutingStrategy, + budget_threshold_pct: float, + allow_backoff_fallback: bool = True, + deterministic_probe: bool = False, +) -> SelectionResult: + state_list = list(states) + preferred_states = [state for state in state_list if not _state_above_budget_threshold(state, budget_threshold_pct)] + 
if preferred_states and len(preferred_states) != len(state_list): + preferred = select_account( + preferred_states, + prefer_earlier_reset=prefer_earlier_reset, + routing_strategy=routing_strategy, + allow_backoff_fallback=allow_backoff_fallback, + deterministic_probe=deterministic_probe, + ) + if preferred.account is not None: + return preferred + return select_account( + state_list, + prefer_earlier_reset=prefer_earlier_reset, + routing_strategy=routing_strategy, + allow_backoff_fallback=allow_backoff_fallback, + deterministic_probe=deterministic_probe, + ) + + def _is_upstream_circuit_breaker_open() -> bool: settings = get_settings() if not getattr(settings, "circuit_breaker_enabled", False): diff --git a/openspec/changes/raise-upstream-event-size-limit/proposal.md b/openspec/changes/raise-upstream-event-size-limit/proposal.md new file mode 100644 index 000000000..3ae864251 --- /dev/null +++ b/openspec/changes/raise-upstream-event-size-limit/proposal.md @@ -0,0 +1,15 @@ +# Proposal: raise-upstream-event-size-limit + +## Why + +Recent Codex Desktop builds can request built-in tools such as `image_generation`, which may produce large upstream Responses events. The proxy currently caps upstream SSE events and websocket message frames at 2 MiB, which is too low for legitimate image payloads and causes local websocket `1009 message too big` disconnects before `response.completed`. + +## What Changes + +- Raise the default upstream Responses event/message size limit from 2 MiB to 16 MiB. +- Keep the existing configuration knob (`max_sse_event_bytes`) so operators can still override the limit. + +## Impact + +- Prevents local `1009` disconnects for large but valid Responses tool outputs. +- Aligns the default limit with the common 16 MiB websocket ceiling already assumed by the proxy's `response.create` budget logic. 
diff --git a/openspec/changes/raise-upstream-event-size-limit/specs/responses-api-compat/spec.md b/openspec/changes/raise-upstream-event-size-limit/specs/responses-api-compat/spec.md new file mode 100644 index 000000000..dd6e57cb0 --- /dev/null +++ b/openspec/changes/raise-upstream-event-size-limit/specs/responses-api-compat/spec.md @@ -0,0 +1,7 @@ +## MODIFIED Requirements +### Requirement: Upstream Responses event size budget +The service SHALL allow upstream Responses SSE events and upstream websocket message frames up to 16 MiB by default before treating them as oversized. + +#### Scenario: built-in tool output exceeds the old 2 MiB limit +- **WHEN** upstream Responses traffic includes a single SSE event or websocket message frame larger than 2 MiB but not larger than 16 MiB +- **THEN** the proxy continues processing the event instead of closing the upstream websocket locally with `1009 message too big` diff --git a/openspec/changes/raise-upstream-event-size-limit/tasks.md b/openspec/changes/raise-upstream-event-size-limit/tasks.md new file mode 100644 index 000000000..666211ef5 --- /dev/null +++ b/openspec/changes/raise-upstream-event-size-limit/tasks.md @@ -0,0 +1,8 @@ +## 1. Implementation + +- [x] 1.1 Raise the default upstream Responses event/message size limit to 16 MiB. + +## 2. Verification + +- [x] 2.1 Add or update settings coverage for the new default. +- [x] 2.2 Run targeted pytest, ruff, and `openspec validate --specs`. diff --git a/openspec/changes/reallocate-codex-session-budget-pressure/proposal.md b/openspec/changes/reallocate-codex-session-budget-pressure/proposal.md new file mode 100644 index 000000000..420092f64 --- /dev/null +++ b/openspec/changes/reallocate-codex-session-budget-pressure/proposal.md @@ -0,0 +1,18 @@ +## Why + +Backend Codex routes use durable `codex_session` stickiness when the client sends a `session_id` header. 
Today that stickiness preserves continuity too aggressively: if the pinned account remains `ACTIVE` but its short-window usage is already above the sticky reallocation threshold, selection still keeps routing the session there until upstream starts returning `usage_limit_reached`. + +Separately, fresh selection still lets near-exhausted active accounts compete with budget-safe accounts until a hard failure occurs. In production this surfaces as repeated compact and websocket failures on accounts whose latest local primary-window usage is already in the `97-99%` range, even while other active accounts still have healthy short-window budget. + +## What Changes + +- extend proactive sticky reallocation to durable backend `codex_session` mappings when the pinned account is above the configured budget threshold and a healthier candidate exists +- prefer budget-safe Responses routing candidates over already-pressured candidates whenever at least one budget-safe candidate exists +- keep existing durable `codex_session` behavior below that threshold +- add regression coverage for backend Codex responses + compact routing with `session_id` + +## Impact + +- backend Codex sessions may rebind to a different account slightly earlier, before the pinned account hard-fails upstream on short-window budget exhaustion +- fresh Responses requests will prefer accounts that are still below the configured budget threshold instead of spending more attempts on near-exhausted accounts first +- OpenAI `/v1` routes still do not create durable `codex_session` mappings from `session_id`, but they do benefit from the same budget-safe fresh-selection preference diff --git a/openspec/changes/reallocate-codex-session-budget-pressure/specs/responses-api-compat/spec.md b/openspec/changes/reallocate-codex-session-budget-pressure/specs/responses-api-compat/spec.md new file mode 100644 index 000000000..5b45ddfe9 --- /dev/null +++ 
b/openspec/changes/reallocate-codex-session-budget-pressure/specs/responses-api-compat/spec.md @@ -0,0 +1,9 @@ +## MODIFIED Requirements +### Requirement: Responses routing prefers budget-safe accounts +When serving Responses routes, the service MUST prefer eligible accounts that are still below the configured budget threshold over eligible accounts already above that threshold. If no below-threshold candidate exists, the service MAY fall back to the pressured candidates. + +#### Scenario: Fresh Responses request avoids a near-exhausted account +- **WHEN** `/backend-api/codex/responses`, `/backend-api/codex/responses/compact`, `/v1/responses`, or `/v1/responses/compact` selects among multiple eligible active accounts +- **AND** one candidate is above the configured budget threshold +- **AND** another candidate remains below that threshold +- **THEN** the below-threshold candidate is chosen first diff --git a/openspec/changes/reallocate-codex-session-budget-pressure/specs/sticky-session-operations/spec.md b/openspec/changes/reallocate-codex-session-budget-pressure/specs/sticky-session-operations/spec.md new file mode 100644 index 000000000..671ba161c --- /dev/null +++ b/openspec/changes/reallocate-codex-session-budget-pressure/specs/sticky-session-operations/spec.md @@ -0,0 +1,13 @@ +## MODIFIED Requirements +### Requirement: Sticky sessions are explicitly typed +The system SHALL persist each sticky-session mapping with an explicit kind so durable Codex backend affinity, durable dashboard sticky-thread routing, and bounded prompt-cache affinity can be managed independently. 
+ +#### Scenario: Backend Codex session affinity is stored as durable +- **WHEN** a backend Codex request creates or refreshes stickiness from `session_id` +- **THEN** the stored mapping kind is `codex_session` + +#### Scenario: Backend Codex session rebinds under budget pressure +- **WHEN** a backend Codex request resolves an existing `codex_session` mapping +- **AND** the pinned account is above the configured sticky reallocation budget threshold +- **AND** another eligible account remains below that threshold +- **THEN** selection rebinds the durable `codex_session` mapping to the healthier account before sending the request upstream diff --git a/openspec/changes/reallocate-codex-session-budget-pressure/tasks.md b/openspec/changes/reallocate-codex-session-budget-pressure/tasks.md new file mode 100644 index 000000000..3ae48c705 --- /dev/null +++ b/openspec/changes/reallocate-codex-session-budget-pressure/tasks.md @@ -0,0 +1,11 @@ +## 1. Implementation + +- [x] 1.1 Update sticky selection so backend `codex_session` mappings reallocate when the pinned account is above the sticky budget threshold and a healthier candidate exists +- [x] 1.2 Preserve existing durable `codex_session` behavior when the pinned account is still below the threshold +- [x] 1.3 Prefer budget-safe Responses routing candidates over pressured candidates when any budget-safe option exists + +## 2. 
Verification + +- [x] 2.1 Add integration coverage for backend Codex `session_id` routing that proves reallocation above threshold +- [x] 2.2 Run the affected sticky-session integration tests +- [x] 2.3 Add targeted selection coverage for fresh routing that proves a budget-safe account wins over a pressured one diff --git a/openspec/changes/route-image-generation-over-http/.openspec.yaml b/openspec/changes/route-image-generation-over-http/.openspec.yaml new file mode 100644 index 000000000..863bff183 --- /dev/null +++ b/openspec/changes/route-image-generation-over-http/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-17 diff --git a/openspec/changes/route-image-generation-over-http/design.md b/openspec/changes/route-image-generation-over-http/design.md new file mode 100644 index 000000000..5e2b1394f --- /dev/null +++ b/openspec/changes/route-image-generation-over-http/design.md @@ -0,0 +1,63 @@ +## Context + +The proxy currently chooses upstream Responses transport in `app/core/clients/proxy.py`. In `auto` mode it prefers websocket for native Codex headers and websocket-preferred models such as `gpt-5.4`. That works well for normal text flows, but `image_generation` is different: OpenAI documents that the tool returns generated image data inline as base64, so large events are normal rather than pathological. + +The recent 16 MiB limit increase reduces failures, but it still applies one shared ceiling to both SSE event buffering and websocket message frames. Continuing to route image-generation traffic over websocket keeps the proxy exposed to avoidable large-frame failures and forces operators to keep raising a global limit for a narrow tool-specific behavior. + +## Goals / Non-Goals + +**Goals:** + +- Make auto upstream transport choose the safer HTTP/SSE path for Responses requests that include `image_generation`. +- Preserve existing operator controls and existing websocket preference logic for non-image-generation requests. 
+- Keep the change local to transport selection without altering request payload semantics. + +**Non-Goals:** + +- Do not mutate image-generation tool options such as `size`, `quality`, `format`, `compression`, or `partial_images`. +- Do not retry an already-started websocket image-generation request over HTTP. +- Do not change compact routing, which already strips tool fields before calling the upstream compact endpoint. + +## Decisions + +### Detect `image_generation` from the serialized Responses payload + +Use the already-materialized payload dictionary in `stream_responses()` to determine whether the request contains `tools[*].type == "image_generation"`. + +Why: + +- The payload is already canonicalized there, so the check is cheap and uses the same data that will go upstream. +- This avoids duplicating request-model-specific logic elsewhere in the proxy stack. + +Alternative considered: + +- Recompute the signal from the Pydantic request model in multiple call sites. Rejected because it spreads the policy across more than one layer. + +### Override only `auto` transport + +When `upstream_stream_transport` resolves to `auto`, `image_generation` forces upstream HTTP. Explicit `http` or `websocket` settings still take precedence. + +Why: + +- Operator overrides should stay authoritative. +- The issue is specifically that the default heuristic chooses websocket for requests whose payload shape makes HTTP safer. + +Alternative considered: + +- Force HTTP even when the operator explicitly configured websocket. Rejected because it breaks the existing transport-control contract. + +## Risks / Trade-offs + +- [Image-generation requests may lose websocket-specific latency benefits] → This is intentional; correctness and transport fit matter more than websocket preference for large image payloads. +- [Future built-in tools may have similar large-payload behavior] → Keep the rule narrow for now and extend only when real evidence justifies it. 
+- [Large HTTP/SSE events still rely on the configured byte ceiling] → The existing 16 MiB default remains in place as a separate safeguard. + +## Migration Plan + +- Deploy the transport-selection change without config migration. +- If operators explicitly want websocket for image generation despite the risk, they can still force `upstream_stream_transport=websocket`. +- Rollback is a code rollback only; there is no persisted state change. + +## Open Questions + +- None. diff --git a/openspec/changes/route-image-generation-over-http/proposal.md b/openspec/changes/route-image-generation-over-http/proposal.md new file mode 100644 index 000000000..6c4bafbdc --- /dev/null +++ b/openspec/changes/route-image-generation-over-http/proposal.md @@ -0,0 +1,25 @@ +## Why + +Responses `image_generation` calls can return large inline base64 image payloads. The current auto upstream transport logic still prefers websocket for websocket-preferred models such as `gpt-5.4`, which makes those legitimate image outputs hit the proxy's websocket frame ceiling and fail locally with `1009 message too big`. + +## What Changes + +- Modify auto upstream transport selection so Responses requests containing the built-in `image_generation` tool use upstream HTTP/SSE instead of upstream websocket. +- Keep explicit operator overrides intact: `upstream_stream_transport=http` and `upstream_stream_transport=websocket` continue to win over auto policy. +- Add regression coverage for transport resolution and end-to-end streaming path selection when `image_generation` is present. + +## Capabilities + +### New Capabilities + +None. + +### Modified Capabilities + +- `responses-api-compat`: auto upstream transport selection changes for Responses requests that include the `image_generation` built-in tool. + +## Impact + +- Affects upstream transport selection in [app/core/clients/proxy.py](app/core/clients/proxy.py).
+- Adds regression coverage in `tests/unit/test_proxy_utils.py`. +- Keeps compact request sanitization and explicit transport overrides unchanged. diff --git a/openspec/changes/route-image-generation-over-http/specs/responses-api-compat/spec.md b/openspec/changes/route-image-generation-over-http/specs/responses-api-compat/spec.md new file mode 100644 index 000000000..ddd35dc59 --- /dev/null +++ b/openspec/changes/route-image-generation-over-http/specs/responses-api-compat/spec.md @@ -0,0 +1,40 @@ +## MODIFIED Requirements + +### Requirement: Upstream Responses transport strategy +For streaming Codex/Responses proxy requests, the system MUST let operators choose the upstream transport strategy through dashboard settings. The resolved strategy MAY be `auto`, `http`, or `websocket`, and `default` MUST defer to the server configuration default. + +#### Scenario: Dashboard forces websocket upstream transport +- **WHEN** the dashboard setting `upstream_stream_transport` is set to `"websocket"` +- **THEN** streaming Responses requests use the upstream websocket transport + +#### Scenario: Dashboard forces HTTP upstream transport +- **WHEN** the dashboard setting `upstream_stream_transport` is set to `"http"` +- **THEN** streaming Responses requests use the upstream HTTP/SSE transport + +#### Scenario: Auto transport falls back when websocket upgrades are rejected +- **WHEN** the resolved upstream transport strategy is `"auto"` +- **AND** auto selection chose the websocket transport +- **AND** the upstream rejects the websocket upgrade with HTTP `426` +- **THEN** the proxy retries the request over the upstream HTTP/SSE transport + +#### Scenario: Session affinity alone does not trigger websocket upstream transport +- **WHEN** the resolved upstream transport strategy is `"auto"` +- **AND** a request includes a `session_id` +- **AND** it does not include an allowlisted native Codex `originator` or explicit Codex websocket feature headers +- **THEN** the auto strategy MUST 
keep using the existing model-preference transport selection rules + +#### Scenario: Auto transport honors websocket-preferred bootstrap models before registry warmup +- **WHEN** the resolved upstream transport strategy is `"auto"` +- **AND** the model registry has not loaded a snapshot yet +- **AND** the request targets a locally bootstrapped websocket-preferred model family such as `gpt-5.4` or `gpt-5.4-*` +- **AND** the request does not include the built-in `image_generation` tool +- **THEN** the proxy chooses the upstream websocket transport + +#### Scenario: Auto transport prefers HTTP for image-generation tool requests +- **WHEN** the resolved upstream transport strategy is `"auto"` +- **AND** the request includes a built-in `image_generation` tool +- **THEN** the proxy chooses the upstream HTTP/SSE transport even if the model would otherwise prefer websocket + +#### Scenario: Legacy settings preserve the pre-feature default +- **WHEN** transport selection runs against a legacy settings object that does not expose the newer upstream transport fields +- **THEN** the proxy MUST preserve the pre-feature HTTP transport default for model-preference auto-selection unless an explicit legacy websocket mode or native Codex websocket signal opts in diff --git a/openspec/changes/route-image-generation-over-http/tasks.md b/openspec/changes/route-image-generation-over-http/tasks.md new file mode 100644 index 000000000..ea90d1549 --- /dev/null +++ b/openspec/changes/route-image-generation-over-http/tasks.md @@ -0,0 +1,9 @@ +## 1. Transport policy + +- [x] 1.1 Add a Responses payload helper that detects the built-in `image_generation` tool. +- [x] 1.2 Update auto upstream transport selection to prefer HTTP when that helper matches, while preserving explicit transport overrides. + +## 2. Verification + +- [x] 2.1 Add regression coverage for transport resolution and stream path selection with `image_generation`. 
+- [x] 2.2 Run targeted pytest, ruff, and `openspec validate --specs`. diff --git a/openspec/changes/support-responses-builtin-tools/proposal.md b/openspec/changes/support-responses-builtin-tools/proposal.md new file mode 100644 index 000000000..dc22faf81 --- /dev/null +++ b/openspec/changes/support-responses-builtin-tools/proposal.md @@ -0,0 +1,17 @@ +# Proposal: support-responses-builtin-tools + +## Why + +Recent Codex Desktop builds now send newer built-in Responses tools such as `image_generation` and computer-use tool definitions. The current proxy still rejects those tool objects on full Responses routes and forwards them unchanged to the compact endpoint, which causes upstream `400 invalid_request_error` failures. + +## What Changes + +- Allow built-in Responses tools to pass through on `/backend-api/codex/responses` and `/v1/responses`. +- Keep Chat Completions compatibility behavior unchanged: only `web_search` remains supported there. +- Sanitize `/backend-api/codex/responses/compact` and `/v1/responses/compact` requests so tool-related fields are removed before the upstream compact call. + +## Impact + +- Restores compatibility with newer Codex Desktop request payloads. +- Reduces future breakage from new built-in Responses tool types on the full Responses path. +- Prevents compact requests from failing when desktop clients reuse full Responses payload shapes. diff --git a/openspec/changes/support-responses-builtin-tools/specs/responses-api-compat/spec.md b/openspec/changes/support-responses-builtin-tools/specs/responses-api-compat/spec.md new file mode 100644 index 000000000..f8c282cb4 --- /dev/null +++ b/openspec/changes/support-responses-builtin-tools/specs/responses-api-compat/spec.md @@ -0,0 +1,15 @@ +## MODIFIED Requirements +### Requirement: Responses-compatible tool payload handling +The service SHALL accept built-in Responses tool definitions on `/backend-api/codex/responses` and `/v1/responses` without locally rejecting them. 
The service MAY normalize documented aliases, but upstream model/tool compatibility validation MUST remain the upstream contract. + +#### Scenario: full Responses request includes built-in tools +- **WHEN** a client sends `/backend-api/codex/responses` or `/v1/responses` with built-in Responses tools such as `image_generation`, `computer_use`, `computer_use_preview`, `file_search`, or `code_interpreter` +- **THEN** the proxy forwards those tool objects upstream instead of returning a local `invalid_request_error` + +### Requirement: Compact requests drop tool-only fields +The service SHALL remove `tools`, `tool_choice`, and `parallel_tool_calls` from compact request payloads before calling the upstream compact endpoint. + +#### Scenario: compact request reuses a full Responses payload shape +- **WHEN** a client sends `/backend-api/codex/responses/compact` or `/v1/responses/compact` with `tools`, `tool_choice`, or `parallel_tool_calls` +- **THEN** the proxy drops those fields before the upstream compact request +- **AND** the compact request continues without a local or upstream `invalid_request_error` caused by `param="tools"` diff --git a/openspec/changes/support-responses-builtin-tools/tasks.md b/openspec/changes/support-responses-builtin-tools/tasks.md new file mode 100644 index 000000000..c8f48ae08 --- /dev/null +++ b/openspec/changes/support-responses-builtin-tools/tasks.md @@ -0,0 +1,10 @@ +## 1. Implementation + +- [x] 1.1 Allow built-in Responses tools to pass through request normalization for `/backend-api/codex/responses` and `/v1/responses`. +- [x] 1.2 Strip tool-related fields from compact request payloads before the upstream compact call. + +## 2. Verification + +- [x] 2.1 Update unit tests for Responses request normalization and compact sanitization. +- [x] 2.2 Update integration tests for `/v1/responses` tool passthrough and `/backend-api/codex/responses/compact` tool stripping. +- [x] 2.3 Run targeted pytest, ruff, and `openspec validate --specs`. 
diff --git a/tests/integration/test_openai_compat_features.py b/tests/integration/test_openai_compat_features.py index 6ceace261..7ff68d90c 100644 --- a/tests/integration/test_openai_compat_features.py +++ b/tests/integration/test_openai_compat_features.py @@ -137,7 +137,18 @@ async def fake_stream(payload, headers, access_token, account_id, base_url=None, {"type": "image_generation"}, ], ) -async def test_v1_responses_rejects_builtin_tools(async_client, tool_payload): +async def test_v1_responses_forwards_builtin_tools(async_client, monkeypatch, tool_payload): + await _import_account(async_client, "acc_builtin_tools", "builtin-tools@example.com") + + seen = {} + + async def fake_stream(payload, headers, access_token, account_id, base_url=None, raise_for_status=False): + del headers, access_token, account_id, base_url, raise_for_status + seen["payload"] = payload + yield _completed_event("resp_builtin_tools") + + monkeypatch.setattr(proxy_module, "core_stream_responses", fake_stream) + request_payload = { "model": "gpt-5.2", "input": [ @@ -150,8 +161,8 @@ async def test_v1_responses_rejects_builtin_tools(async_client, tool_payload): } resp = await async_client.post("/v1/responses", json=request_payload) - assert resp.status_code == 400 - assert resp.json()["error"]["type"] == "invalid_request_error" + assert resp.status_code == 200 + assert seen["payload"].tools == [tool_payload] @pytest.mark.asyncio diff --git a/tests/integration/test_proxy_compact.py b/tests/integration/test_proxy_compact.py index c7c39f708..870f1cbdf 100644 --- a/tests/integration/test_proxy_compact.py +++ b/tests/integration/test_proxy_compact.py @@ -101,6 +101,44 @@ async def test_proxy_compact_no_accounts(async_client): assert error["code"] == "no_accounts" +@pytest.mark.asyncio +async def test_proxy_compact_strips_tool_fields_before_upstream(async_client, monkeypatch): + email = "compact-tools@example.com" + raw_account_id = "acc_compact_tools" + auth_json = _make_auth_json(raw_account_id, 
email) + files = {"auth_json": ("auth.json", json.dumps(auth_json), "application/json")} + response = await async_client.post("/api/accounts/import", files=files) + assert response.status_code == 200 + + seen_payloads: list[dict[str, object]] = [] + + async def fake_compact(payload, headers, access_token, account_id): + del headers, access_token, account_id + seen_payloads.append(cast(dict[str, object], payload.to_payload())) + return CompactResponsePayload.model_validate({"object": "response.compaction", "output": []}) + + monkeypatch.setattr(proxy_module, "core_compact_responses", fake_compact) + + payload = { + "model": "gpt-5.1", + "instructions": "hi", + "input": [], + "tools": [{"type": "image_generation"}], + "tool_choice": {"type": "image_generation"}, + "parallel_tool_calls": True, + } + response = await async_client.post("/backend-api/codex/responses/compact", json=payload) + + assert response.status_code == 200 + assert len(seen_payloads) == 1 + assert seen_payloads[0]["model"] == "gpt-5.1" + assert seen_payloads[0]["instructions"] == "hi" + assert seen_payloads[0]["input"] == [] + assert "tools" not in seen_payloads[0] + assert "tool_choice" not in seen_payloads[0] + assert "parallel_tool_calls" not in seen_payloads[0] + + @pytest.mark.asyncio async def test_proxy_compact_surfaces_no_additional_quota_eligible_accounts(async_client): email = "compact-gated@example.com" diff --git a/tests/integration/test_proxy_sticky_sessions.py b/tests/integration/test_proxy_sticky_sessions.py index 9fc2818c9..50d9b4244 100644 --- a/tests/integration/test_proxy_sticky_sessions.py +++ b/tests/integration/test_proxy_sticky_sessions.py @@ -438,6 +438,93 @@ async def fake_compact(payload, headers, access_token, account_id): assert stream_seen == ["acc_sid_a", "acc_sid_a"] +@pytest.mark.asyncio +async def test_proxy_codex_session_id_reallocates_when_pinned_budget_exhausted(async_client, monkeypatch): + await _set_routing_settings(async_client, sticky_threads_enabled=False) + 
acc_a_id = await _import_account(async_client, "acc_sid_budget_a", "sid_budget_a@example.com") + acc_b_id = await _import_account(async_client, "acc_sid_budget_b", "sid_budget_b@example.com") + + now = utcnow() + now_epoch = int(now.replace(tzinfo=timezone.utc).timestamp()) + + async with SessionLocal() as session: + usage_repo = UsageRepository(session) + await usage_repo.add_entry( + account_id=acc_a_id, + used_percent=10.0, + window="primary", + reset_at=now_epoch + 3600, + window_minutes=300, + ) + await usage_repo.add_entry( + account_id=acc_b_id, + used_percent=20.0, + window="primary", + reset_at=now_epoch + 3600, + window_minutes=300, + ) + + stream_seen: list[str] = [] + + async def fake_stream(payload, headers, access_token, account_id, base_url=None, raise_for_status=False, **_kwargs): + stream_seen.append(account_id) + yield 'data: {"type":"response.completed","response":{"id":"resp_session_budget"}}\n\n' + + compact_seen: list[str] = [] + + async def fake_compact(payload, headers, access_token, account_id): + compact_seen.append(account_id) + return OpenAIResponsePayload.model_validate({"output": []}) + + monkeypatch.setattr(proxy_module, "core_stream_responses", fake_stream) + monkeypatch.setattr(proxy_module, "core_compact_responses", fake_compact) + + headers = {"session_id": "codex-thread-budget"} + stream_payload = { + "model": "gpt-5.1", + "instructions": "hi", + "input": [], + "stream": True, + } + response = await async_client.post("/backend-api/codex/responses", json=stream_payload, headers=headers) + assert response.status_code == 200 + assert stream_seen == ["acc_sid_budget_a"] + + async with SessionLocal() as session: + usage_repo = UsageRepository(session) + await usage_repo.add_entry( + account_id=acc_a_id, + used_percent=99.0, + window="primary", + reset_at=now_epoch + 3600, + window_minutes=300, + ) + await usage_repo.add_entry( + account_id=acc_b_id, + used_percent=5.0, + window="primary", + reset_at=now_epoch + 3600, + 
window_minutes=300, + ) + + compact_payload = { + "model": "gpt-5.1", + "instructions": "summarize", + "input": [{"role": "user", "content": [{"type": "input_text", "text": "hello"}]}], + } + response = await async_client.post( + "/backend-api/codex/responses/compact", + json=compact_payload, + headers=headers, + ) + assert response.status_code == 200 + assert compact_seen == ["acc_sid_budget_b"] + + response = await async_client.post("/backend-api/codex/responses", json=stream_payload, headers=headers) + assert response.status_code == 200 + assert stream_seen == ["acc_sid_budget_a", "acc_sid_budget_b"] + + @pytest.mark.asyncio async def test_proxy_codex_session_id_compact_first_pins_followup_stream_without_sticky_threads( async_client, diff --git a/tests/unit/test_openai_requests.py b/tests/unit/test_openai_requests.py index 95c3db1ba..e6e439559 100644 --- a/tests/unit/test_openai_requests.py +++ b/tests/unit/test_openai_requests.py @@ -370,6 +370,33 @@ def test_responses_accepts_builtin_tools(tool_type, expected): assert request.tools == [{"type": expected}] +@pytest.mark.parametrize( + "tool_payload", + [ + {"type": "image_generation"}, + { + "type": "computer_use_preview", + "display_width": 1024, + "display_height": 768, + "environment": "browser", + }, + {"type": "computer_use", "display_width": 1024, "display_height": 768, "environment": "browser"}, + {"type": "file_search", "vector_store_ids": ["vs_dummy"]}, + {"type": "code_interpreter", "container": {"type": "auto"}}, + ], +) +def test_responses_accepts_builtin_tool_passthrough(tool_payload): + payload = { + "model": "gpt-5.1", + "instructions": "hi", + "input": [], + "tools": [tool_payload], + } + request = ResponsesRequest.model_validate(payload) + + assert request.tools == [tool_payload] + + @pytest.mark.parametrize("tool_choice", [{"type": "web_search"}, {"type": "web_search_preview"}]) def test_responses_normalizes_tool_choice_web_search_preview(tool_choice): payload = { @@ -484,10 +511,59 @@ def 
test_v1_input_string_passthrough(): assert request.input == [{"role": "user", "content": [{"type": "input_text", "text": "hello"}]}] -def test_v1_rejects_builtin_tools(): - payload = {"model": "gpt-5.1", "input": [], "tools": [{"type": "image_generation"}]} - with pytest.raises(ValidationError, match="Unsupported tool type"): - V1ResponsesRequest.model_validate(payload) +@pytest.mark.parametrize( + "tool_payload", + [ + {"type": "image_generation"}, + { + "type": "computer_use_preview", + "display_width": 1024, + "display_height": 768, + "environment": "browser", + }, + {"type": "computer_use", "display_width": 1024, "display_height": 768, "environment": "browser"}, + {"type": "file_search", "vector_store_ids": ["vs_dummy"]}, + {"type": "code_interpreter", "container": {"type": "auto"}}, + ], +) +def test_v1_responses_accepts_builtin_tools(tool_payload): + payload = {"model": "gpt-5.1", "input": [], "tools": [tool_payload]} + request = V1ResponsesRequest.model_validate(payload).to_responses_request() + + assert request.tools == [tool_payload] + + +def test_compact_strips_tool_fields(): + payload = { + "model": "gpt-5.1", + "instructions": "hi", + "input": [], + "tools": [{"type": "image_generation"}], + "tool_choice": {"type": "image_generation"}, + "parallel_tool_calls": True, + } + request = ResponsesCompactRequest.model_validate(payload) + + dumped = request.to_payload() + assert "tools" not in dumped + assert "tool_choice" not in dumped + assert "parallel_tool_calls" not in dumped + + +def test_v1_compact_strips_tool_fields(): + payload = { + "model": "gpt-5.1", + "input": "hello", + "tools": [{"type": "image_generation"}], + "tool_choice": {"type": "image_generation"}, + "parallel_tool_calls": True, + } + request = V1ResponsesCompactRequest.model_validate(payload).to_compact_request() + + dumped = request.to_payload() + assert "tools" not in dumped + assert "tool_choice" not in dumped + assert "parallel_tool_calls" not in dumped def 
test_v1_compact_messages_convert(): diff --git a/tests/unit/test_proxy_load_balancer_refresh.py b/tests/unit/test_proxy_load_balancer_refresh.py index 64284132f..46eeede6e 100644 --- a/tests/unit/test_proxy_load_balancer_refresh.py +++ b/tests/unit/test_proxy_load_balancer_refresh.py @@ -311,6 +311,68 @@ async def test_select_account_reads_cached_usage_once_per_window() -> None: assert usage_repo.secondary_calls == 1 +@pytest.mark.asyncio +async def test_select_account_prefers_budget_safe_account_when_any_exist() -> None: + safe_account = _make_account("acc-safe", "safe@example.com") + pressured_account = _make_account("acc-pressured", "pressured@example.com") + now = utcnow() + now_epoch = int(now.replace(tzinfo=timezone.utc).timestamp()) + + primary = { + safe_account.id: UsageHistory( + id=1, + account_id=safe_account.id, + recorded_at=now, + window="primary", + used_percent=10.0, + reset_at=now_epoch + 300, + window_minutes=5, + ), + pressured_account.id: UsageHistory( + id=2, + account_id=pressured_account.id, + recorded_at=now, + window="primary", + used_percent=99.0, + reset_at=now_epoch + 300, + window_minutes=5, + ), + } + secondary = { + safe_account.id: UsageHistory( + id=3, + account_id=safe_account.id, + recorded_at=now, + window="secondary", + used_percent=80.0, + reset_at=now_epoch + 3600, + window_minutes=60, + ), + pressured_account.id: UsageHistory( + id=4, + account_id=pressured_account.id, + recorded_at=now, + window="secondary", + used_percent=5.0, + reset_at=now_epoch + 3600, + window_minutes=60, + ), + } + + accounts_repo = StubAccountsRepository([safe_account, pressured_account]) + usage_repo = StubUsageRepository(primary=primary, secondary=secondary) + sticky_repo = StubStickySessionsRepository() + + balancer = LoadBalancer(lambda: _repo_factory(accounts_repo, usage_repo, sticky_repo)) + selection = await balancer.select_account( + routing_strategy="usage_weighted", + budget_threshold_pct=95.0, + ) + + assert selection.account is not None + 
assert selection.account.id == safe_account.id + + @pytest.mark.asyncio async def test_select_account_filters_to_assigned_account_ids() -> None: preferred = _make_account("acc-preferred", "preferred@example.com") diff --git a/tests/unit/test_proxy_utils.py b/tests/unit/test_proxy_utils.py index 68bae47b9..bfaca3d56 100644 --- a/tests/unit/test_proxy_utils.py +++ b/tests/unit/test_proxy_utils.py @@ -275,6 +275,42 @@ def test_resolve_stream_transport_does_not_force_websocket_for_custom_codex_orig assert transport == "http" +def test_resolve_stream_transport_prefers_http_for_image_generation_even_with_native_codex_headers(monkeypatch) -> None: + monkeypatch.setattr( + proxy_module, + "get_model_registry", + lambda: SimpleNamespace(prefers_websockets=lambda model: model == "gpt-5.4"), + ) + + transport = proxy_module._resolve_stream_transport( + transport="auto", + transport_override=None, + model="gpt-5.4", + headers={"originator": "codex_chatgpt_desktop"}, + has_image_generation_tool=True, + ) + + assert transport == "http" + + +def test_resolve_stream_transport_keeps_explicit_websocket_override_for_image_generation(monkeypatch) -> None: + monkeypatch.setattr( + proxy_module, + "get_model_registry", + lambda: SimpleNamespace(prefers_websockets=lambda _model: False), + ) + + transport = proxy_module._resolve_stream_transport( + transport="auto", + transport_override="websocket", + model="gpt-5.4", + headers={}, + has_image_generation_tool=True, + ) + + assert transport == "websocket" + + def test_response_create_client_metadata_preserves_existing_json_values_and_turn_metadata(): payload = { "client_metadata": { @@ -2322,6 +2358,56 @@ class Settings: ] +@pytest.mark.asyncio +async def test_stream_responses_auto_transport_prefers_http_for_image_generation_tool(monkeypatch): + class Settings: + upstream_base_url = "https://chatgpt.com/backend-api" + upstream_stream_transport = "auto" + upstream_connect_timeout_seconds = 8.0 + stream_idle_timeout_seconds = 45.0 + 
max_sse_event_bytes = 1024 + image_inline_fetch_enabled = False + log_upstream_request_payload = False + proxy_request_budget_seconds = 75.0 + log_upstream_request_summary = False + + monkeypatch.setattr(proxy_module, "get_settings", lambda: Settings()) + monkeypatch.setattr( + proxy_module, + "get_model_registry", + lambda: SimpleNamespace(prefers_websockets=lambda model: model == "gpt-5.4"), + ) + monkeypatch.setattr(proxy_module, "_maybe_log_upstream_request_start", lambda **kwargs: None) + monkeypatch.setattr(proxy_module, "_maybe_log_upstream_request_complete", lambda **kwargs: None) + + session = _SseSession( + _SsePostResponse([b'data: {"type":"response.completed","response":{"id":"resp_http_image_tool"}}\n\n']) + ) + payload = ResponsesRequest.model_validate( + { + "model": "gpt-5.4", + "instructions": "draw", + "input": [{"role": "user", "content": "draw"}], + "tools": [{"type": "image_generation"}], + } + ) + + events = [ + event + async for event in proxy_module.stream_responses( + payload, + headers={"originator": "codex_chatgpt_desktop"}, + access_token="token", + account_id="acc_1", + session=cast(proxy_module.aiohttp.ClientSession, session), + ) + ] + + assert session.calls + assert not getattr(session, "ws_calls", []) + assert events == ['data: {"type":"response.completed","response":{"id":"resp_http_image_tool"}}\n\n'] + + @pytest.mark.asyncio async def test_stream_responses_http_transport_keeps_http(monkeypatch): class Settings: diff --git a/tests/unit/test_select_with_stickiness.py b/tests/unit/test_select_with_stickiness.py index ea5830144..e31eb0784 100644 --- a/tests/unit/test_select_with_stickiness.py +++ b/tests/unit/test_select_with_stickiness.py @@ -261,7 +261,7 @@ async def test_pool_exhausted_but_better_candidate_exists_reallocates(): @pytest.mark.asyncio -async def test_round_robin_pool_health_check_uses_round_robin_probe(): +async def test_round_robin_pool_health_check_prefers_budget_safe_candidate(): now = time.time() acc_a = 
AccountState("a", AccountStatus.ACTIVE, used_percent=96.0, last_selected_at=now - 10) acc_b = AccountState("b", AccountStatus.ACTIVE, used_percent=50.0, last_selected_at=now - 1) @@ -277,13 +277,13 @@ async def test_round_robin_pool_health_check_uses_round_robin_probe(): ) assert result.account is not None - assert result.account.account_id == "a" - repo.delete.assert_not_called() - repo.upsert.assert_called_once_with("key-round-robin", "a", kind=StickySessionKind.PROMPT_CACHE) + assert result.account.account_id == "b" + repo.delete.assert_called_once_with("key-round-robin", kind=StickySessionKind.PROMPT_CACHE) + repo.upsert.assert_called_once_with("key-round-robin", "b", kind=StickySessionKind.PROMPT_CACHE) @pytest.mark.asyncio -async def test_capacity_weighted_pool_health_check_uses_capacity_probe(): +async def test_capacity_weighted_pool_health_check_prefers_budget_safe_candidate(): acc_a = AccountState( "a", AccountStatus.ACTIVE, @@ -319,9 +319,9 @@ async def test_capacity_weighted_pool_health_check_uses_capacity_probe(): ) assert result.account is not None - assert result.account.account_id == "a" - repo.delete.assert_not_called() - repo.upsert.assert_called_once_with("key-capacity-weighted", "a", kind=StickySessionKind.PROMPT_CACHE) + assert result.account.account_id == "b" + repo.delete.assert_called_once_with("key-capacity-weighted", kind=StickySessionKind.PROMPT_CACHE) + repo.upsert.assert_called_once_with("key-capacity-weighted", "b", kind=StickySessionKind.PROMPT_CACHE) @pytest.mark.asyncio @@ -739,7 +739,7 @@ async def test_budget_threshold_95_no_reallocation_at_85_percent(): @pytest.mark.asyncio -async def test_budget_threshold_does_not_reallocate_codex_session_affinity(): +async def test_budget_threshold_reallocates_codex_session_affinity(): acc_a = _active("a", used_percent=96.0) acc_b = _active("b", used_percent=50.0) repo = _make_sticky_repo(existing_account_id="a") @@ -755,9 +755,9 @@ async def 
test_budget_threshold_does_not_reallocate_codex_session_affinity(): ) assert result.account is not None - assert result.account.account_id == "a" - repo.delete.assert_not_called() - repo.upsert.assert_not_called() + assert result.account.account_id == "b" + repo.delete.assert_called_once_with("codex-session-123", kind=StickySessionKind.CODEX_SESSION) + repo.upsert.assert_called_once_with("codex-session-123", "b", kind=StickySessionKind.CODEX_SESSION) @pytest.mark.asyncio diff --git a/tests/unit/test_settings_multi_replica.py b/tests/unit/test_settings_multi_replica.py index d8e67c3a3..3bdf62b25 100644 --- a/tests/unit/test_settings_multi_replica.py +++ b/tests/unit/test_settings_multi_replica.py @@ -27,6 +27,7 @@ def test_settings_multi_replica_defaults(): assert settings.proxy_response_create_limit == 64 assert settings.proxy_compact_response_create_limit == 16 assert settings.proxy_downstream_websocket_idle_timeout_seconds == 120.0 + assert settings.max_sse_event_bytes == 16 * 1024 * 1024 assert settings.proxy_refresh_failure_cooldown_seconds == 5.0 assert settings.usage_refresh_auth_failure_cooldown_seconds == 300.0 assert settings.otel_enabled is False