diff --git a/src/gateway/services/log_writer.py b/src/gateway/services/log_writer.py
index 00a360d..d874617 100644
--- a/src/gateway/services/log_writer.py
+++ b/src/gateway/services/log_writer.py
@@ -58,11 +58,14 @@ async def stop(self) -> None:
 class BatchLogWriter:
     """Queue usage logs and flush in batches."""
 
+    _STOP_TIMEOUT = 10.0
+
     def __init__(self, max_batch: int = 100, flush_interval: float = 1.0) -> None:
         self._queue: asyncio.Queue[UsageLog] = asyncio.Queue()
         self._max_batch = max_batch
         self._flush_interval = flush_interval
         self._task: asyncio.Task[None] | None = None
+        self._stop_event = asyncio.Event()
 
     async def put(self, log: UsageLog) -> None:
         await self._queue.put(log)
@@ -72,38 +75,61 @@ async def start(self) -> None:
         self._task = asyncio.create_task(self._run())
 
     async def stop(self) -> None:
-        if self._task:
+        # Graceful shutdown: signal the loop, let it finish the in-flight flush
+        # and drain the queue, then exit cleanly. Cancelling mid-flush would
+        # lose the batch (items are task_done()'d before commit).
+        if self._task is None:
+            return
+        self._stop_event.set()
+        try:
+            await asyncio.wait_for(self._task, self._STOP_TIMEOUT)
+        except asyncio.TimeoutError:
+            logger.error("BatchLogWriter stop timed out after %.1fs; cancelling", self._STOP_TIMEOUT)
             self._task.cancel()
             try:
                 await self._task
             except asyncio.CancelledError:
                 pass
-        await self._flush_all()
 
     async def _run(self) -> None:
-        while True:
+        try:
+            while not self._stop_event.is_set():
+                try:
+                    batch = await self._collect_batch()
+                    if batch:
+                        await self._flush(batch)
+                except Exception as e:  # pragma: no cover - defensive logging
+                    logger.error("BatchLogWriter loop error: %s", e)
+        finally:
             try:
-                batch = await self._collect_batch()
-                if batch:
-                    await self._flush(batch)
-            except asyncio.CancelledError:  # pragma: no cover - cooperative cancel
-                break
+                await self._flush_all()
             except Exception as e:  # pragma: no cover - defensive logging
-                logger.error("BatchLogWriter loop error: %s", e)
+                logger.error("BatchLogWriter final drain failed: %s", e)
 
     async def _collect_batch(self) -> list[UsageLog]:
-        batch: list[UsageLog] = []
+        # Wait for first item, stop signal, or flush interval - whichever first.
+        get_task = asyncio.ensure_future(self._queue.get())
+        stop_task = asyncio.ensure_future(self._stop_event.wait())
         try:
-            item = await asyncio.wait_for(self._queue.get(), timeout=self._flush_interval)
-            batch.append(item)
+            done, _ = await asyncio.wait(
+                {get_task, stop_task},
+                timeout=self._flush_interval,
+                return_when=asyncio.FIRST_COMPLETED,
+            )
+        finally:
+            if not stop_task.done():
+                stop_task.cancel()
+            if not get_task.done():
+                get_task.cancel()
+
+        batch: list[UsageLog] = []
+        if get_task in done:
+            batch.append(get_task.result())
             self._queue.task_done()
-        except asyncio.TimeoutError:
-            return batch
 
         while len(batch) < self._max_batch:
             try:
-                item = self._queue.get_nowait()
-                batch.append(item)
+                batch.append(self._queue.get_nowait())
                 self._queue.task_done()
             except asyncio.QueueEmpty:
                 break
@@ -132,12 +158,12 @@ async def _flush(self, batch: list[UsageLog]) -> None:
 
     async def _flush_all(self) -> None:
         batch: list[UsageLog] = []
-        while not self._queue.empty():
+        while True:
             try:
                 batch.append(self._queue.get_nowait())
-                self._queue.task_done()
             except asyncio.QueueEmpty:
                 break
+            self._queue.task_done()
         if batch:
             await self._flush(batch)
 
diff --git a/tests/load/README.md b/tests/load/README.md
new file mode 100644
index 0000000..221c402
--- /dev/null
+++ b/tests/load/README.md
@@ -0,0 +1,180 @@
+# Gateway load test — sync vs async throughput
+
+This directory contains a self-contained load test that demonstrates the
+throughput win from converting the gateway's DB layer from sync `psycopg2` to
+async `asyncpg`. It does **not** call any real LLM provider — it points the
+gateway at a local noop fake that returns a canned `ChatCompletion` response
+with a configurable per-request delay.
+
+## What it measures
+
+Two measured k6 scenarios run back-to-back (after a short low-VU warmup):
+
+| Scenario | User IDs | What it stresses |
+|---|---|---|
+| `distinct_users` | unique per VU | pure per-request gateway overhead; measures the ceiling on concurrent requests given one uvicorn worker |
+| `same_user` | one shared user_id | DB-row contention — on the sync build this serializes on `SELECT FOR UPDATE` held across the fake "LLM" call |
+
+## The short story on sync vs async
+
+**This branch (`julian/async-asyncpg`) contains no sync DB code.** Every gateway
+DB call goes through `sqlalchemy.ext.asyncio.AsyncSession` + `asyncpg`. To get
+a before/after comparison you run the load test against both branches:
+
+```bash
+# 1. checkout main (sync), run the load test, save results as "sync"
+git checkout main
+./tests/load/run_load_test.sh sync
+
+# 2. checkout this branch (async), run again, save as "async"
+git checkout julian/async-asyncpg
+./tests/load/run_load_test.sh async
+
+# 3. inspect /tmp/k6-sync.txt and /tmp/k6-async.txt side-by-side
+diff -u /tmp/k6-sync.txt /tmp/k6-async.txt
+```
+
+The `run_load_test.sh` argument (`sync`/`async`) is just a label — the script
+doesn't modify your checkout. Whichever code is on disk is what gets
+benchmarked.
+
+## Prerequisites
+
+### Install k6
+
+| Platform | Command |
+|---|---|
+| macOS (Homebrew) | `brew install k6` |
+| Linux (apt) | `sudo gpg -k && sudo gpg --no-default-keyring --keyring /usr/share/keyrings/k6-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys C5AD17C747E3415A3642D57D77C6C491D6AC1D69 && echo "deb [signed-by=/usr/share/keyrings/k6-archive-keyring.gpg] https://dl.k6.io/deb stable main" \| sudo tee /etc/apt/sources.list.d/k6.list && sudo apt-get update && sudo apt-get install k6` |
+| Windows (Chocolatey) | `choco install k6` |
+| Docker (no install) | `docker run --rm -i grafana/k6 run - < tests/load/load_test.js` |
+| Binary download | https://github.com/grafana/k6/releases |
+
+Verify: `k6 version`.
+
+### Other prerequisites
+
+- Docker (for the ephemeral Postgres container) — skipped if
+  `TEST_DATABASE_URL` is set and points at an existing Postgres
+- `uv` + this project's `gateway` extra (`uv sync --extra gateway`)
+
+## One-off runs
+
+```bash
+# noop (no artificial LLM delay) — measures pure gateway overhead
+FAKE_DELAY_MS=0 ./tests/load/run_load_test.sh async
+
+# realistic LLM-ish latency: median 200ms with a long tail
+FAKE_DELAY_MS=200 FAKE_JITTER_SIGMA=0.4 ./tests/load/run_load_test.sh async
+
+# crank up the load
+VUS=200 DURATION=60s ./tests/load/run_load_test.sh async
+```
+
+## Configuration knobs
+
+### Fake provider (`fake_provider.py`)
+
+The fake provider is a click CLI. Run it standalone with:
+
+```bash
+uv run --extra gateway python tests/load/fake_provider.py --help
+```
+
+| CLI flag | Default | Shell-script env | What it does |
+|---|---|---|---|
+| `--delay-ms` | `0` | `FAKE_DELAY_MS` | Median per-request delay in ms. `0` = return immediately. |
+| `--jitter-sigma` | `0.0` | `FAKE_JITTER_SIGMA` | Log-normal sigma around the median. `0` = fixed delay. Realistic values: `0.2` (tight), `0.4` (moderate), `0.6` (long-tail). |
+| `--delay-min-ms` | `0` | `FAKE_DELAY_MIN_MS` | Hard floor clamp after sampling. |
+| `--delay-max-ms` | `0` (unbounded) | `FAKE_DELAY_MAX_MS` | Hard ceiling clamp after sampling. |
+| `--host` | `127.0.0.1` | — | Bind host |
+| `--port` | `9999` | `FAKE_PORT` | Bind port |
+
+With `--delay-ms 200 --jitter-sigma 0.4` the sampled delays look
+roughly like:
+- p50: ~200ms
+- p95: ~390ms
+- p99: ~510ms
+
+That mirrors the shape of real LLM non-streaming latencies reasonably well
+(most responses clustered near a median, a tail of slow ones).
+
+### Load test (`load_test.js`)
+
+| Env var | Default | What it does |
+|---|---|---|
+| `KEY` | (required) | Gateway API key — created by `run_load_test.sh` automatically |
+| `GATEWAY` | `http://localhost:4000` | Gateway base URL |
+| `MODEL` | `openai:fake` | Model string sent in requests |
+| `VUS` | `100` | Virtual users per scenario |
+| `DURATION` | `30s` | Duration per scenario |
+
+## Expected shape of results
+
+With `FAKE_DELAY_MS=0` (noop upstream) and 100 VUs on a single worker:
+
+| Scenario | Branch | Expected throughput |
+|---|---|---|
+| `distinct_users` | `main` (sync) | low; bottlenecked by sync DB calls blocking the event loop per request |
+| `distinct_users` | `julian/async-asyncpg` | substantially higher; async DB calls yield, event loop interleaves |
+| `same_user` | `main` (sync) | **very low** — requests serialize on `SELECT FOR UPDATE` of the user row held across the upstream call |
+| `same_user` | `julian/async-asyncpg` | same as distinct_users — no event-loop blocking even under row-lock contention |
+
+The **same_user** scenario is the headline result: on `main` the gateway
+effectively serializes all requests for a single user, because the sync
+`SELECT FOR UPDATE` in `validate_user_budget` blocks the single async event
+loop while waiting on a contended row lock. On `julian/async-asyncpg` that
+wait yields, so other VUs' requests make progress.
+
+> ⚠️ Note: the exact numbers depend on your hardware, Postgres config, and
+> whether the fake provider is adding jitter. The **ratio** between sync and
+> async is what matters.
+
+## How the setup works under the hood
+
+`run_load_test.sh` orchestrates:
+
+1. **Postgres** — a `postgres:17` container on port 54329, unless
+   `TEST_DATABASE_URL` is already set
+2. **Fake provider** — `python tests/load/fake_provider.py` (a click CLI that runs uvicorn) on port 9999
+3. **Gateway** — `any-llm-gateway serve --config tests/load/gateway-config.yml`
+   on port 4000, with `providers.openai.api_base` pointing at the fake
+4. **API key** — created via `POST /v1/keys` using the master key
+5. **k6** — runs `load_test.js`: a 5s warmup, then both scenarios with start times 35 seconds apart (t=7s and t=42s)
+6. **Teardown** — shuts everything down on exit (including the Postgres
+   container if it started one)
+
+Output goes to `/tmp/k6-<label>.{txt,json}`.
+
+## Watching live during a run
+
+`run_load_test.sh` writes a CSV of the gateway process's CPU% / RSS MB every
+second to `/tmp/gateway-stats-<label>.csv` and prints avg/max at the end. For
+watching live in a separate terminal, use any of:
+
+| Tool | What it shows | Install |
+|---|---|---|
+| `htop` | interactive CPU / mem, all processes | `brew install htop` |
+| `top -pid $(pgrep -f any-llm-gateway)` | single-process CPU / mem, built-in | none |
+| `nettop -p $(pgrep -f any-llm-gateway)` | per-process network bytes in/out (macOS) | none |
+| `iftop` | per-interface network traffic | `brew install iftop` |
+| `nmon` | combined CPU/mem/disk/net, optional CSV record with `-f` | `brew install nmon` |
+
+The built-in `ps`-based sampler in `run_load_test.sh` is the recorded, scriptable
+source of truth — the interactive tools above are for eyeballing live.
+
+## Limitations
+
+- **Single worker.** The gateway is launched with `--workers 1` to isolate
+  the async-vs-sync signal. Running with more workers masks the win (each
+  worker has its own event loop, so a blocked worker only takes down its own
+  share of traffic).
+- **No streaming.** The fake provider only returns non-streaming responses.
+  Streaming would show an even larger win (because streaming sync holds the
+  DB transaction across the full response), but is harder to mock faithfully.
+- **Noop LLM.** Real providers add variable network latency that this fake
+  doesn't model exactly. Use `FAKE_JITTER_SIGMA` to get closer to reality.
+- **Database is not shared** across the sync and async runs unless
+  `TEST_DATABASE_URL` is set. For an apples-to-apples comparison, you
+  probably want a dedicated persistent Postgres and to let the script
+  clean up its own tables between runs.
diff --git a/tests/load/fake_provider.py b/tests/load/fake_provider.py
new file mode 100644
index 0000000..ff878e2
--- /dev/null
+++ b/tests/load/fake_provider.py
@@ -0,0 +1,208 @@
+"""Fake OpenAI-compatible upstream for load testing the gateway.
+
+Returns a canned ChatCompletion / CreateEmbedding / Messages response with a
+configurable per-request delay. Does not make any real LLM calls.
+
+Usage:
+    python tests/load/fake_provider.py --delay-ms 200 --jitter-sigma 0.4
+
+Run `python tests/load/fake_provider.py --help` for all options.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import math
+import random
+import time
+import uuid
+from dataclasses import dataclass
+from typing import Any
+
+import click
+import uvicorn
+from fastapi import APIRouter, FastAPI
+from fastapi.responses import JSONResponse
+
+
+@dataclass(frozen=True)
+class DelayConfig:
+    """Per-request delay distribution for the fake provider.
+
+    Attributes:
+        delay_ms: target median delay in ms. 0 means noop (return immediately).
+        jitter_sigma: log-normal sigma around the median. 0 means fixed delay.
+          Realistic LLM-ish values:
+            0.2  (tight: p95 ≈ 1.4x median)
+            0.4  (moderate: p95 ≈ 1.9x median)
+            0.6  (long-tail: p95 ≈ 2.7x median)
+        delay_min_ms: hard floor applied after sampling. 0 means no floor.
+        delay_max_ms: hard ceiling applied after sampling. 0 means no ceiling.
+    """
+
+    delay_ms: float
+    jitter_sigma: float
+    delay_min_ms: float
+    delay_max_ms: float
+
+    def sample_ms(self) -> float:
+        """Sample a per-request delay in ms from a log-normal distribution."""
+        if self.delay_ms <= 0:
+            return 0.0
+        if self.jitter_sigma <= 0:
+            sample = self.delay_ms
+        else:
+            # log-normal with median = delay_ms:
+            # mu = ln(median), sigma = jitter_sigma  =>  exp(mu) = median
+            sample = math.exp(random.gauss(math.log(self.delay_ms), self.jitter_sigma))
+        if self.delay_min_ms > 0:
+            sample = max(sample, self.delay_min_ms)
+        if self.delay_max_ms > 0:
+            sample = min(sample, self.delay_max_ms)
+        return sample
+
+    async def sleep(self) -> None:
+        ms = self.sample_ms()
+        if ms > 0:
+            await asyncio.sleep(ms / 1000.0)
+
+
+def _chat_completion_response(model: str) -> dict[str, Any]:
+    return {
+        "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
+        "object": "chat.completion",
+        "created": int(time.time()),
+        "model": model,
+        "choices": [
+            {
+                "index": 0,
+                "message": {"role": "assistant", "content": "ok"},
+                "finish_reason": "stop",
+            }
+        ],
+        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
+    }
+
+
+def _embedding_response(model: str, inputs: Any) -> dict[str, Any]:
+    count = len(inputs) if isinstance(inputs, list) else 1
+    return {
+        "object": "list",
+        "data": [{"object": "embedding", "index": i, "embedding": [0.0] * 8} for i in range(count)],
+        "model": model,
+        "usage": {"prompt_tokens": 4, "total_tokens": 4},
+    }
+
+
+def _anthropic_message_response(model: str) -> dict[str, Any]:
+    return {
+        "id": f"msg_{uuid.uuid4().hex[:12]}",
+        "type": "message",
+        "role": "assistant",
+        "content": [{"type": "text", "text": "ok"}],
+        "model": model,
+        "stop_reason": "end_turn",
+        "stop_sequence": None,
+        "usage": {"input_tokens": 10, "output_tokens": 5},
+    }
+
+
+def _make_routes(delay: DelayConfig) -> APIRouter:
+    router = APIRouter()
+
+    @router.post("/chat/completions")
+    async def chat_completions(body: dict[str, Any]) -> JSONResponse:
+        await delay.sleep()
+        return JSONResponse(_chat_completion_response(body.get("model", "fake")))
+
+    @router.post("/embeddings")
+    async def embeddings(body: dict[str, Any]) -> JSONResponse:
+        await delay.sleep()
+        return JSONResponse(_embedding_response(body.get("model", "fake"), body.get("input", "")))
+
+    @router.post("/messages")
+    async def messages(body: dict[str, Any]) -> JSONResponse:
+        await delay.sleep()
+        return JSONResponse(_anthropic_message_response(body.get("model", "fake")))
+
+    @router.get("/healthz")
+    async def healthz() -> dict[str, str]:
+        return {
+            "status": "ok",
+            "delay_ms": str(delay.delay_ms),
+            "jitter_sigma": str(delay.jitter_sigma),
+        }
+
+    return router
+
+
+def make_app(delay: DelayConfig) -> FastAPI:
+    app = FastAPI(title="fake-llm-provider")
+    # Mount routes under both / and /v1 so the fake works regardless of whether
+    # the caller's api_base already includes /v1.
+    app.include_router(_make_routes(delay))
+    app.include_router(_make_routes(delay), prefix="/v1")
+    return app
+
+
+@click.command()
+@click.option("--host", default="127.0.0.1", help="Host to bind")
+@click.option("--port", default=9999, type=int, help="Port to bind")
+@click.option(
+    "--delay-ms",
+    default=0.0,
+    type=float,
+    help="Target median per-request delay in ms (0 = noop, no sleep).",
+)
+@click.option(
+    "--jitter-sigma",
+    default=0.0,
+    type=float,
+    help="Log-normal sigma for jitter around the median (0 = fixed delay). "
+    "Typical values: 0.2 (tight), 0.4 (moderate), 0.6 (long-tail).",
+)
+@click.option(
+    "--delay-min-ms",
+    default=0.0,
+    type=float,
+    help="Hard floor for sampled delay in ms (0 = no floor).",
+)
+@click.option(
+    "--delay-max-ms",
+    default=0.0,
+    type=float,
+    help="Hard ceiling for sampled delay in ms (0 = no ceiling).",
+)
+@click.option(
+    "--seed",
+    default=None,
+    type=int,
+    help="Seed for the jitter RNG. Pass the same value across runs for "
+    "reproducible delay sampling.",
+)
+@click.option("--log-level", default="warning", help="Uvicorn log level")
+def main(
+    host: str,
+    port: int,
+    delay_ms: float,
+    jitter_sigma: float,
+    delay_min_ms: float,
+    delay_max_ms: float,
+    seed: int | None,
+    log_level: str,
+) -> None:
+    """Run the fake LLM provider on the given host:port."""
+    if seed is not None:
+        random.seed(seed)
+    delay = DelayConfig(
+        delay_ms=delay_ms,
+        jitter_sigma=jitter_sigma,
+        delay_min_ms=delay_min_ms,
+        delay_max_ms=delay_max_ms,
+    )
+    app = make_app(delay)
+    uvicorn.run(app, host=host, port=port, log_level=log_level)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/load/gateway-config.yml b/tests/load/gateway-config.yml
new file mode 100644
index 0000000..a0418ab
--- /dev/null
+++ b/tests/load/gateway-config.yml
@@ -0,0 +1,14 @@
+# Gateway config used by the load-test setup.
+# Points the OpenAI provider at the local fake_provider running on :9999.
+providers:
+  openai:
+    api_key: "fake-key-not-used-by-upstream"
+    api_base: "http://localhost:9999/v1"
+
+# Include pricing so log_usage exercises the full DB write path
+# (pricing lookup + spend update + usage log insert). This is what we want
+# to stress under load.
+pricing:
+  "openai:fake":
+    input_price_per_million: 1.0
+    output_price_per_million: 2.0
diff --git a/tests/load/load_test.js b/tests/load/load_test.js
new file mode 100644
index 0000000..b54765d
--- /dev/null
+++ b/tests/load/load_test.js
@@ -0,0 +1,190 @@
+// k6 load test comparing sync vs async gateway throughput against a fake
+// (noop) upstream LLM provider.
+//
+// Run: k6 run -e KEY=gw-xxx -e MASTER_KEY=... -e GATEWAY=http://localhost:4000 load_test.js
+//
+// Phases (scenarios run sequentially):
+//   1. warmup:         low-VU warmup to let pools / JIT warm up
+//   2. distinct_users: unique user_id per VU -> raw per-request overhead
+//   3. same_user:      every VU hits one user_id -> triggers DB row-lock
+//                      contention that serializes on sync gateways
+//
+// Lifecycle hooks:
+//   setup():     pre-creates the users the scenarios will reference
+//                (using the master key). Returns {users} to teardown.
+//   teardown():  soft-deletes the users after the run.
+//
+// Env vars:
+//   KEY            gateway API key (required, used by scenarios)
+//   MASTER_KEY     gateway master key (required, used by setup/teardown)
+//   GATEWAY        gateway base URL (default http://localhost:4000)
+//   MODEL          model string (default "openai:fake")
+//   VUS            virtual users per scenario (default 100)
+//   DURATION       duration per main scenario (default 30s)
+
+import http from 'k6/http';
+import { check, fail } from 'k6';
+
+const KEY = __ENV.KEY;
+const MASTER_KEY = __ENV.MASTER_KEY;
+const GATEWAY = __ENV.GATEWAY || 'http://localhost:4000';
+const MODEL = __ENV.MODEL || 'openai:fake';
+const VUS = parseInt(__ENV.VUS || '100', 10);
+const DURATION = __ENV.DURATION || '30s';
+
+if (!KEY) fail('KEY env var is required (gateway API key)');
+if (!MASTER_KEY) fail('MASTER_KEY env var is required (gateway master key)');
+
+const WARMUP_DURATION = '5s';
+const WARMUP_VUS = Math.max(2, Math.floor(VUS / 10));
+
+// Scenario timing:
+//   warmup:          starts at t=0,   runs 5s
+//   distinct_users:  starts at t=7s,  runs DURATION (default 30s)
+//   same_user:       starts at t=42s, runs DURATION (fixed start: overlaps distinct_users if DURATION > 35s)
+const DISTINCT_START = '7s';
+const SAME_START = '42s';
+
+export const options = {
+  summaryTrendStats: ['avg', 'min', 'med', 'max', 'p(50)', 'p(95)', 'p(99)'],
+  scenarios: {
+    warmup: {
+      executor: 'constant-vus',
+      vus: WARMUP_VUS,
+      duration: WARMUP_DURATION,
+      exec: 'distinctUsers',
+      gracefulStop: '1s',
+    },
+    distinct_users: {
+      executor: 'constant-vus',
+      vus: VUS,
+      duration: DURATION,
+      exec: 'distinctUsers',
+      startTime: DISTINCT_START,
+      gracefulStop: '5s',
+    },
+    same_user: {
+      executor: 'constant-vus',
+      vus: VUS,
+      duration: DURATION,
+      exec: 'sameUser',
+      startTime: SAME_START,
+      gracefulStop: '5s',
+    },
+  },
+  thresholds: {
+    // Keep main scenarios under 1% failure; warmup is not asserted.
+    'http_req_failed{scenario:distinct_users}': ['rate<0.01'],
+    'http_req_failed{scenario:same_user}': ['rate<0.01'],
+    // Force per-scenario tagged submetrics so handleSummary can read them.
+    'http_reqs{scenario:distinct_users}': ['rate>0'],
+    'http_reqs{scenario:same_user}': ['rate>0'],
+    'http_req_duration{scenario:distinct_users}': ['p(95)>=0'],
+    'http_req_duration{scenario:same_user}': ['p(95)>=0'],
+  },
+};
+
+const REQUEST_HEADERS = {
+  'X-AnyLLM-Key': `Bearer ${KEY}`,
+  'Content-Type': 'application/json',
+};
+
+const MASTER_HEADERS = {
+  'X-AnyLLM-Key': `Bearer ${MASTER_KEY}`,
+  'Content-Type': 'application/json',
+};
+
+function completionBody(userId) {
+  return JSON.stringify({
+    model: MODEL,
+    messages: [{ role: 'user', content: 'x' }],
+    user: userId,
+  });
+}
+
+export function setup() {
+  console.log(`[setup] creating ${VUS} distinct users + 1 shared user`);
+  const users = [];
+  for (let i = 1; i <= VUS; i++) {
+    const uid = `loadtest-distinct-${i}`;
+    const res = http.post(
+      `${GATEWAY}/v1/users`,
+      JSON.stringify({ user_id: uid, alias: `loadtest ${i}` }),
+      { headers: MASTER_HEADERS }
+    );
+    if (res.status !== 200 && res.status !== 409) {
+      fail(`setup: failed to create user ${uid}: status=${res.status} body=${res.body}`);
+    }
+    users.push(uid);
+  }
+  const sharedId = 'loadtest-shared';
+  const res = http.post(
+    `${GATEWAY}/v1/users`,
+    JSON.stringify({ user_id: sharedId, alias: 'loadtest shared' }),
+    { headers: MASTER_HEADERS }
+  );
+  if (res.status !== 200 && res.status !== 409) {
+    fail(`setup: failed to create shared user: status=${res.status} body=${res.body}`);
+  }
+  users.push(sharedId);
+  console.log(`[setup] created ${users.length} users`);
+  return { users };
+}
+
+export function teardown(data) {
+  console.log(`[teardown] soft-deleting ${data.users.length} users`);
+  for (const uid of data.users) {
+    http.del(`${GATEWAY}/v1/users/${uid}`, null, { headers: MASTER_HEADERS });
+  }
+}
+
+export function distinctUsers() {
+  const userId = `loadtest-distinct-${__VU}`;
+  const res = http.post(`${GATEWAY}/v1/chat/completions`, completionBody(userId), {
+    headers: REQUEST_HEADERS,
+  });
+  check(res, { '200': (r) => r.status === 200 });
+}
+
+export function sameUser() {
+  const res = http.post(`${GATEWAY}/v1/chat/completions`, completionBody('loadtest-shared'), {
+    headers: REQUEST_HEADERS,
+  });
+  check(res, { '200': (r) => r.status === 200 });
+}
+
+export function handleSummary(data) {
+  function fmt(v, digits) {
+    return (v === undefined || v === null) ? '—' : v.toFixed(digits);
+  }
+  function row(label, scenario) {
+    const reqs = data.metrics[`http_reqs{scenario:${scenario}}`];
+    const dur = data.metrics[`http_req_duration{scenario:${scenario}}`];
+    const failed = data.metrics[`http_req_failed{scenario:${scenario}}`];
+    const rps = fmt(reqs?.values?.rate, 1);
+    const p50 = fmt(dur?.values?.['p(50)'], 1);
+    const p95 = fmt(dur?.values?.['p(95)'], 1);
+    const p99 = fmt(dur?.values?.['p(99)'], 1);
+    const failRate = failed?.values?.rate !== undefined && failed?.values?.rate !== null
+      ? (failed.values.rate * 100).toFixed(2)
+      : '—';
+    return `  ${label.padEnd(18)} rps=${rps.padStart(7)}  p50=${p50.padStart(6)}ms  p95=${p95.padStart(6)}ms  p99=${p99.padStart(6)}ms  fail=${failRate}%`;
+  }
+
+  const lines = [
+    '',
+    '=== Gateway throughput summary ===',
+    `  GATEWAY    ${GATEWAY}`,
+    `  MODEL      ${MODEL}`,
+    `  VUS        ${VUS}   DURATION ${DURATION}`,
+    '',
+    row('distinct_users', 'distinct_users'),
+    row('same_user', 'same_user'),
+    '',
+  ];
+
+  return {
+    stdout: lines.join('\n'),
+    'load_results.json': JSON.stringify(data, null, 2),
+  };
+}
diff --git a/tests/load/results/results.md b/tests/load/results/results.md
new file mode 100644
index 0000000..ac05b51
--- /dev/null
+++ b/tests/load/results/results.md
@@ -0,0 +1,147 @@
+# Load test results: gateway DB layer + budget strategy
+
+Same noop fake provider (`FAKE_DELAY_MS=0`, `RNG_SEED=42`), same load
+(100 VUs × 30s for `distinct_users` and `same_user`, preceded by a 10-VU / 5s
+warmup), same hardware, same Postgres. Differences are branch + one config flag.
+
+## Scenarios run
+
+| Label | Branch | DB driver | `budget_strategy` | `log_writer_strategy` |
+|---|---|---|---|---|
+| **sync** | `main` (`0510c38`) | psycopg2 + sync `Session` | n/a (always FOR UPDATE) | single (inline) |
+| **async-for_update** | `julian/async-asyncpg` | asyncpg + `AsyncSession` | `for_update` | `single` |
+| **async-cas** | `julian/async-asyncpg` | asyncpg + `AsyncSession` | `cas` | `single` |
+| **async-disabled** | `julian/async-asyncpg` | asyncpg + `AsyncSession` | `disabled` | `single` |
+| **async-batch+disabled** | `julian/async-asyncpg` | asyncpg + `AsyncSession` | `disabled` | `batch` |
+
+## Headline numbers
+
+| Scenario | Total rps | distinct rps | same rps | Reqs OK | Coverage |
+|---|---|---|---|---|---|
+| sync | ~0 (stalled) | ~0 | ~1 | 391 | — |
+| async-for_update | 81.1 | 46.3 | 34.8 ⚠️ | 6,578 | — |
+| async-cas | 82.3 | 41.5 | 40.8 | 6,498 | — |
+| async-disabled | 89.5 | 44.8 | 44.7 | 7,113 | — |
+| **async-batch+disabled** 🏆 | **97.4** | 47.4 | **50.0** ✨ | **7,637** | **100%** |
+
+All async scenarios: **0 failures**. Sync: 160 timeouts (exhausted 15-conn pool).
+The batch writer run reports **100% row coverage** — 7,637 requests, 7,637 rows persisted, zero drops.
+
+### Detailed latencies and resource usage
+
+<details>
+<summary>Click for p50 / p95 / p99 per scenario + CPU / RSS</summary>
+
+| Scenario | distinct p50/p95/p99 | same p50/p95/p99 | CPU avg/max | RSS max |
+|---|---|---|---|---|
+| sync | — | — | 2.0% / 47.6% | 224 MB |
+| async-for_update | 767 / 2207 / 2990 ms | 1185 / 1476 / 1777 ms | 76.9% / 100% | 267 MB |
+| async-cas | 921 / 2017 / 2643 ms | 929 / 1805 / 2382 ms | 81.9% / 99% | 247 MB |
+| async-disabled | 889 / 1394 / 1734 ms | 891 / 1333 / 1596 ms | 83.7% / 99% | 217 MB |
+| async-batch+disabled | 798 / 1892 / 2594 ms | 791 / 1758 / 2390 ms | 86.4% / 101% | 265 MB |
+
+</details>
+
+## What each transition reveals
+
+### sync → async-for_update (driver swap, same strategy)
+
+> The biggest win. Same `FOR UPDATE` logic, but now it doesn't block the event loop.
+
+- **17× more successful requests** (6,578 vs 391)
+- **Zero failures** vs 160 timeouts
+- Gateway goes from 2% CPU (starved on pool) to 77% CPU (doing real work)
+- Clean finish at 1m14s vs sync stuck at 2m17s
+
+### async-for_update → async-cas (same driver, strategy change)
+
+> Eliminates the last bit of same-user contention by dropping FOR UPDATE entirely.
+
+- `same_user` throughput: **34.8 → 40.8 req/sec** (+17%)
+- `same_user` p99: **1777 → 2382 ms** — not an improvement; the latency distribution shifts instead (p50 drops 1185 → 929 ms while the tail grows)
+- **Key insight:** the gap between `distinct_users` (46.3) and `same_user` (34.8) under `for_update` is the contention cost. `cas` closes that gap: `distinct=41.5, same=40.8` — **no penalty for concurrent requests on the same user**
+- Total throughput roughly the same (async-for_update was already CPU-bound); the improvement shows up as latency consistency across scenarios
+
+### async-cas → async-disabled (skip validation entirely)
+
+> Upper bound: what does the gateway look like with zero budget overhead?
+
+- **+8-10% throughput** (~41 → ~45 req/sec per scenario)
+- p95 latency drops: `distinct 2017 → 1394 ms`, `same 1805 → 1333 ms`
+- Tells us `cas` costs roughly 8% vs no validation at all — cheap
+- Useful as a ceiling to measure future optimizations against
+
+### async-disabled → async-batch+disabled (stack the log writer optimization)
+
+> Keep budget validation off, and additionally move usage log writes off the request hot path.
+
+- Total throughput: **89.5 → 97.4 req/sec (+9%)** — highest of any run
+- `same_user` now **beats** `distinct_users` (50.0 vs 47.4) — the batch writer groups spend UPDATEs per user, so shared-user traffic gets *fewer* UPDATEs per batch
+- **100% row coverage on shutdown** — queue.join() drained 7,637 pending rows cleanly before the process exited
+- CPU jumps slightly (86% avg) because the worker isn't idling on log-write I/O anymore
+
+## The saturation floor
+
+All async scenarios land at ~81-97 req/sec total (combined distinct + same).
+The single uvicorn worker is **CPU-bound** (~100% peak) in every case. To go
+higher, increase `--workers`. The `distinct_users` p50 is around 770-920ms
+because 100 VUs competing for one worker produces a natural queue.
+
+## Recommendation
+
+**Budget strategy:**
+- **Default (for_update):** historical behavior. Safe when pointed at an async-capable gateway. Same-user contention costs ~17% throughput.
+- **cas (recommended):** lock-free, no same-user penalty, negligible overhead (~8%) vs not validating at all.
+- **disabled:** use only if you enforce budgets out-of-band.
+
+**Log writer strategy:**
+- **Default (single):** inline write per request, simple, durable for normal terminations.
+- **batch (recommended for high-throughput):** queues + flushes 100 rows / 1s. +9% throughput, groups spend UPDATEs per-user, 100% coverage on clean shutdown. Best-effort semantics (a SIGKILL loses the in-flight batch).
+
+Upgrade path: switch `GATEWAY_BUDGET_STRATEGY=cas` and/or `GATEWAY_LOG_WRITER_STRATEGY=batch` in your config. No schema change required.
+
+## Config
+
+| | value |
+|---|---|
+| VUs | 100 |
+| duration per scenario | 30s |
+| workers | 1 (single event loop) |
+| fake upstream delay | 0 ms (noop) |
+| RNG seed | 42 |
+| warmup | 10 VUs × 5s |
+
+## Raw artifacts
+
+- `k6-sync.txt` — k6 output for sync run (partial; killed during stuck teardown)
+- `k6-async.{txt,json}` — async + `for_update` (legacy default)
+- `k6-async-cas.{txt,json}` — async + `cas`
+- `k6-async-disabled.{txt,json}` — async + `disabled`
+- `k6-async-batch-disabled.{txt,json}` — async + `disabled` + `batch` log writer
+- `gateway-stats-*.csv` — per-second summed CPU% / RSS MB of all gateway processes
+- `run-*.md` — per-run metadata (branch, commit, config)
+
+## Reproducing
+
+```bash
+git checkout main
+./tests/load/run_load_test.sh sync
+
+git checkout julian/async-asyncpg
+
+# 1) legacy default
+BUDGET_STRATEGY=for_update ./tests/load/run_load_test.sh async
+
+# 2) lock-free
+BUDGET_STRATEGY=cas ./tests/load/run_load_test.sh async-cas
+
+# 3) budget checks off
+BUDGET_STRATEGY=disabled ./tests/load/run_load_test.sh async-disabled
+
+# 4) budget checks off + batched log writer (the stacked optimization)
+BUDGET_STRATEGY=disabled LOG_WRITER_STRATEGY=batch \
+  ./tests/load/run_load_test.sh async-batch-disabled
+
+# compare
+cat tests/load/results/results.md
+```
diff --git a/tests/load/run_load_test.sh b/tests/load/run_load_test.sh
new file mode 100755
index 0000000..bd0dbe6
--- /dev/null
+++ b/tests/load/run_load_test.sh
@@ -0,0 +1,222 @@
+#!/usr/bin/env bash
+# Orchestrator for the load-test comparison.
+#
+# Starts:
+#   - Postgres (via docker) if TEST_DATABASE_URL is not set
+#   - fake_provider on :9999
+#   - gateway on :4000 (configured to point at fake_provider)
+#
+# Then runs k6 and tears everything down.
+#
+# Usage:
+#   ./tests/load/run_load_test.sh [label]   # e.g. sync, async, async-cas (default: run)
+#
+# The label argument is used only for labeling the output (log lines and
+# result file names). You are responsible for checking out the branch you
+# want to test (main vs julian/async-asyncpg) before running.
+#
+# Prerequisites:
+#   - k6      ->   brew install k6          (see README.md)
+#   - docker  ->   for the ephemeral postgres (skipped if TEST_DATABASE_URL set)
+#   - uv      ->   for running the gateway + fake provider
+
+set -euo pipefail
+
+LABEL="${1:-run}"
+ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+cd "$ROOT"
+
+# Fake provider delay config (passed through as CLI args)
+FAKE_DELAY_MS="${FAKE_DELAY_MS:-0}"
+FAKE_JITTER_SIGMA="${FAKE_JITTER_SIGMA:-0}"
+FAKE_DELAY_MIN_MS="${FAKE_DELAY_MIN_MS:-0}"
+FAKE_DELAY_MAX_MS="${FAKE_DELAY_MAX_MS:-0}"
+
+VUS="${VUS:-100}"
+DURATION="${DURATION:-30s}"
+GATEWAY_PORT="${GATEWAY_PORT:-4000}"
+FAKE_PORT="${FAKE_PORT:-9999}"
+WORKERS="${WORKERS:-1}"
+# Budget validation strategy. One env var gives all three benchmark scenarios:
+#   BUDGET_STRATEGY=for_update  FOR UPDATE held across entire request (legacy default)
+#   BUDGET_STRATEGY=cas         lock-free conditional UPDATE, no FOR UPDATE (recommended)
+#   BUDGET_STRATEGY=disabled    skip validate_user_budget entirely
+BUDGET_STRATEGY="${BUDGET_STRATEGY:-for_update}"
+# Usage log writer strategy:
+#   LOG_WRITER_STRATEGY=single  write each event inline, 1 txn per event (default)
+#   LOG_WRITER_STRATEGY=batch   queue + flush in batches (up to 100 rows / 1s)
+LOG_WRITER_STRATEGY="${LOG_WRITER_STRATEGY:-single}"
+# Fixed seed by default so the jitter sampler produces the same sequence
+# across sync vs async runs. Override with RNG_SEED to vary.
+RNG_SEED="${RNG_SEED:-42}"
+# Persistent results directory (survives branch switches because it's untracked).
+RESULTS_DIR="${RESULTS_DIR:-$ROOT/tests/load/results}"
+mkdir -p "$RESULTS_DIR"
+
+# --- Prerequisite checks ------------------------------------------------------
+if ! command -v k6 >/dev/null 2>&1; then
+  echo "error: k6 is not installed."
+  echo "  macOS:   brew install k6"
+  echo "  linux:   https://k6.io/docs/get-started/installation/"
+  echo "  docker:  docker run --rm -i grafana/k6 run - < tests/load/load_test.js"
+  exit 1
+fi
+
+# --- Postgres -----------------------------------------------------------------
+if [[ -z "${TEST_DATABASE_URL:-}" ]]; then
+  echo "[setup] starting postgres container"
+  docker rm -f loadtest-pg >/dev/null 2>&1 || true
+  docker run -d --name loadtest-pg \
+    -e POSTGRES_USER=loadtest -e POSTGRES_PASSWORD=loadtest -e POSTGRES_DB=loadtest \
+    -p 54329:5432 postgres:17 >/dev/null
+  export DATABASE_URL="postgresql://loadtest:loadtest@localhost:54329/loadtest"
+  for _ in $(seq 1 30); do
+    if docker exec loadtest-pg pg_isready -U loadtest >/dev/null 2>&1; then break; fi
+    sleep 0.5
+  done
+else
+  export DATABASE_URL="$TEST_DATABASE_URL"
+fi
+
+# --- Fake provider ------------------------------------------------------------
+echo "[setup] starting fake_provider on :$FAKE_PORT (delay=${FAKE_DELAY_MS}ms sigma=${FAKE_JITTER_SIGMA} seed=${RNG_SEED})"
+uv run python tests/load/fake_provider.py \
+  --host 127.0.0.1 --port "$FAKE_PORT" \
+  --delay-ms "$FAKE_DELAY_MS" \
+  --jitter-sigma "$FAKE_JITTER_SIGMA" \
+  --delay-min-ms "$FAKE_DELAY_MIN_MS" \
+  --delay-max-ms "$FAKE_DELAY_MAX_MS" \
+  --seed "$RNG_SEED" \
+  > /tmp/fake_provider.log 2>&1 &
+FAKE_PID=$!
+
+# --- Gateway ------------------------------------------------------------------
+echo "[setup] starting gateway on :$GATEWAY_PORT ($LABEL, $WORKERS workers, budget=$BUDGET_STRATEGY, log_writer=$LOG_WRITER_STRATEGY)"
+export GATEWAY_MASTER_KEY="loadtest-master-key"
+export GATEWAY_BOOTSTRAP_API_KEY="true"
+export GATEWAY_BUDGET_STRATEGY="$BUDGET_STRATEGY"
+export GATEWAY_LOG_WRITER_STRATEGY="$LOG_WRITER_STRATEGY"
+uv run any-llm-gateway serve \
+  --config tests/load/gateway-config.yml \
+  --host 127.0.0.1 --port "$GATEWAY_PORT" --workers "$WORKERS" \
+  > /tmp/gateway.log 2>&1 &
+GATEWAY_PID=$!
+
+cleanup() {  # EXIT trap: stop the sampler, gateway and fake provider, then remove the postgres container we started
+  echo "[teardown] shutting down"
+  # STATS_PID is only assigned after the trap is installed; ${VAR:+...} keeps 'set -u' from aborting teardown on an early exit
+  kill ${STATS_PID:+"$STATS_PID"} "$GATEWAY_PID" "$FAKE_PID" 2>/dev/null || true
+  wait ${STATS_PID:+"$STATS_PID"} "$GATEWAY_PID" "$FAKE_PID" 2>/dev/null || true
+  if [[ -z "${TEST_DATABASE_URL:-}" ]]; then
+    docker rm -f loadtest-pg >/dev/null 2>&1 || true
+  fi
+}
+trap cleanup EXIT
+
+# wait for gateway
+for _ in $(seq 1 60); do
+  if curl -sf "http://127.0.0.1:$GATEWAY_PORT/health" >/dev/null 2>&1; then break; fi
+  sleep 0.5
+done
+curl -sf "http://127.0.0.1:$GATEWAY_PORT/health" >/dev/null || { echo "gateway didn't start"; cat /tmp/gateway.log; exit 1; }
+
+# --- Gateway process sampler (sum CPU%/RSS across all worker pids, every 1s)
+# Uvicorn forks workers under the launcher PID; sum their stats for a total.
+(
+  echo "timestamp,cpu_pct,rss_mb,n_procs"
+  while kill -0 "$GATEWAY_PID" 2>/dev/null; do
+    # match every process whose command line contains "any-llm-gateway serve" (pgrep -f matches by command line, not by parent PID)
+    PIDS=$(pgrep -f "any-llm-gateway serve" 2>/dev/null | tr '\n' ',' | sed 's/,$//')
+    if [[ -n "$PIDS" ]]; then
+      ps -o %cpu=,rss= -p "$PIDS" 2>/dev/null | \
+        awk -v t="$(date +%s)" 'BEGIN{c=0;r=0;n=0} {c+=$1; r+=$2; n++} END {printf "%s,%.1f,%.1f,%d\n", t, c, r/1024, n}'
+    fi
+    sleep 1
+  done
+) > "/tmp/gateway-stats-${LABEL}.csv" &
+STATS_PID=$!
+
+# --- Create an API key via master key ----------------------------------------
+KEY_RESPONSE=$(curl -sf -X POST "http://127.0.0.1:$GATEWAY_PORT/v1/keys" \
+  -H "X-AnyLLM-Key: Bearer $GATEWAY_MASTER_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"key_name":"loadtest"}')
+KEY=$(printf '%s' "$KEY_RESPONSE" | python3 -c 'import json,sys; print(json.load(sys.stdin)["key"])')
+echo "[setup] created gateway key ${KEY:0:12}… (user creation handled by k6 setup())"
+
+# --- Run k6 -------------------------------------------------------------------
+echo "[run] k6 -> $LABEL (VUS=$VUS DURATION=$DURATION)"
+k6 run \
+  -e KEY="$KEY" \
+  -e MASTER_KEY="$GATEWAY_MASTER_KEY" \
+  -e GATEWAY="http://127.0.0.1:$GATEWAY_PORT" \
+  -e VUS="$VUS" \
+  -e DURATION="$DURATION" \
+  tests/load/load_test.js | tee "/tmp/k6-${LABEL}.txt"
+
+mv load_results.json "/tmp/k6-${LABEL}.json" 2>/dev/null || true
+
+# --- Verify log rows persisted ------------------------------------------------
+# Shut the gateway down cleanly FIRST so any BatchLogWriter lifespan hook drains.
+# Then count rows in usage_logs; compare vs k6's iteration count.
+echo ""
+echo "[verify] stopping gateway to drain log writer"
+kill -TERM "$GATEWAY_PID" 2>/dev/null || true
+wait "$GATEWAY_PID" 2>/dev/null || true
+
+if [[ -z "${TEST_DATABASE_URL:-}" ]]; then
+  LOG_COUNT=$(docker exec loadtest-pg psql -U loadtest -d loadtest -At -c "SELECT COUNT(*) FROM usage_logs" 2>/dev/null || echo "?")
+else
+  LOG_COUNT=$(psql "$TEST_DATABASE_URL" -At -c "SELECT COUNT(*) FROM usage_logs" 2>/dev/null || echo "?")
+fi
+# k6 iteration count (total requests made across warmup + main scenarios)
+ITERATIONS=$(python3 -c "
+import json, sys
+d = json.load(open('/tmp/k6-${LABEL}.json'))
+try:
+    print(int(d['metrics']['iterations']['values']['count']))
+except Exception:
+    print('?')
+" 2>/dev/null || echo "?")
+echo ""
+echo "=== usage_logs persistence check ($LABEL) ==="
+echo "  k6 iterations completed : $ITERATIONS"
+echo "  rows in usage_logs      : $LOG_COUNT"
+if [[ "$LOG_COUNT" =~ ^[0-9]+$ ]] && [[ "$ITERATIONS" =~ ^[1-9][0-9]*$ ]]; then  # ITERATIONS is the divisor below, so require > 0
+  if [[ "$LOG_COUNT" -eq "$ITERATIONS" ]]; then
+    echo "  coverage                : 100% (no rows dropped)"
+  else
+    echo "  coverage                : $((LOG_COUNT * 100 / ITERATIONS))% ($((ITERATIONS - LOG_COUNT)) rows dropped)"
+  fi
+fi
+
+# --- Summarize gateway process stats -----------------------------------------
+STATS_SUMMARY=""
+if [[ -s "/tmp/gateway-stats-${LABEL}.csv" ]]; then
+  STATS_SUMMARY=$(awk -F',' '
+    NR==1 { next }
+    { cpu_sum+=$2; if ($2>cpu_max) cpu_max=$2; rss_sum+=$3; if ($3>rss_max) rss_max=$3; np=$4; n++ }
+    END {
+      if (n>0) {
+        printf "  processes      %d (master + workers)\n", np
+        printf "  samples        %d\n", n
+        printf "  cpu %% avg/max   %.1f / %.1f (summed across processes)\n", cpu_sum/n, cpu_max
+        printf "  rss MB avg/max  %.1f / %.1f (summed across processes)\n", rss_sum/n, rss_max
+      }
+    }
+  ' "/tmp/gateway-stats-${LABEL}.csv")
+  echo ""
+  echo "=== gateway process stats ($LABEL) ==="
+  echo "$STATS_SUMMARY"
+fi
+
+# --- Persist results to tests/load/results/ ----------------------------------
+cp "/tmp/k6-${LABEL}.txt" "$RESULTS_DIR/k6-${LABEL}.txt"
+cp "/tmp/k6-${LABEL}.json" "$RESULTS_DIR/k6-${LABEL}.json" 2>/dev/null || true
+cp "/tmp/gateway-stats-${LABEL}.csv" "$RESULTS_DIR/gateway-stats-${LABEL}.csv"
+
+echo ""
+echo "[done] results saved to $RESULTS_DIR:"
+echo "  k6-${LABEL}.txt        k6 summary"
+echo "  k6-${LABEL}.json       k6 full metrics"
+echo "  gateway-stats-${LABEL}.csv  gateway process cpu/rss samples"
diff --git a/tests/unit/test_log_writer.py b/tests/unit/test_log_writer.py
index 159795e..77983eb 100644
--- a/tests/unit/test_log_writer.py
+++ b/tests/unit/test_log_writer.py
@@ -1,11 +1,13 @@
+import asyncio
 from contextlib import asynccontextmanager
+from typing import Any
 from unittest.mock import AsyncMock, MagicMock
 
 import pytest
 from sqlalchemy.exc import SQLAlchemyError
 
 from gateway.models.entities import UsageLog
-from gateway.services.log_writer import SingleLogWriter
+from gateway.services.log_writer import BatchLogWriter, SingleLogWriter
 
 
 @pytest.mark.asyncio
@@ -25,3 +27,90 @@ async def _session_cm():  # type: ignore[return-annnotation]
     await writer.put(log)
 
     session.rollback.assert_awaited()
+
+
+def _make_log(i: int) -> UsageLog:  # minimal UsageLog; the id encodes i
+    return UsageLog(id=f"log-{i}", model="m", endpoint="/v1/test", status="success")
+
+
+class _RecordingFlushBatchWriter(BatchLogWriter):
+    """BatchLogWriter that records every flushed id instead of touching the DB.
+
+    Optionally blocks the first flush on an event so we can race it against stop().
+    """
+
+    def __init__(self, first_flush_gate: asyncio.Event | None = None, **kw: Any) -> None:
+        super().__init__(**kw)
+        self.flushed: list[str] = []  # ids passed to _flush, in flush order
+        self._first_flush_gate = first_flush_gate
+        self._first_flush_started = asyncio.Event()  # set when the gated flush begins
+        self._first_flush_done = False  # True once the gate has been released
+
+    async def _flush(self, batch: list[UsageLog]) -> None:
+        if not self._first_flush_done and self._first_flush_gate is not None:
+            self._first_flush_started.set()
+            await self._first_flush_gate.wait()  # park here until the test opens the gate
+            self._first_flush_done = True
+        self.flushed.extend(log.id for log in batch)  # record only; no DB write
+
+
+@pytest.mark.asyncio
+async def test_batch_writer_flushes_queued_items_on_stop() -> None:
+    writer = _RecordingFlushBatchWriter(max_batch=10, flush_interval=60.0)  # batch never fills; timer is 60s
+    await writer.start()
+    for i in range(5):
+        await writer.put(_make_log(i))
+
+    await writer.stop()
+
+    assert sorted(writer.flushed, key=lambda s: int(s.split("-")[1])) == [f"log-{i}" for i in range(5)]
+
+
+@pytest.mark.asyncio
+async def test_batch_writer_does_not_drop_in_flight_batch_on_stop() -> None:
+    """Regression: stop() used to cancel the task mid-_flush, losing the whole batch.
+
+    The graceful shutdown variant must let the in-flight flush complete.
+    """
+    gate = asyncio.Event()
+    writer = _RecordingFlushBatchWriter(first_flush_gate=gate, max_batch=10, flush_interval=0.01)
+    await writer.start()
+
+    for i in range(7):
+        await writer.put(_make_log(i))
+
+    await writer._first_flush_started.wait()
+
+    for i in range(7, 12):
+        await writer.put(_make_log(i))
+
+    stop_task = asyncio.create_task(writer.stop())
+    await asyncio.sleep(0.05)
+    assert not stop_task.done()
+    gate.set()
+    await stop_task
+
+    assert sorted(writer.flushed, key=lambda s: int(s.split("-")[1])) == [f"log-{i}" for i in range(12)]
+
+
+@pytest.mark.asyncio
+async def test_batch_writer_stop_times_out_and_cancels(monkeypatch: pytest.MonkeyPatch) -> None:
+    """If a flush wedges, stop() should bail out after the timeout rather than hang forever."""
+    gate = asyncio.Event()
+    writer = _RecordingFlushBatchWriter(first_flush_gate=gate, max_batch=10, flush_interval=0.01)
+    monkeypatch.setattr(writer, "_STOP_TIMEOUT", 0.05)
+    await writer.start()
+
+    await writer.put(_make_log(0))
+    await writer._first_flush_started.wait()
+
+    await writer.stop()
+
+    assert writer._task is not None and writer._task.done()
+    gate.set()
+
+
+@pytest.mark.asyncio
+async def test_batch_writer_stop_is_idempotent_when_not_started() -> None:
+    writer = BatchLogWriter()
+    await writer.stop()  # start() was never called; stop() must return without raising