agentcontrol · amabito · Mar 31, 2026 · Apr 10, 2026 · Apr 11, 2026 · lan17
diff --git a/evaluators/contrib/budget/README.md b/evaluators/contrib/budget/README.md
@@ -0,0 +1,136 @@
+# agent-control-evaluator-budget
+
+Budget evaluator for agent-control that tracks cumulative LLM token and cost usage per scope and time window.
+
+## Install
+
+```bash
+pip install agent-control-evaluator-budget
+```
+
+## Quickstart
+
+```python
+from agent_control_evaluator_budget.budget import (
+    BudgetEvaluatorConfig,
+    BudgetLimitRule,
+    ModelPricing,
+)
+
+config = BudgetEvaluatorConfig(
+    budget_id="support-daily",
+    limits=[
+        BudgetLimitRule(
+            scope={"agent": "support"},
+            group_by="user_id",
+            window_seconds=86_400,
+            limit=500,
+            limit_unit="usd_cents",
+        ),
+        BudgetLimitRule(
+            scope={"agent": "support"},
+            group_by="user_id",
+            window_seconds=86_400,
+            limit=50_000,
+            limit_unit="tokens",
+        ),
+    ],
+    pricing={
+        "gpt-4.1-mini": ModelPricing(input_per_1k=0.04, output_per_1k=0.16),
+    },
+    model_path="model",
+    metadata_paths={
+        "agent": "metadata.agent",
+        "user_id": "metadata.user_id",
+    },
+    unknown_model_behavior="block",
+)
+```
+
+The evaluator reads token usage from standard fields such as `usage.input_tokens` and `usage.output_tokens`. Configure `token_path` only when your event shape uses a custom location.
+
+## Scope and group_by
+
+Each `BudgetLimitRule` has a static `scope` and an optional `group_by` field.
+
+`scope` filters which events a rule applies to. A rule with `scope={"agent": "support"}` only applies when extracted metadata contains `agent="support"`. An empty scope is global.
+
+`group_by` creates independent buckets per extracted metadata value. The common per-user pattern is:
+
+```python
+BudgetLimitRule(
+    scope={"agent": "support"},
+    group_by="user_id",
+    window_seconds=86_400,
+    limit=500,
+    limit_unit="usd_cents",
+)
+```
+
+With `metadata_paths={"user_id": "metadata.user_id"}`, each user gets a separate daily budget inside the support scope.
+
+## Budget pools
+
+`budget_id` identifies the accumulated budget pool.
+
+Evaluators with the same `budget_id` share accumulated spend and token totals across all evaluator instances that use the same store. With the default `InMemoryBudgetStore` that means instances in the same process only (see Limitations). Each evaluator still evaluates using its own configured rules -- the shared state is the bucket (the rolling sum), not the rule set. Evaluators with different `budget_id` values are fully isolated.
+
+Use stable names such as `support-daily`, `billing-global`, or `tenant-acme-monthly`. Avoid generating a new `budget_id` per request unless each request should have an isolated budget.
+
+## Pricing
+
+`ModelPricing` stores cost rates in USD cents per 1K tokens (for example, `0.04` means 0.04 cents per 1,000 tokens, not $0.04):
+
+```python
+ModelPricing(input_per_1k=0.04, output_per_1k=0.16)
+```
+
+`input_per_1k` is applied to input tokens. `output_per_1k` is applied to output tokens.
+
+Pricing is required when any rule uses `limit_unit="usd_cents"`. Token-only rules can omit pricing. If an event uses a model that is not in the pricing table and a cost rule exists, `unknown_model_behavior="block"` (the default) fails closed. Use `"warn"` to log a warning and treat the cost as 0.
+
+## Dual Ceiling Pattern
+
+Use two evaluators when cost and token ceilings need independent control records or different `budget_id` pools:
+
+```python
+cost_config = BudgetEvaluatorConfig(
+    budget_id="support-cost-daily",
+    limits=[
+        BudgetLimitRule(
+            scope={"agent": "support"},
+            group_by="user_id",
+            window_seconds=86_400,
+            limit=500,
+            limit_unit="usd_cents",
+        )
+    ],
+    pricing={
+        "gpt-4.1-mini": ModelPricing(input_per_1k=0.04, output_per_1k=0.16),
+    },
+    model_path="model",
+    metadata_paths={"agent": "metadata.agent", "user_id": "metadata.user_id"},
+)
+
+token_config = BudgetEvaluatorConfig(
+    budget_id="support-token-daily",
+    limits=[
+        BudgetLimitRule(
+            scope={"agent": "support"},
+            group_by="user_id",
+            window_seconds=86_400,
+            limit=50_000,
+            limit_unit="tokens",
+        )
+    ],
+    metadata_paths={"agent": "metadata.agent", "user_id": "metadata.user_id"},
+)
+```
+
+This pattern lets cost and token budgets reset, alert, and roll out independently. A single evaluator can also contain both rules when one shared pool and one control result are sufficient.
+
+## Limitations
+
+`InMemoryBudgetStore` is single-process only. State is lost on restart and is not shared across workers or pods.
+
+Use a distributed store for production deployments that run multiple processes, multiple workers, or multiple pods.
diff --git a/evaluators/contrib/budget/pyproject.toml b/evaluators/contrib/budget/pyproject.toml
@@ -0,0 +1,47 @@
+[project]
+name = "agent-control-evaluator-budget"
+version = "0.1.0"
+description = "Budget evaluator for agent-control -- cumulative LLM cost and token tracking"
+readme = "README.md"
+requires-python = ">=3.12"
+license = { text = "Apache-2.0" }
+authors = [{ name = "Agent Control Team" }]
+dependencies = [
+    "agent-control-evaluators>=3.0.0",
+    "agent-control-models>=3.0.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-asyncio>=0.23.0",
+    "ruff>=0.1.0",
+    "mypy>=1.8.0",
+]
+
+[project.entry-points."agent_control.evaluators"]
+budget = "agent_control_evaluator_budget.budget:BudgetEvaluator"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/agent_control_evaluator_budget"]
+
+[tool.ruff]
+line-length = 100
+target-version = "py312"
+
+[tool.ruff.lint]
+select = ["E", "F", "I"]
+
+[tool.uv.sources]
+agent-control-evaluators = { path = "../../builtin", editable = true }
+agent-control-models = { path = "../../../models", editable = true }
+
+[dependency-groups]
+dev = [
+    "pytest>=9.0.2",
+    "pytest-asyncio>=1.3.0",
+]
diff --git a/evaluators/contrib/budget/src/agent_control_evaluator_budget/__init__.py b/evaluators/contrib/budget/src/agent_control_evaluator_budget/__init__.py
diff --git a/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/__init__.py b/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/__init__.py
@@ -0,0 +1,24 @@
+"""Budget evaluator for per-agent LLM cost and token tracking."""
+
+from agent_control_evaluator_budget.budget.config import (
+    BudgetEvaluatorConfig,
+    BudgetLimitRule,
+    ModelPricing,
+)
+from agent_control_evaluator_budget.budget.evaluator import BudgetEvaluator
+from agent_control_evaluator_budget.budget.memory_store import InMemoryBudgetStore
+from agent_control_evaluator_budget.budget.store import BudgetSnapshot, BudgetStore
+
+# Note: clear_budget_stores is a testing utility and is intentionally not
+# re-exported here. Import it directly from the evaluator submodule in tests:
+#   from agent_control_evaluator_budget.budget.evaluator import clear_budget_stores
+
+__all__ = [  # public names re-exported by this package
+    "BudgetEvaluator",
+    "BudgetEvaluatorConfig",
+    "BudgetLimitRule",
+    "BudgetSnapshot",
+    "BudgetStore",
+    "InMemoryBudgetStore",
+    "ModelPricing",
+]
diff --git a/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/config.py b/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/config.py
@@ -0,0 +1,115 @@
+"""Configuration for the budget evaluator."""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from agent_control_evaluators._base import EvaluatorConfig
+from pydantic import Field, field_validator, model_validator
+
+# ---------------------------------------------------------------------------
+# Window convenience constants (seconds)
+# ---------------------------------------------------------------------------
+
+WINDOW_HOURLY = 3600  # 60 minutes
+WINDOW_DAILY = 86400  # 24 hours
+WINDOW_WEEKLY = 604800  # 7 days
+WINDOW_MONTHLY = 2592000  # 30 days (fixed length, not a calendar month)
+
+
+class ModelPricing(EvaluatorConfig):
+    """Per-model token pricing, expressed in USD cents per 1K tokens."""
+
+    input_per_1k: float = 0.0  # rate applied to input tokens; defaults to 0.0
+    output_per_1k: float = 0.0  # rate applied to output tokens; defaults to 0.0
+
+
+class BudgetLimitRule(EvaluatorConfig):
+    """A single budget limit rule.
+
+    Each rule defines a ceiling for a combination of scope dimensions
+    and time window. Multiple rules can apply to the same step -- the
+    evaluator checks all of them and triggers on the first breach.
+
+    Attributes:
+        scope: Static scope dimensions that must match for this rule
+            to apply. Empty dict = global rule.
+            Examples:
+                {"agent": "summarizer"} -- per-agent limit
+                {"agent": "summarizer", "channel": "slack"} -- agent+channel limit
+        group_by: If set, the limit is applied independently for each
+            unique value of this dimension. e.g. group_by="user_id" means
+            each user gets their own budget. None = shared/global limit.
+        window_seconds: Time window for accumulation in seconds.
+            None = cumulative (no reset). See WINDOW_* constants.
+        limit: Maximum usage in the window. Interpreted by limit_unit.
+        limit_unit: Unit for limit. usd_cents checks spend; tokens checks
+            input + output tokens.
+    """
+
+    scope: dict[str, str] = Field(default_factory=dict)  # empty dict = global rule
+    group_by: str | None = None  # None = shared/global limit
+    window_seconds: int | None = None  # None = cumulative (no reset)
+    limit: int  # required; must be > 0 (enforced by validate_limit)
+    limit_unit: Literal["usd_cents", "tokens"] = "usd_cents"  # cost-based by default
+
+    @field_validator("limit")
+    @classmethod
+    def validate_limit(cls, v: int) -> int:  # rejects zero and negative limits
+        if v <= 0:
+            raise ValueError("limit must be a positive integer")
+        return v
+
+    @field_validator("window_seconds")
+    @classmethod
+    def validate_window_seconds(cls, v: int | None) -> int | None:  # None is allowed
+        if v is not None and v <= 0:
+            raise ValueError("window_seconds must be positive")
+        return v
+
+
+class BudgetEvaluatorConfig(EvaluatorConfig):
+    """Configuration for the budget evaluator.
+
+    Attributes:
+        limits: Non-empty list of budget limit rules. Each is checked independently.
+        budget_id: Unique budget pool identifier. Same budget_id shares
+            accumulated spend. Different budget_id is fully isolated.
+        unknown_model_behavior: What to do when a model is not found in the
+            pricing table and a cost-based rule exists. block=fail closed,
+            warn=log warning and treat cost as 0.
+        pricing: Optional model pricing table. Maps model name to ModelPricing.
+            Used to derive cost in USD cents from token counts and model name.
+        token_path: Dot-notation path to extract token usage from step
+            data (e.g. "usage.total_tokens"). If None, looks for standard
+            fields (input_tokens, output_tokens, total_tokens, usage).
+        model_path: Dot-notation path to extract model name (for pricing lookup).
+        metadata_paths: Mapping of metadata field name to dot-notation path
+            in step data. Used to extract scope dimensions (channel, user_id, etc).
+    """
+
+    limits: list[BudgetLimitRule] = Field(min_length=1)  # at least one rule is required
+    budget_id: str = Field(
+        default="default",
+        description=(
+            "Unique budget pool identifier. Same budget_id shares accumulated spend. "
+            "Different budget_id is fully isolated."
+        ),
+    )
+    unknown_model_behavior: Literal["block", "warn"] = Field(
+        default="block",
+        description=(
+            "What to do when a model is not found in the pricing table and a cost-based "
+            "rule exists. block=fail closed, warn=log warning and treat cost as 0."
+        ),
+    )
+    pricing: dict[str, ModelPricing] | None = None  # required when any rule is usd_cents
+    token_path: str | None = None  # None = look for the standard usage fields
+    model_path: str | None = None  # None = no model path configured
+    metadata_paths: dict[str, str] = Field(default_factory=dict)  # field name -> dot path
+
+    @model_validator(mode="after")
+    def require_pricing_for_cost_rules(self) -> "BudgetEvaluatorConfig":
+        if self.pricing is None and any(rule.limit_unit == "usd_cents" for rule in self.limits):
+            raise ValueError('pricing is required when any rule uses limit_unit="usd_cents"')
+        return self  # NOTE(review): an empty pricing dict also passes -- confirm intended