Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 36 additions & 27 deletions agent/discovery/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,14 @@ def validate_analysis(
) -> List[str]:
"""Validate post-analysis output.

Citations now live in sidecar ``{component}.citations.json`` files
produced by the citation extractor (the ``## Citations`` block is
stripped from the Markdown after extraction to avoid duplication).

Checks:
- Every component has a corresponding analysis .md file
- Analysis files contain a ## Citations section with valid JSON
- Every component has a corresponding .citations.json file with a
non-empty citations array
- Cited files exist in the repository (spot check)
- Cited line ranges are plausible (start <= end, within file length)
"""
Expand All @@ -134,65 +139,69 @@ def validate_analysis(
errors.append(f"Missing analysis file for component '{comp.name}'")
continue

# Spot-check citations
citation_errors = _validate_citations_in_file(
analysis_file, comp.name, repo_root
# Spot-check citations from the sidecar JSON
citations_file = analyses_dir / f"{comp.name}.citations.json"
citation_errors = _validate_citations_sidecar(
citations_file, comp.name, repo_root
)
errors.extend(citation_errors)

return errors


def _validate_citations_in_file(
md_path: Path,
def _validate_citations_sidecar(
sidecar_path: Path,
component_name: str,
repo_root: Path,
max_spot_checks: int = 5,
) -> List[str]:
"""Validate citations embedded in a Markdown analysis file.
"""Validate citations from the ``{component}.citations.json`` sidecar.

Performs lightweight checks:
- ## Citations section exists
- Sidecar file exists
- JSON is parseable
- Spot-checks up to max_spot_checks citations for file existence and line plausibility.
- Contains a non-empty ``citations`` array
- Spot-checks up to max_spot_checks citations for file existence and
line plausibility.
"""
import json
import re

errors: List[str] = []
text = md_path.read_text(encoding="utf-8")

# Look for ## Citations + ```json block
pattern = re.compile(
r"^## Citations\b.*?```json\s*\n(.*?)\n\s*```",
re.MULTILINE | re.DOTALL,
)
match = pattern.search(text)
if not match:
if not sidecar_path.exists():
errors.append(
f"[{component_name}] Missing ## Citations section with JSON block"
f"[{component_name}] Missing sidecar file: {sidecar_path.name}"
)
return errors

json_text = match.group(1).strip()
if not json_text:
errors.append(f"[{component_name}] ## Citations JSON block is empty")
return errors

try:
data = json.loads(json_text)
payload = json.loads(sidecar_path.read_text(encoding="utf-8"))
except json.JSONDecodeError as exc:
errors.append(f"[{component_name}] Invalid JSON in ## Citations: {exc}")
errors.append(
f"[{component_name}] Invalid JSON in {sidecar_path.name}: {exc}"
)
return errors

if not isinstance(payload, dict):
errors.append(
f"[{component_name}] {sidecar_path.name} must be a JSON object, "
f"got {type(payload).__name__}"
)
return errors

data = payload.get("citations", [])
if not isinstance(data, list):
errors.append(
f"[{component_name}] ## Citations must be a JSON array, got {type(data).__name__}"
f"[{component_name}] 'citations' must be a JSON array, "
f"got {type(data).__name__}"
)
return errors

if len(data) == 0:
errors.append(f"[{component_name}] ## Citations array is empty (expected 10+)")
errors.append(
f"[{component_name}] citations array is empty (expected 10+)"
)
return errors

# Spot-check a sample of citations
Expand Down
144 changes: 138 additions & 6 deletions agent/utils/citation_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,48 @@
re.MULTILINE | re.DOTALL,
)

# Patterns for stripping extracted JSON sections out of the Markdown once
# they've been lifted into sidecar files. Each pattern matches the whole
# section (heading line + fenced ```json block) so it can be removed in one
# substitution. The ``\b`` after the heading keyword keeps e.g.
# ``## Analysis Summary`` from being caught by the ``## Analysis Data`` pattern.
def _fenced_json_section_re(heading: str) -> re.Pattern[str]:
    """Compile a pattern matching ``## <heading>`` plus its fenced JSON block."""
    return re.compile(
        rf"^## {heading}\b[^\n]*\n.*?```json\s*\n.*?\n\s*```\s*\n?",
        re.MULTILINE | re.DOTALL,
    )


_STRIP_CITATIONS_RE = _fenced_json_section_re("Citations")
_STRIP_ANALYSIS_DATA_RE = _fenced_json_section_re("Analysis Data")
# Collapse runs of three-or-more newlines (i.e. two-plus blank lines) left
# behind by the strips down to a single blank line.
_COLLAPSE_BLANK_LINES_RE = re.compile(r"\n{3,}")


def strip_extracted_sections(markdown_text: str) -> str:
    """Remove ``## Citations`` and ``## Analysis Data`` JSON blocks from Markdown.

    These sections are emitted by the component-analyzer for machine
    consumption: the citations are lifted into a sidecar ``.citations.json``
    file, and ``## Analysis Data`` is never consumed downstream. Keeping them
    in the human-facing Markdown would just duplicate machine artefacts.

    Idempotent: applying the function to its own output is a no-op, so
    re-running the pipeline over an already-cleaned analysis file is safe.

    Args:
        markdown_text: Raw Markdown from a component analysis file.

    Returns:
        Markdown with both JSON sections removed, blank-line runs collapsed,
        and exactly one trailing newline.
    """
    cleaned = markdown_text
    # Drop each machine-readable section wholesale.
    for section_re in (_STRIP_CITATIONS_RE, _STRIP_ANALYSIS_DATA_RE):
        cleaned = section_re.sub("", cleaned)
    # Tidy the gaps the removals leave behind.
    cleaned = _COLLAPSE_BLANK_LINES_RE.sub("\n\n", cleaned)
    # Normalise trailing whitespace to exactly one newline.
    return cleaned.rstrip() + "\n"


@dataclass
class ExtractionResult:
Expand Down Expand Up @@ -86,13 +128,20 @@ def extract_citations_from_markdown(


def validate_citation_dict(
raw: Dict[str, Any], repo_root: Optional[Path] = None
raw: Dict[str, Any],
repo_root: Optional[Path] = None,
component_root: Optional[str] = None,
) -> Tuple[Optional[CodeCitation], List[str]]:
"""Validate a single raw citation dict and convert to CodeCitation.

Args:
raw: A dict from the parsed JSON array.
repo_root: Optional path to the source repository for file existence checks.
component_root: Optional repo-root-relative path of the component whose
analysis produced this citation (e.g. ``"omlx/models"``). Used as a
fallback prefix when the LLM emits component-relative paths like
``"models/base_model.py"`` instead of repo-root-relative
``"omlx/models/base_model.py"``.

Returns:
(CodeCitation or None, list of error strings).
Expand Down Expand Up @@ -133,11 +182,58 @@ def validate_citation_dict(
)
file_path = cleaned

# Optional: check that the file exists on disk
# Existence check with path-shape fallbacks. LLMs emit citation paths
# in several shapes; we try the canonical one first, then fall back.
if repo_root and file_path:
full_path = repo_root / file_path
if not full_path.exists():
normalised = file_path.lstrip("./").lstrip("/")

candidate_paths: List[str] = [normalised]
if component_root:
root_parts = [
p for p in component_root.strip("./").rstrip("/").split("/") if p
]
file_parts = [p for p in normalised.split("/") if p]

# Strategy A: dedupe trailing segments of component_root that
# overlap with leading segments of file_path, then join. Handles
# the common case where the LLM repeats the component's own
# directory name:
# component_root = "omlx/models"
# file_path = "models/base_model.py"
# -> merge to "omlx/models/base_model.py"
for overlap in range(
min(len(file_parts), len(root_parts)), 0, -1
):
if root_parts[-overlap:] == file_parts[:overlap]:
merged = "/".join(root_parts + file_parts[overlap:])
if merged not in candidate_paths:
candidate_paths.append(merged)
break

# Strategy B: straight prefix (no overlap). Handles the case
# where the LLM wrote a bare filename:
# component_root = "omlx/models"
# file_path = "base_model.py"
# -> "omlx/models/base_model.py"
joined = "/".join(root_parts + file_parts) if root_parts else normalised
if joined not in candidate_paths:
candidate_paths.append(joined)

resolved: Optional[str] = None
for candidate in candidate_paths:
if (repo_root / candidate).exists():
resolved = candidate
break

if resolved is None:
errors.append(f"Cited file does not exist: {file_path}")
elif resolved != file_path:
logger.debug(
"Resolved citation path via fallback: %s -> %s",
file_path,
resolved,
)
file_path = resolved

if errors:
return None, errors
Expand All @@ -156,13 +252,17 @@ def extract_component_citations(
md_path: Path,
component_name: str,
repo_root: Optional[Path] = None,
component_root: Optional[str] = None,
) -> ExtractionResult:
"""Extract and validate citations from a single component analysis Markdown file.

Args:
md_path: Path to the component's .md analysis file.
component_name: Name of the component (for labeling).
repo_root: Optional source repo path for file existence validation.
component_root: Optional repo-root-relative path of this component
(e.g. ``"omlx/models"``), used as a fallback path prefix when
the LLM emits component-relative citation paths.

Returns:
ExtractionResult with validated citations and any errors.
Expand All @@ -182,7 +282,9 @@ def extract_component_citations(
result.raw_count = len(raw_dicts)

for i, raw in enumerate(raw_dicts):
citation, val_errors = validate_citation_dict(raw, repo_root)
citation, val_errors = validate_citation_dict(
raw, repo_root, component_root
)
if val_errors:
for err in val_errors:
result.errors.append(f"Citation [{i}]: {err}")
Expand All @@ -197,6 +299,7 @@ def build_citations_index(
repo_root: Optional[Path] = None,
source_repo: str = "",
source_commit: str = "",
component_roots: Optional[Dict[str, str]] = None,
) -> Dict[str, Any]:
"""Extract citations from all analysis Markdown files and produce an index.

Expand All @@ -207,6 +310,12 @@ def build_citations_index(
repo_root: Optional source repo path for file existence validation.
source_repo: Repository URL for generating deep links.
source_commit: Git commit hash for permalink stability.
component_roots: Optional ``{component_name: root_path}`` map. When
provided, a citation path that isn't valid at the repo root will
be retried with the component's root_path as prefix. This
recovers citations where the LLM wrote paths relative to the
component directory (e.g. ``"base_model.py"`` for the ``models``
component at ``omlx/models``).

Returns:
Dict suitable for writing as all_citations.json, containing:
Expand All @@ -217,13 +326,19 @@ def build_citations_index(
from datetime import datetime, timezone

results: List[ExtractionResult] = []
component_roots = component_roots or {}

# Find all .md analysis files (skip any .citations.json files)
md_files = sorted(analyses_dir.glob("*.md"))

for md_path in md_files:
component_name = md_path.stem
result = extract_component_citations(md_path, component_name, repo_root)
result = extract_component_citations(
md_path,
component_name,
repo_root,
component_root=component_roots.get(component_name),
)
results.append(result)

# Write per-component citations file
Expand Down Expand Up @@ -254,6 +369,23 @@ def build_citations_index(
result.raw_count,
)

# Citations + analysis-data JSON blocks are now in the sidecar;
# strip them from the Markdown so the human-facing analysis isn't
# duplicating machine-readable artefacts. Guarded by the
# successful-write branch so that files with malformed citation
# blocks are left untouched for debugging.
try:
original = md_path.read_text(encoding="utf-8")
stripped = strip_extracted_sections(original)
if stripped != original:
md_path.write_text(stripped, encoding="utf-8")
except OSError as exc:
logger.warning(
"Could not strip extracted sections from %s: %s",
md_path,
exc,
)

# Build aggregated index
total_valid = sum(r.valid_count for r in results)
total_raw = sum(r.raw_count for r in results)
Expand Down
Loading
Loading