Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 36 additions & 27 deletions agent/discovery/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,14 @@ def validate_analysis(
) -> List[str]:
"""Validate post-analysis output.

Citations now live in sidecar ``{component}.citations.json`` files
produced by the citation extractor (the ``## Citations`` block is
stripped from the Markdown after extraction to avoid duplication).

Checks:
- Every component has a corresponding analysis .md file
- Analysis files contain a ## Citations section with valid JSON
- Every component has a corresponding .citations.json file with a
non-empty citations array
- Cited files exist in the repository (spot check)
- Cited line ranges are plausible (start <= end, within file length)
"""
Expand All @@ -134,65 +139,69 @@ def validate_analysis(
errors.append(f"Missing analysis file for component '{comp.name}'")
continue

# Spot-check citations
citation_errors = _validate_citations_in_file(
analysis_file, comp.name, repo_root
# Spot-check citations from the sidecar JSON
citations_file = analyses_dir / f"{comp.name}.citations.json"
citation_errors = _validate_citations_sidecar(
citations_file, comp.name, repo_root
)
errors.extend(citation_errors)

return errors


def _validate_citations_in_file(
md_path: Path,
def _validate_citations_sidecar(
sidecar_path: Path,
component_name: str,
repo_root: Path,
max_spot_checks: int = 5,
) -> List[str]:
"""Validate citations embedded in a Markdown analysis file.
"""Validate citations from the ``{component}.citations.json`` sidecar.

Performs lightweight checks:
- ## Citations section exists
- Sidecar file exists
- JSON is parseable
- Spot-checks up to max_spot_checks citations for file existence and line plausibility.
- Contains a non-empty ``citations`` array
- Spot-checks up to max_spot_checks citations for file existence and
line plausibility.
"""
import json
import re

errors: List[str] = []
text = md_path.read_text(encoding="utf-8")

# Look for ## Citations + ```json block
pattern = re.compile(
r"^## Citations\b.*?```json\s*\n(.*?)\n\s*```",
re.MULTILINE | re.DOTALL,
)
match = pattern.search(text)
if not match:
if not sidecar_path.exists():
errors.append(
f"[{component_name}] Missing ## Citations section with JSON block"
f"[{component_name}] Missing sidecar file: {sidecar_path.name}"
)
return errors

json_text = match.group(1).strip()
if not json_text:
errors.append(f"[{component_name}] ## Citations JSON block is empty")
return errors

try:
data = json.loads(json_text)
payload = json.loads(sidecar_path.read_text(encoding="utf-8"))
except json.JSONDecodeError as exc:
errors.append(f"[{component_name}] Invalid JSON in ## Citations: {exc}")
errors.append(
f"[{component_name}] Invalid JSON in {sidecar_path.name}: {exc}"
)
return errors

if not isinstance(payload, dict):
errors.append(
f"[{component_name}] {sidecar_path.name} must be a JSON object, "
f"got {type(payload).__name__}"
)
return errors

data = payload.get("citations", [])
if not isinstance(data, list):
errors.append(
f"[{component_name}] ## Citations must be a JSON array, got {type(data).__name__}"
f"[{component_name}] 'citations' must be a JSON array, "
f"got {type(data).__name__}"
)
return errors

if len(data) == 0:
errors.append(f"[{component_name}] ## Citations array is empty (expected 10+)")
errors.append(
f"[{component_name}] citations array is empty (expected 10+)"
)
return errors

# Spot-check a sample of citations
Expand Down
144 changes: 138 additions & 6 deletions agent/utils/citation_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,48 @@
re.MULTILINE | re.DOTALL,
)

# Patterns for stripping extracted JSON sections out of the Markdown once
# they've been lifted into sidecar files. Each pattern matches the whole
# section (heading line + fenced ```json block) so it can be removed in one
# substitution. The ``\b`` after the heading keyword keeps e.g.
# ``## Analysis Summary`` from being caught by the ``## Analysis Data`` pattern.
def _fenced_json_section_re(heading: str) -> re.Pattern[str]:
    """Compile a pattern matching ``## <heading>`` plus its fenced JSON block."""
    return re.compile(
        rf"^## {heading}\b[^\n]*\n.*?```json\s*\n.*?\n\s*```\s*\n?",
        re.MULTILINE | re.DOTALL,
    )


_STRIP_CITATIONS_RE = _fenced_json_section_re("Citations")
_STRIP_ANALYSIS_DATA_RE = _fenced_json_section_re("Analysis Data")
# Collapse runs of three-or-more newlines (i.e. two-plus blank lines) left
# behind by the strips down to a single blank line.
_COLLAPSE_BLANK_LINES_RE = re.compile(r"\n{3,}")


def strip_extracted_sections(markdown_text: str) -> str:
    """Remove ``## Citations`` and ``## Analysis Data`` JSON blocks from Markdown.

    These sections are emitted by the component-analyzer for machine
    consumption: the citations are lifted into a sidecar ``.citations.json``
    file, and ``## Analysis Data`` is never consumed downstream. Keeping them
    in the human-facing Markdown would just duplicate machine artefacts.

    Idempotent: applying the function to its own output is a no-op, so
    re-running the pipeline over an already-cleaned analysis file is safe.

    Args:
        markdown_text: Raw Markdown from a component analysis file.

    Returns:
        Markdown with both JSON sections removed, blank-line runs collapsed,
        and exactly one trailing newline.
    """
    cleaned = markdown_text
    # Drop each machine-readable section wholesale.
    for section_re in (_STRIP_CITATIONS_RE, _STRIP_ANALYSIS_DATA_RE):
        cleaned = section_re.sub("", cleaned)
    # Tidy the gaps the removals leave behind.
    cleaned = _COLLAPSE_BLANK_LINES_RE.sub("\n\n", cleaned)
    # Normalise trailing whitespace to exactly one newline.
    return cleaned.rstrip() + "\n"


@dataclass
class ExtractionResult:
Expand Down Expand Up @@ -86,13 +128,20 @@ def extract_citations_from_markdown(


def validate_citation_dict(
raw: Dict[str, Any], repo_root: Optional[Path] = None
raw: Dict[str, Any],
repo_root: Optional[Path] = None,
component_root: Optional[str] = None,
) -> Tuple[Optional[CodeCitation], List[str]]:
"""Validate a single raw citation dict and convert to CodeCitation.

Args:
raw: A dict from the parsed JSON array.
repo_root: Optional path to the source repository for file existence checks.
component_root: Optional repo-root-relative path of the component whose
analysis produced this citation (e.g. ``"omlx/models"``). Used as a
fallback prefix when the LLM emits component-relative paths like
``"models/base_model.py"`` instead of repo-root-relative
``"omlx/models/base_model.py"``.

Returns:
(CodeCitation or None, list of error strings).
Expand Down Expand Up @@ -133,11 +182,58 @@ def validate_citation_dict(
)
file_path = cleaned

# Optional: check that the file exists on disk
# Existence check with path-shape fallbacks. LLMs emit citation paths
# in several shapes; we try the canonical one first, then fall back.
if repo_root and file_path:
full_path = repo_root / file_path
if not full_path.exists():
normalised = file_path.lstrip("./").lstrip("/")

candidate_paths: List[str] = [normalised]
if component_root:
root_parts = [
p for p in component_root.strip("./").rstrip("/").split("/") if p
]
file_parts = [p for p in normalised.split("/") if p]

# Strategy A: dedupe trailing segments of component_root that
# overlap with leading segments of file_path, then join. Handles
# the common case where the LLM repeats the component's own
# directory name:
# component_root = "omlx/models"
# file_path = "models/base_model.py"
# -> merge to "omlx/models/base_model.py"
for overlap in range(
min(len(file_parts), len(root_parts)), 0, -1
):
if root_parts[-overlap:] == file_parts[:overlap]:
merged = "/".join(root_parts + file_parts[overlap:])
if merged not in candidate_paths:
candidate_paths.append(merged)
break

# Strategy B: straight prefix (no overlap). Handles the case
# where the LLM wrote a bare filename:
# component_root = "omlx/models"
# file_path = "base_model.py"
# -> "omlx/models/base_model.py"
joined = "/".join(root_parts + file_parts) if root_parts else normalised
if joined not in candidate_paths:
candidate_paths.append(joined)

resolved: Optional[str] = None
for candidate in candidate_paths:
if (repo_root / candidate).exists():
resolved = candidate
break

if resolved is None:
errors.append(f"Cited file does not exist: {file_path}")
elif resolved != file_path:
logger.debug(
"Resolved citation path via fallback: %s -> %s",
file_path,
resolved,
)
file_path = resolved

if errors:
return None, errors
Expand All @@ -156,13 +252,17 @@ def extract_component_citations(
md_path: Path,
component_name: str,
repo_root: Optional[Path] = None,
component_root: Optional[str] = None,
) -> ExtractionResult:
"""Extract and validate citations from a single component analysis Markdown file.

Args:
md_path: Path to the component's .md analysis file.
component_name: Name of the component (for labeling).
repo_root: Optional source repo path for file existence validation.
component_root: Optional repo-root-relative path of this component
(e.g. ``"omlx/models"``), used as a fallback path prefix when
the LLM emits component-relative citation paths.

Returns:
ExtractionResult with validated citations and any errors.
Expand All @@ -182,7 +282,9 @@ def extract_component_citations(
result.raw_count = len(raw_dicts)

for i, raw in enumerate(raw_dicts):
citation, val_errors = validate_citation_dict(raw, repo_root)
citation, val_errors = validate_citation_dict(
raw, repo_root, component_root
)
if val_errors:
for err in val_errors:
result.errors.append(f"Citation [{i}]: {err}")
Expand All @@ -197,6 +299,7 @@ def build_citations_index(
repo_root: Optional[Path] = None,
source_repo: str = "",
source_commit: str = "",
component_roots: Optional[Dict[str, str]] = None,
) -> Dict[str, Any]:
"""Extract citations from all analysis Markdown files and produce an index.

Expand All @@ -207,6 +310,12 @@ def build_citations_index(
repo_root: Optional source repo path for file existence validation.
source_repo: Repository URL for generating deep links.
source_commit: Git commit hash for permalink stability.
component_roots: Optional ``{component_name: root_path}`` map. When
provided, a citation path that isn't valid at the repo root will
be retried with the component's root_path as prefix. This
recovers citations where the LLM wrote paths relative to the
component directory (e.g. ``"base_model.py"`` for the ``models``
component at ``omlx/models``).

Returns:
Dict suitable for writing as all_citations.json, containing:
Expand All @@ -217,13 +326,19 @@ def build_citations_index(
from datetime import datetime, timezone

results: List[ExtractionResult] = []
component_roots = component_roots or {}

# Find all .md analysis files (skip any .citations.json files)
md_files = sorted(analyses_dir.glob("*.md"))

for md_path in md_files:
component_name = md_path.stem
result = extract_component_citations(md_path, component_name, repo_root)
result = extract_component_citations(
md_path,
component_name,
repo_root,
component_root=component_roots.get(component_name),
)
results.append(result)

# Write per-component citations file
Expand Down Expand Up @@ -254,6 +369,23 @@ def build_citations_index(
result.raw_count,
)

# Citations + analysis-data JSON blocks are now in the sidecar;
# strip them from the Markdown so the human-facing analysis isn't
# duplicating machine-readable artefacts. Guarded by the
# successful-write branch so that files with malformed citation
# blocks are left untouched for debugging.
try:
original = md_path.read_text(encoding="utf-8")
stripped = strip_extracted_sections(original)
if stripped != original:
md_path.write_text(stripped, encoding="utf-8")
except OSError as exc:
logger.warning(
"Could not strip extracted sections from %s: %s",
md_path,
exc,
)

# Build aggregated index
total_valid = sum(r.valid_count for r in results)
total_raw = sum(r.raw_count for r in results)
Expand Down
Loading
Loading