diff --git a/agent/discovery/validator.py b/agent/discovery/validator.py index e794016..11516a5 100644 --- a/agent/discovery/validator.py +++ b/agent/discovery/validator.py @@ -119,9 +119,14 @@ def validate_analysis( ) -> List[str]: """Validate post-analysis output. + Citations now live in sidecar ``{component}.citations.json`` files + produced by the citation extractor (the ``## Citations`` block is + stripped from the Markdown after extraction to avoid duplication). + Checks: - Every component has a corresponding analysis .md file - - Analysis files contain a ## Citations section with valid JSON + - Every component has a corresponding .citations.json file with a + non-empty citations array - Cited files exist in the repository (spot check) - Cited line ranges are plausible (start <= end, within file length) """ @@ -134,65 +139,69 @@ def validate_analysis( errors.append(f"Missing analysis file for component '{comp.name}'") continue - # Spot-check citations - citation_errors = _validate_citations_in_file( - analysis_file, comp.name, repo_root + # Spot-check citations from the sidecar JSON + citations_file = analyses_dir / f"{comp.name}.citations.json" + citation_errors = _validate_citations_sidecar( + citations_file, comp.name, repo_root ) errors.extend(citation_errors) return errors -def _validate_citations_in_file( - md_path: Path, +def _validate_citations_sidecar( + sidecar_path: Path, component_name: str, repo_root: Path, max_spot_checks: int = 5, ) -> List[str]: - """Validate citations embedded in a Markdown analysis file. + """Validate citations from the ``{component}.citations.json`` sidecar. Performs lightweight checks: - - ## Citations section exists + - Sidecar file exists - JSON is parseable - - Spot-checks up to max_spot_checks citations for file existence and line plausibility. + - Contains a non-empty ``citations`` array + - Spot-checks up to max_spot_checks citations for file existence and + line plausibility. 
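+
+    A sidecar payload that passes all of these checks has the shape below
+    (fields beyond the checked ones are illustrative)::
+
+        {"citations": [{"file_path": "src/main.py", "start_line": 1,
+                        "end_line": 20, "claim": "entry point"}]}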
""" import json import re errors: List[str] = [] - text = md_path.read_text(encoding="utf-8") - # Look for ## Citations + ```json block - pattern = re.compile( - r"^## Citations\b.*?```json\s*\n(.*?)\n\s*```", - re.MULTILINE | re.DOTALL, - ) - match = pattern.search(text) - if not match: + if not sidecar_path.exists(): errors.append( - f"[{component_name}] Missing ## Citations section with JSON block" + f"[{component_name}] Missing sidecar file: {sidecar_path.name}" ) return errors - json_text = match.group(1).strip() - if not json_text: - errors.append(f"[{component_name}] ## Citations JSON block is empty") - return errors - try: - data = json.loads(json_text) + payload = json.loads(sidecar_path.read_text(encoding="utf-8")) except json.JSONDecodeError as exc: - errors.append(f"[{component_name}] Invalid JSON in ## Citations: {exc}") + errors.append( + f"[{component_name}] Invalid JSON in {sidecar_path.name}: {exc}" + ) return errors + if not isinstance(payload, dict): + errors.append( + f"[{component_name}] {sidecar_path.name} must be a JSON object, " + f"got {type(payload).__name__}" + ) + return errors + + data = payload.get("citations", []) if not isinstance(data, list): errors.append( - f"[{component_name}] ## Citations must be a JSON array, got {type(data).__name__}" + f"[{component_name}] 'citations' must be a JSON array, " + f"got {type(data).__name__}" ) return errors if len(data) == 0: - errors.append(f"[{component_name}] ## Citations array is empty (expected 10+)") + errors.append( + f"[{component_name}] citations array is empty (expected 10+)" + ) return errors # Spot-check a sample of citations diff --git a/agent/utils/citation_extractor.py b/agent/utils/citation_extractor.py index 78be773..d4940df 100644 --- a/agent/utils/citation_extractor.py +++ b/agent/utils/citation_extractor.py @@ -33,6 +33,48 @@ re.MULTILINE | re.DOTALL, ) +# Patterns for stripping extracted JSON sections out of the Markdown once +# they've been lifted into sidecar files. Each pattern matches the full +# section (heading + fenced JSON block) so it can be removed wholesale. +# Anchored with ``\b`` on the heading keyword so ``## Analysis Summary`` is +# not caught by the ``## Analysis Data`` pattern. +_STRIP_CITATIONS_RE = re.compile( + r"^## Citations\b[^\n]*\n.*?```json\s*\n.*?\n\s*```\s*\n?", + re.MULTILINE | re.DOTALL, +) +_STRIP_ANALYSIS_DATA_RE = re.compile( + r"^## Analysis Data\b[^\n]*\n.*?```json\s*\n.*?\n\s*```\s*\n?", + re.MULTILINE | re.DOTALL, +) +# Collapse three-or-more consecutive blank lines left behind by the strips. +_COLLAPSE_BLANK_LINES_RE = re.compile(r"\n{3,}") + + +def strip_extracted_sections(markdown_text: str) -> str: + """Remove ``## Citations`` and ``## Analysis Data`` JSON blocks from Markdown. + + These sections are emitted by the component-analyzer for machine + consumption, but the citations are lifted into a sidecar ``.citations.json`` + and ``## Analysis Data`` is never consumed downstream. Leaving them in the + human-facing Markdown is just duplicated noise. + + This function is idempotent: running it twice on the same text produces + the same result as running it once, so re-invoking the pipeline on an + already-cleaned analysis file is a no-op. + + Args: + markdown_text: Raw Markdown from a component analysis file. + + Returns: + Markdown with the two JSON sections removed and trailing blank-line + runs collapsed. Ends with a single trailing newline. 
+    """
+    cleaned = _STRIP_CITATIONS_RE.sub("", markdown_text)
+    cleaned = _STRIP_ANALYSIS_DATA_RE.sub("", cleaned)
+    cleaned = _COLLAPSE_BLANK_LINES_RE.sub("\n\n", cleaned)
+    # Normalise trailing whitespace to exactly one newline.
+    return cleaned.rstrip() + "\n"
+

 @dataclass
 class ExtractionResult:
@@ -86,13 +128,20 @@ def extract_citations_from_markdown(


 def validate_citation_dict(
-    raw: Dict[str, Any], repo_root: Optional[Path] = None
+    raw: Dict[str, Any],
+    repo_root: Optional[Path] = None,
+    component_root: Optional[str] = None,
 ) -> Tuple[Optional[CodeCitation], List[str]]:
     """Validate a single raw citation dict and convert to CodeCitation.

     Args:
         raw: A dict from the parsed JSON array.
         repo_root: Optional path to the source repository for file existence checks.
+        component_root: Optional repo-root-relative path of the component whose
+            analysis produced this citation (e.g. ``"omlx/models"``). Used as a
+            fallback prefix when the LLM emits component-relative paths like
+            ``"models/base_model.py"`` instead of repo-root-relative
+            ``"omlx/models/base_model.py"``.

     Returns:
         (CodeCitation or None, list of error strings).
@@ -133,11 +182,58 @@
     )
     file_path = cleaned

-    # Optional: check that the file exists on disk
+    # Existence check with path-shape fallbacks. LLMs emit citation paths
+    # in several shapes; we try the canonical one first, then fall back.
     if repo_root and file_path:
-        full_path = repo_root / file_path
-        if not full_path.exists():
+        # Peel "./" prefixes with a regex rather than lstrip("./"), which
+        # strips a character *set* and would eat the dot of ".github/...".
+        normalised = re.sub(r"^(?:\./)+", "", file_path).lstrip("/")
+
+        candidate_paths: List[str] = [normalised]
+        if component_root:
+            # Split on "/" and drop empty and "." segments; this keeps
+            # leading-dot directory names (".config") intact.
+            root_parts = [
+                p for p in component_root.split("/") if p and p != "."
+            ]
+            file_parts = [p for p in normalised.split("/") if p]
+
+            # Strategy A: dedupe trailing segments of component_root that
+            # overlap with leading segments of file_path, then join. Handles
+            # the common case where the LLM repeats the component's own
+            # directory name:
+            #   component_root = "omlx/models"
+            #   file_path = "models/base_model.py"
+            #   -> merge to "omlx/models/base_model.py"
+            for overlap in range(
+                min(len(file_parts), len(root_parts)), 0, -1
+            ):
+                if root_parts[-overlap:] == file_parts[:overlap]:
+                    merged = "/".join(root_parts + file_parts[overlap:])
+                    if merged not in candidate_paths:
+                        candidate_paths.append(merged)
+                    break
+
+            # Strategy B: straight prefix (no overlap). Handles the case
+            # where the LLM wrote a bare filename:
+            #   component_root = "omlx/models"
+            #   file_path = "base_model.py"
+            #   -> "omlx/models/base_model.py"
+            joined = "/".join(root_parts + file_parts) if root_parts else normalised
+            if joined not in candidate_paths:
+                candidate_paths.append(joined)
+
+        resolved: Optional[str] = None
+        for candidate in candidate_paths:
+            if (repo_root / candidate).exists():
+                resolved = candidate
+                break
+
+        if resolved is None:
             errors.append(f"Cited file does not exist: {file_path}")
+        elif resolved != file_path:
+            logger.debug(
+                "Resolved citation path via fallback: %s -> %s",
+                file_path,
+                resolved,
+            )
+            file_path = resolved

     if errors:
         return None, errors
@@ -156,6 +252,7 @@ def extract_component_citations(
     md_path: Path,
     component_name: str,
     repo_root: Optional[Path] = None,
+    component_root: Optional[str] = None,
 ) -> ExtractionResult:
     """Extract and validate citations from a single component analysis Markdown file.

     Args:
         md_path: Path to the component's .md analysis file.
         component_name: Name of the component (for labeling).
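+            In the index pipeline this is the analysis filename stem, so a
+            hypothetical ``auth-service.md`` is labeled ``auth-service``.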
repo_root: Optional source repo path for file existence validation. + component_root: Optional repo-root-relative path of this component + (e.g. ``"omlx/models"``), used as a fallback path prefix when + the LLM emits component-relative citation paths. Returns: ExtractionResult with validated citations and any errors. @@ -182,7 +282,9 @@ def extract_component_citations( result.raw_count = len(raw_dicts) for i, raw in enumerate(raw_dicts): - citation, val_errors = validate_citation_dict(raw, repo_root) + citation, val_errors = validate_citation_dict( + raw, repo_root, component_root + ) if val_errors: for err in val_errors: result.errors.append(f"Citation [{i}]: {err}") @@ -197,6 +299,7 @@ def build_citations_index( repo_root: Optional[Path] = None, source_repo: str = "", source_commit: str = "", + component_roots: Optional[Dict[str, str]] = None, ) -> Dict[str, Any]: """Extract citations from all analysis Markdown files and produce an index. @@ -207,6 +310,12 @@ def build_citations_index( repo_root: Optional source repo path for file existence validation. source_repo: Repository URL for generating deep links. source_commit: Git commit hash for permalink stability. + component_roots: Optional ``{component_name: root_path}`` map. When + provided, a citation path that isn't valid at the repo root will + be retried with the component's root_path as prefix. This + recovers citations where the LLM wrote paths relative to the + component directory (e.g. ``"base_model.py"`` for the ``models`` + component at ``omlx/models``). Returns: Dict suitable for writing as all_citations.json, containing: @@ -217,13 +326,19 @@ def build_citations_index( from datetime import datetime, timezone results: List[ExtractionResult] = [] + component_roots = component_roots or {} # Find all .md analysis files (skip any .citations.json files) md_files = sorted(analyses_dir.glob("*.md")) for md_path in md_files: component_name = md_path.stem - result = extract_component_citations(md_path, component_name, repo_root) + result = extract_component_citations( + md_path, + component_name, + repo_root, + component_root=component_roots.get(component_name), + ) results.append(result) # Write per-component citations file @@ -254,6 +369,23 @@ def build_citations_index( result.raw_count, ) + # Citations + analysis-data JSON blocks are now in the sidecar; + # strip them from the Markdown so the human-facing analysis isn't + # duplicating machine-readable artefacts. Guarded by the + # successful-write branch so that files with malformed citation + # blocks are left untouched for debugging. 
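+        # A failed strip is deliberately non-fatal: by this point the
+        # sidecar is already on disk, so no citation data can be lost;
+        # we just leave the duplicate block in place and warn.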
+ try: + original = md_path.read_text(encoding="utf-8") + stripped = strip_extracted_sections(original) + if stripped != original: + md_path.write_text(stripped, encoding="utf-8") + except OSError as exc: + logger.warning( + "Could not strip extracted sections from %s: %s", + md_path, + exc, + ) + # Build aggregated index total_valid = sum(r.valid_count for r in results) total_raw = sum(r.raw_count for r in results) diff --git a/tests/test_citations.py b/tests/test_citations.py index fb3181a..589cca3 100644 --- a/tests/test_citations.py +++ b/tests/test_citations.py @@ -11,6 +11,7 @@ validate_citation_dict, extract_component_citations, build_citations_index, + strip_extracted_sections, ExtractionResult, ) @@ -393,3 +394,325 @@ def test_source_url_enrichment(self, tmp_path): assert citation["source_url"] == ( "https://github.com/myorg/myrepo/blob/deadbeef/src/config.rs#L1-L15" ) + + +# --------------------------------------------------------------------------- +# component_root fallback resolution +# +# LLMs frequently emit citation paths that aren't repo-root-relative. The +# validator/extractor recover these using the component's root_path as a +# prefix candidate, with two strategies (suffix-overlap dedup, plain +# prepend). These tests pin each recovery path. +# --------------------------------------------------------------------------- + + +class TestComponentRootFallback: + def _make_repo(self, tmp_path, relpath): + full = tmp_path / relpath + full.parent.mkdir(parents=True, exist_ok=True) + full.write_text("content\n") + return full + + def test_repo_root_relative_path_accepted_as_is(self, tmp_path): + """If the LLM emits the canonical repo-root path, no rewrite happens.""" + self._make_repo(tmp_path, "omlx/models/base.py") + raw = { + "file_path": "omlx/models/base.py", + "start_line": 1, + "end_line": 5, + "claim": "base model", + } + citation, errors = validate_citation_dict( + raw, repo_root=tmp_path, component_root="omlx/models" + ) + assert errors == [] + assert citation.file_path == "omlx/models/base.py" + + def test_plain_prepend_for_bare_filename(self, tmp_path): + """Bare filename + component_root -> prepended full path.""" + self._make_repo(tmp_path, "omlx/models/base.py") + raw = { + "file_path": "base.py", + "start_line": 1, + "end_line": 5, + "claim": "base model", + } + citation, errors = validate_citation_dict( + raw, repo_root=tmp_path, component_root="omlx/models" + ) + assert errors == [] + assert citation.file_path == "omlx/models/base.py" + + def test_suffix_overlap_dedup(self, tmp_path): + """Path that repeats the component's directory name (a common LLM + mistake) is merged via overlap dedup, not naive concat.""" + self._make_repo(tmp_path, "omlx/models/base.py") + raw = { + "file_path": "models/base.py", # repeats last segment of component_root + "start_line": 1, + "end_line": 5, + "claim": "base model", + } + citation, errors = validate_citation_dict( + raw, repo_root=tmp_path, component_root="omlx/models" + ) + assert errors == [] + # NOT "omlx/models/models/base.py" — the dedup must win. 
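+        # Worked trace of Strategy A for this input:
+        #   root_parts = ["omlx", "models"], file_parts = ["models", "base.py"]
+        #   overlap=2: ["omlx", "models"] != ["models", "base.py"]
+        #   overlap=1: ["models"] == ["models"] -> merge
+        #   "/".join(["omlx", "models", "base.py"]) -> "omlx/models/base.py"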
+ assert citation.file_path == "omlx/models/base.py" + + def test_multi_segment_overlap(self, tmp_path): + """A longer overlap should dedup correctly too.""" + self._make_repo(tmp_path, "pkg/auth/token/jwt.rs") + raw = { + "file_path": "auth/token/jwt.rs", + "start_line": 1, + "end_line": 5, + "claim": "jwt", + } + citation, errors = validate_citation_dict( + raw, repo_root=tmp_path, component_root="pkg/auth/token" + ) + assert errors == [] + assert citation.file_path == "pkg/auth/token/jwt.rs" + + def test_leading_dot_slash_stripped(self, tmp_path): + self._make_repo(tmp_path, "omlx/models/base.py") + raw = { + "file_path": "./base.py", + "start_line": 1, + "end_line": 5, + "claim": "base", + } + citation, errors = validate_citation_dict( + raw, repo_root=tmp_path, component_root="omlx/models" + ) + assert errors == [] + assert citation.file_path == "omlx/models/base.py" + + def test_absolute_project_prefix_still_stripped(self, tmp_path): + """The existing /tmp/*/project/ strip continues to work alongside + the new component-root fallback.""" + self._make_repo(tmp_path, "omlx/models/base.py") + raw = { + "file_path": "/tmp/omlx/project/omlx/models/base.py", + "start_line": 1, + "end_line": 5, + "claim": "base", + } + citation, errors = validate_citation_dict( + raw, repo_root=tmp_path, component_root="omlx/models" + ) + assert errors == [] + assert citation.file_path == "omlx/models/base.py" + + def test_unresolvable_path_still_errors(self, tmp_path): + """If neither the bare path nor component-rooted variants exist, + we keep the original error.""" + raw = { + "file_path": "ghost/file.py", + "start_line": 1, + "end_line": 5, + "claim": "ghost", + } + citation, errors = validate_citation_dict( + raw, repo_root=tmp_path, component_root="omlx/models" + ) + assert citation is None + assert any("does not exist" in e for e in errors) + + def test_component_root_unused_when_no_repo_root(self): + """component_root only activates when repo_root is provided (i.e. + when we're actually doing existence checks).""" + raw = { + "file_path": "models/base.py", + "start_line": 1, + "end_line": 5, + "claim": "base", + } + citation, errors = validate_citation_dict( + raw, repo_root=None, component_root="omlx/models" + ) + assert errors == [] + # Path is passed through unchanged when we can't verify. 
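+        # (The whole fallback block is gated on ``if repo_root and
+        # file_path:``, so no candidate paths are built here at all.)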
+ assert citation.file_path == "models/base.py" + + def test_index_uses_component_roots_map(self, tmp_path): + """build_citations_index threads component_roots through to the + validator.""" + # Layout: repo with the component's real file + (tmp_path / "omlx/models").mkdir(parents=True) + (tmp_path / "omlx/models/base.py").write_text("x\n") + + # Analysis emits a component-relative path (the common failure mode) + analyses_dir = tmp_path / "service_analyses" + analyses_dir.mkdir() + (analyses_dir / "models.md").write_text( + dedent( + """ + # models + + ## Citations + ```json + [ + { + "file_path": "base.py", + "start_line": 1, + "end_line": 5, + "claim": "entry", + "section": "Key Components" + } + ] + ``` + """ + ) + ) + + index = build_citations_index( + analyses_dir=analyses_dir, + repo_root=tmp_path, + component_roots={"models": "omlx/models"}, + ) + assert index["metadata"]["total_citations"] == 1 + assert index["all_citations"][0]["file_path"] == "omlx/models/base.py" + + def test_index_without_component_roots_still_works(self, tmp_path): + """Backwards compatibility: callers that don't pass component_roots + get the pre-existing behavior (path accepted as-is or rejected).""" + (tmp_path / "src").mkdir() + (tmp_path / "src/main.rs").write_text("fn main() {}\n") + + analyses_dir = tmp_path / "service_analyses" + analyses_dir.mkdir() + (analyses_dir / "lib.md").write_text( + dedent( + """ + # lib + + ## Citations + ```json + [ + { + "file_path": "src/main.rs", + "start_line": 1, + "end_line": 1, + "claim": "entry", + "section": "Key Components" + } + ] + ``` + """ + ) + ) + + index = build_citations_index( + analyses_dir=analyses_dir, + repo_root=tmp_path, + # no component_roots + ) + assert index["metadata"]["total_citations"] == 1 + + +# --------------------------------------------------------------------------- +# strip_extracted_sections — ensures the duplicated JSON blocks are removed +# from the human-facing Markdown after being lifted into sidecar files. +# --------------------------------------------------------------------------- + + +class TestStripExtractedSections: + def test_strips_citations_section(self): + md = SAMPLE_MD_WITH_CITATIONS + cleaned = strip_extracted_sections(md) + assert "## Citations" not in cleaned + assert "```json" not in cleaned + # Prose sections around the block are preserved. + assert "## Architecture" in cleaned + assert "## Analysis Summary" in cleaned + + def test_strips_analysis_data_section(self): + md = dedent( + """\ + # lib + + ## Architecture + Prose. + + ## Analysis Data + ```json + {"summary": "x", "tech_stack": ["python"]} + ``` + + ## Citations + ```json + [{"file_path": "a.py", "start_line": 1, "end_line": 2, "claim": "c"}] + ``` + """ + ) + cleaned = strip_extracted_sections(md) + assert "## Analysis Data" not in cleaned + assert "## Citations" not in cleaned + assert "```json" not in cleaned + assert "## Architecture" in cleaned + + def test_does_not_strip_analysis_summary(self): + """Regex must not match ``## Analysis Summary`` (a common human section).""" + md = SAMPLE_MD_WITH_CITATIONS + cleaned = strip_extracted_sections(md) + assert "## Analysis Summary" in cleaned + + def test_idempotent(self): + md = SAMPLE_MD_WITH_CITATIONS + once = strip_extracted_sections(md) + twice = strip_extracted_sections(once) + assert once == twice + + def test_no_change_when_no_sections(self): + md = SAMPLE_MD_NO_CITATIONS + cleaned = strip_extracted_sections(md) + # Content survives; we only normalise trailing whitespace. 
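+        # (Compared via rstrip: the function always ends its output with
+        # exactly one newline, whatever the input ended with.)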
+        assert cleaned.rstrip() == md.rstrip()
+
+    def test_trailing_newline_normalised(self):
+        cleaned = strip_extracted_sections(SAMPLE_MD_WITH_CITATIONS)
+        # "Single trailing newline" means ends with "\n" but not "\n\n".
+        assert cleaned.endswith("\n")
+        assert not cleaned.endswith("\n\n")
+
+    def test_build_index_strips_md_after_extraction(self, tmp_path):
+        """End-to-end: running the pipeline removes the duplicated blocks
+        from the Markdown while preserving the sidecar JSON."""
+        analyses_dir = tmp_path / "service_analyses"
+        analyses_dir.mkdir()
+        md_path = analyses_dir / "my-library.md"
+        md_path.write_text(SAMPLE_MD_WITH_CITATIONS)
+
+        build_citations_index(
+            analyses_dir=analyses_dir,
+            source_repo="https://github.com/org/repo",
+            source_commit="abc",
+        )
+
+        # Sidecar was written
+        sidecar = analyses_dir / "my-library.citations.json"
+        assert sidecar.exists()
+
+        # MD no longer contains the extracted sections
+        cleaned_md = md_path.read_text()
+        assert "## Citations" not in cleaned_md
+        assert "```json" not in cleaned_md
+        # Non-extracted prose is preserved
+        assert "## Architecture" in cleaned_md
+
+    def test_build_index_leaves_md_untouched_on_extraction_failure(self, tmp_path):
+        """If extraction fails (e.g. invalid JSON), the MD must be left as-is
+        so the malformed block is still visible for debugging."""
+        analyses_dir = tmp_path / "service_analyses"
+        analyses_dir.mkdir()
+        md_path = analyses_dir / "bad.md"
+        md_path.write_text(SAMPLE_MD_INVALID_JSON)
+        original = md_path.read_text()
+
+        build_citations_index(analyses_dir=analyses_dir)
+
+        # MD unchanged
+        assert md_path.read_text() == original
+        # No sidecar written
+        assert not (analyses_dir / "bad.citations.json").exists()
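
Usage sketch for reviewers: wiring the new `component_roots` parameter through from a caller. Only `build_citations_index`, its parameters, and the index keys come from this diff; the checkout path, component names, and the manifest-derived mapping are hypothetical placeholders.

```python
# Hypothetical wiring sketch; see the hedges above. In practice the
# component_roots mapping would come from the discovery manifest.
from pathlib import Path

from agent.utils.citation_extractor import build_citations_index

repo_root = Path("/work/checkout")             # assumed repo checkout
analyses_dir = repo_root / "service_analyses"  # one .md per component

component_roots = {
    "models": "omlx/models",   # component name -> repo-root-relative root
    "auth": "pkg/auth/token",
}

index = build_citations_index(
    analyses_dir=analyses_dir,
    repo_root=repo_root,                        # enables existence checks
    source_repo="https://github.com/org/repo",  # used for deep links
    source_commit="deadbeef",                   # pins permalinks
    component_roots=component_roots,            # path-shape fallback
)
print(index["metadata"]["total_citations"])
```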