diff --git a/agent/discovery/languages/python.py b/agent/discovery/languages/python.py index ac0f535..b51b66b 100644 --- a/agent/discovery/languages/python.py +++ b/agent/discovery/languages/python.py @@ -1,12 +1,42 @@ -"""Python language discovery plugin.""" +"""Python language discovery plugin. +Handles two patterns: +1. Multi-manifest repos: each pyproject.toml is a separate component. +2. Single-package monoliths (common in Python): one pyproject.toml at + the repo root with a source tree containing multiple top-level + sub-packages (e.g., omlx/engine, omlx/api, omlx/cache). The plugin + scans imports via AST to discover these sub-packages and traces + their internal dependency edges, mirroring the Go plugin's approach + for single-module monorepos. +""" + +import ast +import logging import re from pathlib import Path -from typing import List +from typing import Dict, List, Optional, Set, Tuple from .base import LanguagePlugin from agent.schemas.core import Component, ComponentKind, ExternalDependency +logger = logging.getLogger(__name__) + +# Directory names inside a source package that should NOT be treated as +# sub-package components (they aren't logical subsystems). +SUB_PACKAGE_SKIP_DIRS = { + "tests", "test", "_tests", "_test", "testing", + "__pycache__", "migrations", "fixtures", +} + +# A candidate sub-package must have at least this many .py files (inclusive +# of __init__.py) to be emitted as its own component. Filters out namespace +# shims that just re-export other packages. +MIN_SUB_PACKAGE_PY_FILES = 2 + +# Minimum number of sub-packages required before we bother splitting the +# project. Below this, a single root component is more informative. +MIN_SUB_PACKAGES_TO_SPLIT = 2 + class PythonPlugin(LanguagePlugin): @@ -45,19 +75,33 @@ def _parse_pyproject(self, path: Path, repo_root: Path) -> List[Component]: external_deps = self._extract_dependencies(text) internal_deps = self._find_internal_deps(text, repo_root, component_root) - # Classify + # Classify root component from the manifest kind = self._classify(text, component_root) - return [Component( + # Attempt to split into sub-packages. For monolithic Python packages + # with a substantial internal module tree, this yields one Component + # per top-level sub-package plus a root Component. + subpackage_components = self._discover_subpackages( + component_root=component_root, + repo_root=repo_root, + project_name=name, + manifest_path=path, + ) + + root_component = Component( name=name, kind=kind, type="python-package", root_path=rel_root or ".", manifest_path=str(path.relative_to(repo_root)), description=description, - internal_dependencies=internal_deps, + internal_dependencies=sorted( + set(internal_deps) | {c.name for c in subpackage_components} + ), external_dependencies=external_deps, - )] + ) + + return [root_component] + subpackage_components def _extract_field(self, text: str, field: str) -> str: """Extract a simple string field from pyproject.toml.""" @@ -108,7 +152,7 @@ def _extract_dependencies(self, text: str) -> List[ExternalDependency]: def _find_internal_deps( self, text: str, repo_root: Path, component_root: Path ) -> List[str]: - """Find internal (workspace) dependencies.""" + """Find internal (workspace) dependencies declared in the manifest.""" internal: List[str] = [] # Look for path dependencies in pyproject.toml @@ -120,7 +164,7 @@ def _find_internal_deps( return internal def _classify(self, text: str, component_root: Path) -> ComponentKind: - """Classify a Python package.""" + """Classify a Python package from manifest-level signals.""" text_lower = text.lower() # Has CLI scripts? @@ -161,3 +205,239 @@ def _classify(self, text: str, component_root: Path) -> ComponentKind: if has_gui_scripts: return ComponentKind.FRONTEND return ComponentKind.LIBRARY + + # ------------------------------------------------------------------ + # Sub-package discovery (for monolithic Python packages) + # ------------------------------------------------------------------ + + def _discover_subpackages( + self, + component_root: Path, + repo_root: Path, + project_name: str, + manifest_path: Path, + ) -> List[Component]: + """Emit Components for top-level sub-packages inside the main source tree. + + For a flat-layout project like: + pyproject.toml (name = "omlx") + omlx/ + __init__.py + cli.py + engine/ + __init__.py + core.py + api/ + __init__.py + routes.py + + This yields Components for ``engine`` and ``api``. Import edges between + them are traced via AST (both absolute and relative imports). + + Returns an empty list if the project doesn't meet the monolithic- + package pattern (no source tree, <2 sub-packages, etc.). + """ + src_root = self._find_source_package(component_root, project_name) + if src_root is None: + return [] + + # Collect candidate sub-package directories + candidates: List[Path] = [] + for entry in sorted(src_root.iterdir()): + if not entry.is_dir(): + continue + if entry.name in SUB_PACKAGE_SKIP_DIRS: + continue + if entry.name.startswith(".") or entry.name.startswith("_"): + # Skip dunder and private dirs (e.g., __pycache__, _internal) + continue + if not (entry / "__init__.py").exists(): + continue + py_files = [p for p in entry.rglob("*.py") if "__pycache__" not in p.parts] + if len(py_files) < MIN_SUB_PACKAGE_PY_FILES: + continue + candidates.append(entry) + + if len(candidates) < MIN_SUB_PACKAGES_TO_SPLIT: + return [] + + src_pkg_name = src_root.name # e.g., "omlx" + candidate_names = {c.name for c in candidates} + manifest_rel = str(manifest_path.relative_to(repo_root)) + + # Resolve cross-package imports for each candidate + components: List[Component] = [] + for sub_dir in candidates: + resolved_imports = self._scan_python_imports(sub_dir, src_root) + + # Keep imports that point at a sibling sub-package + internal: Set[str] = set() + for imp in resolved_imports: + parts = imp.split(".") + if ( + len(parts) >= 2 + and parts[0] == src_pkg_name + and parts[1] in candidate_names + and parts[1] != sub_dir.name + ): + internal.add(parts[1]) + + kind = self._classify_subpackage(sub_dir) + + components.append(Component( + name=sub_dir.name, + kind=kind, + type="python-package", + root_path=str(sub_dir.relative_to(repo_root)), + manifest_path=manifest_rel, + description="", + internal_dependencies=sorted(internal), + # Module-level external deps are declared once on the root + # component; intentionally left empty here to avoid duplication. + external_dependencies=[], + metadata={"parent_package": project_name}, + )) + + return components + + def _find_source_package( + self, component_root: Path, project_name: str + ) -> Optional[Path]: + """Locate the source package directory for a project. + + Tries (in order): + 1. ``/src//`` (src-layout, PEP 518) + 2. ``//`` (flat-layout) + 3. ``/src//`` + 4. ``//`` + + Returns the first directory that exists and contains ``__init__.py``. + """ + name_underscored = project_name.replace("-", "_") + candidates = [ + component_root / "src" / project_name, + component_root / project_name, + component_root / "src" / name_underscored, + component_root / name_underscored, + ] + for c in candidates: + if c.is_dir() and (c / "__init__.py").exists(): + return c + return None + + def _scan_python_imports( + self, sub_dir: Path, src_root: Path + ) -> Set[str]: + """AST-parse every .py file in ``sub_dir`` and return resolved import paths. + + Both absolute (``from foo.bar import X``) and relative + (``from ..bar import X``) imports are returned in absolute dotted form + relative to ``src_root.parent`` (e.g., ``omlx.engine.core``). + """ + src_package_parent = src_root.parent # dir that contains the top-level package + imports: Set[str] = set() + + for py_file in sub_dir.rglob("*.py"): + if "__pycache__" in py_file.parts: + continue + try: + source = py_file.read_text(encoding="utf-8", errors="replace") + tree = ast.parse(source, filename=str(py_file)) + except (SyntaxError, OSError): + continue + + # Compute dotted module path for this file (needed for relative resolution) + try: + rel = py_file.relative_to(src_package_parent) + except ValueError: + continue + module_parts = list(rel.with_suffix("").parts) + # If file is /foo/__init__.py, its module is .foo + if module_parts and module_parts[-1] == "__init__": + module_parts = module_parts[:-1] + + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + if alias.name: + imports.add(alias.name) + elif isinstance(node, ast.ImportFrom): + level = node.level or 0 + if level == 0: + # Absolute import: `from foo.bar import X` + if node.module: + imports.add(node.module) + else: + # Relative import: `from ..bar import X` + # Resolve against the importing module's parent package + base = module_parts[: max(0, len(module_parts) - level)] + if node.module: + resolved = base + node.module.split(".") + else: + resolved = base + if resolved: + imports.add(".".join(resolved)) + + return imports + + def _classify_subpackage(self, sub_dir: Path) -> ComponentKind: + """Classify a sub-package by scanning a bounded sample of its source. + + Framework detection uses word-boundary regex matching on ``import`` + statements specifically, to avoid false positives from substrings in + comments or docstrings (e.g., "dash" matching "dashboard"). + """ + web_frameworks = ("fastapi", "flask", "django", "starlette", "sanic", "aiohttp", "tornado") + cli_frameworks = ("argparse", "click", "typer", "docopt", "fire") + pipeline_markers = ("airflow", "dagster", "prefect", "luigi", "dbt") + frontend_markers = ("streamlit", "gradio", "panel", "dash") + + # Read up to 20 files, capped at 4 KB each — enough to catch framework + # imports and route decorators without scanning massive test fixtures. + sampled = "" + for py_file in list(sub_dir.rglob("*.py"))[:20]: + if "__pycache__" in py_file.parts: + continue + try: + sampled += py_file.read_text(encoding="utf-8", errors="replace")[:4000] + sampled += "\n" + except OSError: + continue + + if not sampled: + return ComponentKind.LIBRARY + + def has_import_of(markers: Tuple[str, ...]) -> bool: + # Match `import ` or `from [...] import ...` + # Requires word boundary after the marker so `dash` doesn't match `dashboard`. + pattern = ( + r"(?m)^\s*(?:from|import)\s+(" + + "|".join(re.escape(m) for m in markers) + + r")\b" + ) + return bool(re.search(pattern, sampled)) + + is_pipeline = has_import_of(pipeline_markers) + is_frontend = has_import_of(frontend_markers) + is_service = has_import_of(web_frameworks) and bool( + re.search( + r"(app\s*=\s*(FastAPI|Flask|Starlette|Sanic)|@(?:app|router|bp)\.(?:get|post|put|delete|patch|route))", + sampled, + ) + ) + is_cli = has_import_of(cli_frameworks) and bool( + re.search( + r"(argparse\.ArgumentParser|@click\.(?:command|group)|@app\.command|typer\.Typer|if\s+__name__\s*==\s*[\"']__main__[\"'])", + sampled, + ) + ) + + if is_pipeline: + return ComponentKind.PIPELINE + if is_frontend: + return ComponentKind.FRONTEND + if is_service: + return ComponentKind.SERVICE + if is_cli: + return ComponentKind.CLI + return ComponentKind.LIBRARY diff --git a/tests/discovery/test_python_plugin.py b/tests/discovery/test_python_plugin.py index ee2b164..5e8ede3 100644 --- a/tests/discovery/test_python_plugin.py +++ b/tests/discovery/test_python_plugin.py @@ -128,3 +128,306 @@ def test_subdirectory_manifest(self, plugin, repo): repo.root / "libs/mylib/pyproject.toml", repo.root ) assert comps[0].root_path == "libs/mylib" + + +class TestSubpackageDiscovery: + """Splits a monolithic Python package into sub-package components.""" + + def _write_flat_layout(self, repo, project_name="myapp"): + """Build a flat-layout project with 3 sub-packages: api, engine, cache.""" + repo.write("pyproject.toml", f""" +[project] +name = "{project_name}" +description = "An app" +""") + # Top-level package files + repo.write(f"{project_name}/__init__.py", "") + repo.write(f"{project_name}/cli.py", "print('cli')") + + # Sub-package: api (imports engine + cache) + repo.write(f"{project_name}/api/__init__.py", "") + repo.write( + f"{project_name}/api/routes.py", + f"from {project_name}.engine import Engine\n" + f"from ..cache import Cache\n", + ) + + # Sub-package: engine (imports cache) + repo.write(f"{project_name}/engine/__init__.py", "") + repo.write( + f"{project_name}/engine/core.py", + f"from {project_name}.cache import Cache\n", + ) + + # Sub-package: cache (no internal deps) + repo.write(f"{project_name}/cache/__init__.py", "") + repo.write(f"{project_name}/cache/lru.py", "class Cache: pass") + + def test_flat_layout_splits_into_subpackages(self, plugin, repo): + self._write_flat_layout(repo, "myapp") + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + + # Root + 3 sub-packages + assert len(comps) == 4 + names = {c.name for c in comps} + assert names == {"myapp", "api", "engine", "cache"} + + def test_subpackage_absolute_import_edges(self, plugin, repo): + self._write_flat_layout(repo, "myapp") + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + by_name = {c.name: c for c in comps} + + # engine -> cache (absolute import: from myapp.cache) + assert by_name["engine"].internal_dependencies == ["cache"] + # cache has no internal deps + assert by_name["cache"].internal_dependencies == [] + + def test_subpackage_relative_import_edges(self, plugin, repo): + self._write_flat_layout(repo, "myapp") + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + by_name = {c.name: c for c in comps} + + # api -> engine (absolute) and api -> cache (relative: from ..cache) + assert set(by_name["api"].internal_dependencies) == {"engine", "cache"} + + def test_root_links_to_subpackages(self, plugin, repo): + self._write_flat_layout(repo, "myapp") + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + by_name = {c.name: c for c in comps} + + # Root component's internal_dependencies include every sub-package + assert set(by_name["myapp"].internal_dependencies) >= {"api", "engine", "cache"} + + def test_subpackage_root_paths(self, plugin, repo): + self._write_flat_layout(repo, "myapp") + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + by_name = {c.name: c for c in comps} + + assert by_name["api"].root_path == "myapp/api" + assert by_name["engine"].root_path == "myapp/engine" + assert by_name["cache"].root_path == "myapp/cache" + + def test_subpackage_manifest_path_points_to_root_pyproject(self, plugin, repo): + self._write_flat_layout(repo, "myapp") + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + + for c in comps: + assert c.manifest_path == "pyproject.toml" + + def test_src_layout_splits_into_subpackages(self, plugin, repo): + """src/// layout is also supported.""" + repo.write("pyproject.toml", """ +[project] +name = "srcapp" +""") + repo.write("src/srcapp/__init__.py", "") + repo.write("src/srcapp/api/__init__.py", "") + repo.write("src/srcapp/api/routes.py", "x = 1") + repo.write("src/srcapp/engine/__init__.py", "") + repo.write("src/srcapp/engine/core.py", "y = 2") + + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + names = {c.name for c in comps} + assert {"srcapp", "api", "engine"} <= names + + by_name = {c.name: c for c in comps} + assert by_name["api"].root_path == "src/srcapp/api" + + def test_hyphenated_project_name_maps_to_underscored_dir(self, plugin, repo): + """A project named 'my-app' lives in 'my_app/' on disk.""" + repo.write("pyproject.toml", """ +[project] +name = "my-app" +""") + repo.write("my_app/__init__.py", "") + repo.write("my_app/alpha/__init__.py", "") + repo.write("my_app/alpha/a.py", "x = 1") + repo.write("my_app/beta/__init__.py", "") + repo.write("my_app/beta/b.py", "y = 2") + + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + names = {c.name for c in comps} + assert {"alpha", "beta"} <= names + + def test_skips_when_only_one_subpackage(self, plugin, repo): + """Fewer than MIN_SUB_PACKAGES_TO_SPLIT sub-packages keeps single-component behavior.""" + repo.write("pyproject.toml", """ +[project] +name = "tinyapp" +""") + repo.write("tinyapp/__init__.py", "") + repo.write("tinyapp/main.py", "print('hi')") + repo.write("tinyapp/onlysub/__init__.py", "") + repo.write("tinyapp/onlysub/thing.py", "x = 1") + + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + assert len(comps) == 1 + assert comps[0].name == "tinyapp" + + def test_skips_when_no_source_package(self, plugin, repo): + """No source tree on disk => single root component, no sub-packages.""" + repo.write("pyproject.toml", """ +[project] +name = "ghost" +""") + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + assert len(comps) == 1 + assert comps[0].name == "ghost" + + def test_skips_tests_and_private_dirs(self, plugin, repo): + """tests/, _private/, __pycache__/ etc. don't become components.""" + repo.write("pyproject.toml", """ +[project] +name = "app" +""") + repo.write("app/__init__.py", "") + # These three should become components + repo.write("app/alpha/__init__.py", "") + repo.write("app/alpha/a.py", "x = 1") + repo.write("app/beta/__init__.py", "") + repo.write("app/beta/b.py", "y = 2") + # These should NOT + repo.write("app/tests/__init__.py", "") + repo.write("app/tests/test_alpha.py", "pass") + repo.write("app/_internal/__init__.py", "") + repo.write("app/_internal/helpers.py", "pass") + + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + sub_names = {c.name for c in comps if c.name != "app"} + assert sub_names == {"alpha", "beta"} + + def test_skips_subpackage_with_too_few_files(self, plugin, repo): + """A sub-package dir with only __init__.py is ignored.""" + repo.write("pyproject.toml", """ +[project] +name = "app" +""") + repo.write("app/__init__.py", "") + repo.write("app/real/__init__.py", "") + repo.write("app/real/real.py", "x = 1") + repo.write("app/another/__init__.py", "") + repo.write("app/another/another.py", "y = 2") + # Too-small dir: only __init__.py + repo.write("app/stub/__init__.py", "") + + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + sub_names = {c.name for c in comps if c.name != "app"} + assert sub_names == {"real", "another"} + assert "stub" not in sub_names + + def test_subpackage_classification_service(self, plugin, repo): + """A sub-package with FastAPI routes is classified SERVICE.""" + repo.write("pyproject.toml", """ +[project] +name = "app" +""") + repo.write("app/__init__.py", "") + repo.write("app/api/__init__.py", "") + repo.write( + "app/api/server.py", + "from fastapi import FastAPI\n" + "app = FastAPI()\n" + "@app.get('/health')\n" + "def health():\n" + " return 'ok'\n", + ) + # Another plain library sub-package so we meet MIN_SUB_PACKAGES_TO_SPLIT + repo.write("app/lib/__init__.py", "") + repo.write("app/lib/stuff.py", "x = 1") + + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + by_name = {c.name: c for c in comps} + assert by_name["api"].kind == ComponentKind.SERVICE + assert by_name["lib"].kind == ComponentKind.LIBRARY + + def test_subpackage_classification_cli(self, plugin, repo): + """A sub-package with argparse + main guard is classified CLI.""" + repo.write("pyproject.toml", """ +[project] +name = "app" +""") + repo.write("app/__init__.py", "") + repo.write("app/tool/__init__.py", "") + repo.write( + "app/tool/__main__.py", + "import argparse\n" + "def main():\n" + " parser = argparse.ArgumentParser()\n" + "if __name__ == '__main__':\n" + " main()\n", + ) + repo.write("app/tool/helpers.py", "def h(): pass") + repo.write("app/lib/__init__.py", "") + repo.write("app/lib/stuff.py", "x = 1") + + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + by_name = {c.name: c for c in comps} + assert by_name["tool"].kind == ComponentKind.CLI + + def test_external_deps_only_on_root(self, plugin, repo): + """Module-level external deps are declared once on the root component.""" + repo.write("pyproject.toml", """ +[project] +name = "app" +dependencies = [ + "requests>=2", + "pydantic", +] +""") + repo.write("app/__init__.py", "") + repo.write("app/alpha/__init__.py", "") + repo.write("app/alpha/a.py", "x = 1") + repo.write("app/beta/__init__.py", "") + repo.write("app/beta/b.py", "y = 2") + + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + by_name = {c.name: c for c in comps} + root_deps = {d.name for d in by_name["app"].external_dependencies} + assert "requests" in root_deps + assert "pydantic" in root_deps + # Sub-packages don't duplicate module-level deps + assert by_name["alpha"].external_dependencies == [] + assert by_name["beta"].external_dependencies == [] + + def test_classification_ignores_framework_names_in_comments(self, plugin, repo): + """Substring matches in docstrings/comments must NOT trigger misclassification. + + Regression test: 'dashboard' should not match the 'dash' frontend + framework marker; only real `import dash` statements should. + """ + repo.write("pyproject.toml", """ +[project] +name = "app" +""") + repo.write("app/__init__.py", "") + repo.write("app/alpha/__init__.py", "") + repo.write( + "app/alpha/a.py", + '"""A dashboard-adjacent utility module."""\n' + "# This has nothing to do with the dash framework.\n" + "def helper():\n" + " return 42\n", + ) + repo.write("app/beta/__init__.py", "") + repo.write("app/beta/b.py", "class Thing: pass") + + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + by_name = {c.name: c for c in comps} + # Without the word-boundary fix, 'alpha' would be misclassified as FRONTEND + assert by_name["alpha"].kind == ComponentKind.LIBRARY + + def test_subpackage_metadata_links_back_to_parent(self, plugin, repo): + repo.write("pyproject.toml", """ +[project] +name = "app" +""") + repo.write("app/__init__.py", "") + repo.write("app/alpha/__init__.py", "") + repo.write("app/alpha/a.py", "x = 1") + repo.write("app/beta/__init__.py", "") + repo.write("app/beta/b.py", "y = 2") + + comps = plugin.parse_manifest(repo.root / "pyproject.toml", repo.root) + by_name = {c.name: c for c in comps} + assert by_name["alpha"].metadata.get("parent_package") == "app" + assert by_name["beta"].metadata.get("parent_package") == "app"