EmboFlow/scripts/check_doc_code_sync.py

#!/usr/bin/env python3
import argparse
import subprocess
import sys
from pathlib import Path


# Substrings looked for in the lowercased path to recognise design/doc files.
# Entries ending in "/" name directories; the rest are loose keywords.
DOC_PATTERNS = ("design/", "docs/", "adr", "architecture", "prd", "spec", "plan")

# File extensions treated as source code.
CODE_SUFFIXES = {
    ".py", ".rb", ".php", ".sh",
    ".ts", ".tsx", ".js", ".jsx",
    ".java", ".kt", ".scala", ".swift",
    ".go", ".rs",
}

# Directory substrings that mark a path as code regardless of its extension.
CODE_HINTS = ("apps/", "packages/", "scripts/")

# Substrings that mark a path as a test file. Note that "spec" is also listed
# in DOC_PATTERNS, which classify() consults before this tuple.
TEST_HINTS = ("test", "spec", "__tests__", "tests/")

# Extensions and path substrings that mark configuration / infrastructure files.
CONFIG_SUFFIXES = {".yml", ".yaml", ".json", ".toml", ".ini", ".env"}
CONFIG_HINTS = (
    "docker", "compose", "k8s", "helm", "terraform",
    ".github/", ".githooks/", ".env",
)


def run_git(repo: Path, *args: str) -> list[str]:
    """Run ``git -C <repo> <args...>`` and return its non-blank output lines.

    Args:
        repo: Directory passed to git via ``-C``.
        *args: Git sub-command and its arguments.

    Returns:
        The stdout lines that contain non-whitespace characters, with trailing
        whitespace removed. Leading whitespace is kept.

    Raises:
        RuntimeError: If git exits with a non-zero status. The message is
            git's stderr, or a generic fallback when stderr is empty.
    """
    command = ["git", "-C", str(repo)]
    command.extend(args)
    completed = subprocess.run(command, capture_output=True, text=True, check=False)

    if completed.returncode:
        raise RuntimeError(completed.stderr.strip() or "git command failed")

    lines: list[str] = []
    for raw in completed.stdout.splitlines():
        if raw.strip():
            # Only the right side is trimmed; leading columns stay intact.
            lines.append(raw.rstrip())
    return lines


def classify(path_text: str) -> str:
    """Assign a changed path to one of the report categories.

    Rules are tried in order and the first match wins: docs, tests, code,
    config, then ``"other"``. Hint tokens are matched as plain substrings of
    the lowercased path; file suffixes are compared case-insensitively.

    Args:
        path_text: Path text to classify (main() passes paths reported by git).

    Returns:
        One of ``"docs"``, ``"tests"``, ``"code"``, ``"config"`` or ``"other"``.
    """
    lower = path_text.lower()
    # Lowercase the suffix so "README.MD" or "Main.PY" are handled like their
    # lowercase spellings, matching the case-insensitive hint checks below.
    suffix = Path(path_text).suffix.lower()
    is_source = suffix in CODE_SUFFIXES

    if suffix == ".md":
        return "docs"
    for token in DOC_PATTERNS:
        if token not in lower:
            continue
        # Directory tokens (ending in "/") always mark documentation. Bare
        # keywords such as "spec", "plan" or "adr" are too loose to apply to
        # source files: they would turn "foo.spec.ts", "planner.py" or
        # "quadratic.py" into docs and shadow the test/code rules below.
        if token.endswith("/") or not is_source:
            return "docs"
    if any(token in lower for token in TEST_HINTS):
        return "tests"
    if is_source or any(token in lower for token in CODE_HINTS):
        return "code"
    if suffix in CONFIG_SUFFIXES or any(token in lower for token in CONFIG_HINTS):
        return "config"
    return "other"


def print_group(title: str, items: list[str]) -> None:
    """Print a titled bullet list, using a single ``none`` bullet when empty.

    Args:
        title: Heading printed after a blank line and followed by a colon.
        items: Entries to print, one bullet per entry.
    """
    bullets = items if items else ["none"]
    print(f"\n{title}:")
    for bullet in bullets:
        print(f"  - {bullet}")


def assess_changes(
    docs: list[str],
    code: list[str],
    tests: list[str],
    config: list[str],
    other: list[str],
    strict: bool,
) -> dict:
    """Derive warnings and blockers from the classified change lists.

    Args:
        docs: Changed design/doc paths.
        code: Changed code paths.
        tests: Changed test paths.
        config: Changed config/infra paths.
        other: Changed paths that matched no category.
        strict: When true, the "missing docs" findings also become blockers.

    Returns:
        A dict with ``"warnings"`` (all findings, in check order),
        ``"blockers"`` (the subset that blocks) and ``"blocking"``
        (whether any blocker exists).
    """
    docs_missing = not docs

    # Each check is (applies, message, may_block); order defines output order.
    checks = (
        (
            bool(code) and docs_missing,
            "Code changed but no design/doc files changed.",
            True,
        ),
        (
            bool(config) and docs_missing,
            "Config or deployment files changed without any doc updates.",
            True,
        ),
        (
            bool(docs) and not (code or config or tests),
            "Docs changed without code changes. This may be intentional, but verify they still match the repository.",
            False,
        ),
        (
            bool(code) and not tests,
            "Code changed without any test-file changes. Verify whether tests should change.",
            False,
        ),
        (
            bool(other),
            "Unclassified files changed. Confirm they do not affect documented behavior or runtime assumptions.",
            False,
        ),
    )

    warnings: list[str] = [text for applies, text, _ in checks if applies]
    blockers: list[str] = [
        text for applies, text, may_block in checks if applies and may_block and strict
    ]

    return {
        "warnings": warnings,
        "blockers": blockers,
        "blocking": bool(blockers),
    }


def _unquote_git_path(text: str) -> str:
    """Undo git's C-style quoting of a path; unquoted text is returned as is."""
    if len(text) < 2 or not (text.startswith('"') and text.endswith('"')):
        return text
    body = text[1:-1]
    try:
        # Resolve backslash escapes (including octal byte escapes), then
        # reinterpret the resulting byte values as UTF-8 text.
        raw = body.encode("utf-8").decode("unicode_escape").encode("latin-1")
    except (UnicodeDecodeError, UnicodeEncodeError):
        return body
    return raw.decode("utf-8", errors="replace")


def _status_entry_path(line: str) -> str:
    """Return the current path named by one ``git status --short`` line."""
    status, entry = line[:2], line[3:]
    if "R" in status or "C" in status:
        # Rename/copy entries read "old -> new"; only the new path exists now.
        if entry.endswith('"'):
            # The new path is quoted, so it begins at the last ' -> "'
            # (a quote inside a quoted path is always backslash-escaped).
            arrow = entry.rfind(' -> "')
            if arrow != -1:
                entry = entry[arrow + len(" -> "):]
        else:
            # An unquoted path contains no whitespace, so the last arrow
            # is the separator.
            entry = entry.rpartition(" -> ")[2]
    return _unquote_git_path(entry)


def extract_status_paths(lines: list[str]) -> list[str]:
    """Extract the changed paths from ``git status --short`` output lines.

    Each line is expected to be a two-character status code, one space, then
    the path. For rename/copy entries the destination path is returned, and
    paths that git wrapped in double quotes are unquoted.

    Args:
        lines: Status lines with leading whitespace preserved.

    Returns:
        Sorted, de-duplicated paths. Lines of three characters or fewer are
        ignored.
    """
    return sorted({_status_entry_path(line) for line in lines if len(line) > 3})


def collect_paths(repo: Path, args: argparse.Namespace) -> list[str]:
    """Return the changed paths for the inspection mode selected on the CLI.

    The first option that is set wins, in this order: ``--staged``,
    ``--base-ref``, ``--rev-range``; with none of them set, the working tree
    status is used.

    Args:
        repo: Repository directory handed to git.
        args: Parsed options; reads ``staged``, ``base_ref`` and ``rev_range``.

    Returns:
        Changed paths as reported by git.

    Raises:
        RuntimeError: Propagated from run_git when a git command fails.
    """
    # Every diff-based mode reports only added/copied/modified/renamed paths.
    diff_filter = "--diff-filter=ACMR"

    if args.staged:
        return run_git(repo, "diff", "--cached", "--name-only", diff_filter)
    if args.base_ref:
        return run_git(repo, "diff", "--name-only", diff_filter, f"{args.base_ref}...HEAD")
    if args.rev_range:
        if ".." in args.rev_range:
            return run_git(repo, "diff", "--name-only", diff_filter, args.rev_range)
        # Single commit: apply the same filter as the other diff modes, and
        # pass --root so a parentless (initial) commit lists its files instead
        # of producing no output.
        return run_git(
            repo,
            "diff-tree",
            "--no-commit-id",
            "--name-only",
            "-r",
            "--root",
            diff_filter,
            args.rev_range,
        )

    changed = run_git(repo, "status", "--short")
    return extract_status_paths(changed)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the doc/code sync checker.

    Returns:
        Namespace with ``repo`` (defaults to ``"."``), the boolean switches
        ``strict`` and ``staged``, and the optional values ``base_ref`` and
        ``rev_range``.
    """
    cli = argparse.ArgumentParser(description="Check whether doc changes track code changes.")
    cli.add_argument("repo", nargs="?", default=".", help="git repository path")

    # Boolean switches.
    switches = (
        ("--strict", "fail on blocking drift"),
        ("--staged", "inspect staged files only"),
    )
    for flag, text in switches:
        cli.add_argument(flag, action="store_true", help=text)

    # Options that take a value.
    valued = (
        ("--base-ref", "compare changes from base ref to HEAD"),
        ("--rev-range", "inspect a git revision range or a single commit"),
    )
    for flag, text in valued:
        cli.add_argument(flag, help=text)

    return cli.parse_args()


def main() -> int:
    """Run the doc/code sync check and print a human-readable report.

    Returns:
        ``0`` when nothing blocks, ``1`` when blocking drift is detected
        (only possible with ``--strict``), and ``2`` when the target is not a
        git repository or a git command fails.
    """
    args = parse_args()
    repo = Path(args.repo).expanduser().resolve()

    if not (repo / ".git").exists():
        print(f"Not a git repository: {repo}")
        return 2

    try:
        paths = sorted(set(collect_paths(repo, args)))
    except RuntimeError as error:
        # run_git raises RuntimeError when git fails. Left uncaught it would
        # end the process with a traceback and exit status 1, which is the
        # status reserved for "blocking drift".
        print(f"Git command failed: {error}")
        return 2

    # Classify each path exactly once and bucket it by category.
    groups: dict[str, list[str]] = {
        "docs": [],
        "code": [],
        "tests": [],
        "config": [],
        "other": [],
    }
    for path in paths:
        groups[classify(path)].append(path)

    docs = groups["docs"]
    code = groups["code"]
    tests = groups["tests"]
    config = groups["config"]
    other = groups["other"]
    assessment = assess_changes(docs, code, tests, config, other, args.strict)

    print(f"Repository: {repo}")
    print(f"Changed files: {len(paths)}")
    print_group("Design and doc files", docs)
    print_group("Code files", code)
    print_group("Test files", tests)
    print_group("Config and infra files", config)
    print_group("Other files", other)

    print("\nAssessment:")
    if not assessment["warnings"]:
        print("  - No obvious doc/code drift detected from changed-file classification.")
    else:
        for warning in assessment["warnings"]:
            print(f"  - {warning}")

    actions: list[str] = []
    if code and not docs:
        actions.append("Review design/ or docs/ and update affected architecture, workflow, or API notes.")
    if docs:
        actions.append("Confirm each changed doc still matches the actual implementation.")
    if code:
        actions.append("Confirm changed code paths match documented workflow, schema, and runtime assumptions.")
    if other:
        actions.append("Review unclassified paths and decide whether docs or tests should be updated.")

    print("\nNext actions:")
    # Mirror print_group: show an explicit "none" instead of a bare header.
    for action in actions or ["none"]:
        print(f"  - {action}")

    if assessment["blocking"]:
        print("\nResult: blocking drift detected.")
        return 1

    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())