🎉 feat: initialize foundation docs guardrails and workspace skeleton

eust-w 2026-03-26 17:18:40 +08:00
commit f41816bbd9
43 changed files with 3258 additions and 0 deletions

17
.env.example Normal file
@@ -0,0 +1,17 @@
NODE_ENV=development
WEB_PORT=3000
API_PORT=3001
WORKER_PORT=3002
MONGO_PORT=27017
MONGO_DB=emboflow
MONGO_ROOT_USERNAME=emboflow
MONGO_ROOT_PASSWORD=emboflow
MINIO_PORT=9000
MINIO_CONSOLE_PORT=9001
MINIO_ROOT_USER=emboflow
MINIO_ROOT_PASSWORD=emboflow123
STORAGE_PROVIDER=minio

7
.githooks/commit-msg Executable file
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail
repo_root="$(git rev-parse --show-toplevel)"
cd "$repo_root"
python3 scripts/check_commit_message.py --file "$1"

7
.githooks/pre-commit Executable file
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail
repo_root="$(git rev-parse --show-toplevel)"
cd "$repo_root"
python3 scripts/check_doc_code_sync.py . --staged --strict

19
.githooks/pre-push Executable file
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
set -euo pipefail
repo_root="$(git rev-parse --show-toplevel)"
cd "$repo_root"
if git rev-parse --abbrev-ref --symbolic-full-name "@{upstream}" >/dev/null 2>&1; then
  base_ref="$(git rev-parse --abbrev-ref --symbolic-full-name "@{upstream}")"
  python3 scripts/check_doc_code_sync.py . --base-ref "$base_ref" --strict
  python3 scripts/check_commit_message.py --rev-range "$base_ref..HEAD"
elif git rev-parse HEAD~1 >/dev/null 2>&1; then
  python3 scripts/check_doc_code_sync.py . --base-ref HEAD~1 --strict
  python3 scripts/check_commit_message.py --rev-range "HEAD~1..HEAD"
else
  python3 scripts/check_doc_code_sync.py . --rev-range HEAD --strict
  python3 scripts/check_commit_message.py --rev-range HEAD
fi
python3 -m unittest discover -s tests -p 'test_*.py'
python3 -m unittest discover -s tests -p 'test_*.py'

40
.github/pull_request_template.md vendored Normal file
@@ -0,0 +1,40 @@
# Summary
- Describe the change in clear English.
- Explain the user-visible or system-level impact.
# Design Sync
- [ ] I reviewed the relevant files under `design/` before implementing.
- [ ] I updated the affected design or docs files in the same change set, or I confirmed no design update was required.
- [ ] I ran `python3 scripts/check_doc_code_sync.py . --strict`.
Design files reviewed or updated:
- ``
If design and code are not fully aligned yet, explain the gap:
-
# Validation
- [ ] I ran local checks relevant to this change.
- [ ] I ran `bash scripts/install_hooks.sh` in this clone or already had the repo hooks installed.
- [ ] My commit messages in this PR are English-only and use a gitmoji prefix.
Commands run:
```bash
# paste commands here
```
# Scope Checklist
- [ ] This PR updates behavior, contracts, or runtime assumptions intentionally.
- [ ] This PR does not silently break documented architecture or workflow assumptions.
- [ ] This PR includes tests if behavior changed, or I confirmed tests were not required.
# Notes For Reviewers
- Call out any risky areas, follow-up work, or unresolved assumptions.

45
.github/workflows/guardrails.yml vendored Normal file
@@ -0,0 +1,45 @@
name: Guardrails
on:
  pull_request:
  push:
jobs:
  repository-guardrails:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Compute git range
        id: git_range
        shell: bash
        run: |
          if [ "${GITHUB_EVENT_NAME}" = "pull_request" ]; then
            RANGE="${{ github.event.pull_request.base.sha }}..${{ github.sha }}"
          elif [ "${{ github.event.before }}" != "0000000000000000000000000000000000000000" ]; then
            RANGE="${{ github.event.before }}..${{ github.sha }}"
          else
            RANGE="${{ github.sha }}"
          fi
          echo "range=${RANGE}" >> "$GITHUB_OUTPUT"
      - name: Validate commit messages
        run: |
          python3 scripts/check_commit_message.py --rev-range "${{ steps.git_range.outputs.range }}"
      - name: Validate design and code sync
        run: |
          python3 scripts/check_doc_code_sync.py . --rev-range "${{ steps.git_range.outputs.range }}" --strict
      - name: Run repository tests
        run: |
          python3 -m unittest discover -s tests -p 'test_*.py'

93
CONTRIBUTING.md Normal file
@@ -0,0 +1,93 @@
# Contributing To EmboFlow
## Core Rule
Keep `design/` and implementation aligned in the same change set.
Do not treat design files as background notes. If a code change affects product behavior, workflow behavior, data models, contracts, runtime assumptions, permissions, or deployment assumptions, update the corresponding design documents before closing the task.
## Required Workflow
1. Read the relevant files under `design/` before implementing.
2. Summarize the intended contract you are changing.
3. Implement the code change.
4. Update the affected design files in the same work session.
5. Install the local git hooks once per clone:
```bash
bash scripts/install_hooks.sh
```
6. Use English-only commit messages with a gitmoji prefix, for example:
```text
:sparkles: add workflow guardrails and CI checks
```
7. Run the local sync check when needed:
```bash
python3 scripts/check_doc_code_sync.py . --strict
```
8. If design and code still diverge, document that explicitly in your final summary.
## When Design Updates Are Required
Update design files when a change affects:
- user-visible behavior
- workflow nodes or execution paths
- data model or storage structure
- API or schema contracts
- plugin or executor behavior
- workspace, project, or permission rules
- deployment or runtime assumptions
## When Design Updates May Be Skipped
Design updates are usually not required for:
- pure refactors with no behavior change
- test-only changes
- formatting, comments, and naming cleanup
Even in those cases, verify that no documented statement became false indirectly.
## Primary Design Locations
- `design/00-overview/`
- `design/01-product/`
- `design/02-architecture/`
- `design/03-workflows/`
- `design/05-data/`
- `design/08-decisions/`
## Local Tooling
This repository includes:
- git hook templates under `.githooks/`
- a hook installer:
```bash
bash scripts/install_hooks.sh
```
- a design/code sync checker:
```bash
python3 scripts/check_doc_code_sync.py . --strict
```
- a commit message validator:
```bash
python3 scripts/check_commit_message.py --rev-range HEAD
```
The hooks and CI enforce:
- English-only commit messages with a gitmoji prefix
- design/code consistency checks
- repository unit tests before push

8
apps/api/package.json Normal file
@@ -0,0 +1,8 @@
{
  "name": "@emboflow/api",
  "private": true,
  "version": "0.1.0",
  "scripts": {
    "dev": "echo 'api app scaffold pending'"
  }
}

8
apps/web/package.json Normal file
@@ -0,0 +1,8 @@
{
  "name": "@emboflow/web",
  "private": true,
  "version": "0.1.0",
  "scripts": {
    "dev": "echo 'web app scaffold pending'"
  }
}

8
apps/worker/package.json Normal file
@@ -0,0 +1,8 @@
{
  "name": "@emboflow/worker",
  "private": true,
  "version": "0.1.0",
  "scripts": {
    "dev": "echo 'worker app scaffold pending'"
  }
}

@@ -0,0 +1,70 @@
# EmboFlow Platform Overview
## Positioning
EmboFlow is a browser-based embodied data engineering platform for ingesting raw assets, organizing dataset workflows on a visual canvas, processing and converting data, annotating and inspecting results, exporting normalized artifacts, and generating downstream training configurations.
The platform is designed around plugin-based extensibility, but the first version should deliver a stable built-in core before opening broader extension surfaces.
## Primary Users
- Individual engineers building embodied datasets
- Team operators managing collection, preprocessing, delivery, and annotation workflows
- Data engineering teams that need repeatable conversion and packaging pipelines
- Teams preparing datasets for external training systems
## V1 Product Goal
Build a usable end-to-end platform that allows users to:
1. Log into a personal or team workspace
2. Create a project
3. Upload or import raw embodied data assets
4. Auto-detect asset structure and generate preview summaries
5. Compose processing pipelines on a canvas
6. Configure node parameters and inject code into processing nodes
7. Execute workflows asynchronously and inspect logs and outputs
8. Export normalized delivery packages, training datasets, or training config files
## Supported Input Formats in V1
- RLDS
- LeRobot v2/v3
- HDF5
- Rosbag
- Raw video folders and delivery-style directory packages
- Compressed archives containing the above
## Core Product Principles
- Raw assets are first-class objects
- Canonical semantic datasets are derived, not assumed
- Visualization can operate directly on raw assets
- Workflow execution is asynchronous and traceable
- Plugins are versioned and managed
- User-injected code is supported with strict runtime boundaries
- Training execution is out of scope for V1, but training handoff is in scope
## Major Workspaces
- Asset Workspace: upload, import, scan, probe, browse
- Canvas Workspace: build and run workflows
- Explore Workspace: inspect raw assets and processed outputs
- Label Workspace: create and review annotation tasks
- Admin Workspace: users, workspaces, plugins, storage, runtime settings
## V1 Output Types
- Standardized embodied dataset exports
- Customer delivery packages
- Validation and quality reports
- Annotation artifacts
- Training configuration packages for downstream training systems
## Non-Goals for V1
- Built-in training execution orchestration
- Real-time collaborative editing on the same canvas
- Public plugin marketplace
- Fully generalized MLOps lifecycle management
- Advanced distributed scheduling in the first deployment

@@ -0,0 +1,90 @@
# EmboFlow V1 Scope And MVP
## MVP Definition
The first release should prove that EmboFlow can turn raw embodied data assets into structured outputs through a visual workflow engine.
### MVP Success Path
1. A user signs into a workspace
2. The user creates a project
3. The user uploads or imports a raw asset
4. The platform probes the asset and generates a structure summary
5. The user previews the asset
6. The user composes a canvas workflow
7. The workflow executes asynchronously
8. The user reviews logs, outputs, and generated artifacts
9. The user exports a normalized dataset, delivery package, or training config
## In Scope For V1
- User login and workspace model
- Personal and team workspaces
- Project resource isolation
- Raw asset upload and import
- Object storage integration
- Asset probing and structure detection
- Raw asset preview
- Canvas workflow editor
- Built-in node library for ingest, transform, inspect, export
- Node configuration through schema-driven forms
- Code injection for processing nodes
- Workflow run orchestration
- Logs, status, retries, and artifact tracking
- Dataset conversion and delivery-package normalization
- Training config export
- Plugin registration skeleton
## Important Business Scenarios
### Embodied Dataset Conversion
- Import RLDS, LeRobot, HDF5, or Rosbag
- Map to canonical semantics
- Export to target dataset format
### Delivery Package Normalization
- Import customer-provided raw directory or archive
- Rename top-level folders
- Validate required file structure
- Validate metadata files
- Check video file quality and naming
- Export or upload normalized package
### Data Processing Workflow Authoring
- Drag nodes onto canvas
- Connect nodes into DAG
- Tune parameters
- Inject code into processing nodes
- Re-run pipeline with traceable history
## V1 Modules To Build Deeply
- Identity and workspace management
- Asset ingestion and probing
- Workflow editor and node model
- Execution engine
- Built-in dataset conversion nodes
- Built-in delivery normalization nodes
- Preview and inspection
- Artifact export
## V1 Modules To Keep Lightweight
- Annotation
- Collaboration
- Plugin lifecycle UX
- Advanced analytics
- Kubernetes and Volcano scheduling adapters
- Advanced multi-sensor synchronized visual analytics
## Explicit V1 Exclusions
- Platform-managed training execution
- Real-time multi-user canvas co-editing
- Full marketplace for third-party plugins
- Complex enterprise approval workflows
- Streaming data processing
- Large-scale distributed execution as a deployment requirement

@@ -0,0 +1,115 @@
# EmboFlow Deployment Architecture
## V1 Deployment Target
The first deployment target is a single public server. The platform should be deployed in a way that is operationally simple now and migration-friendly later.
## Recommended V1 Deployment Topology
- Reverse proxy
- Web frontend service
- API service
- Worker service
- MongoDB
- Optional MinIO
- Host Docker runtime for execution containers
## Deployment Principles
- Single-host deployment first
- All major services containerized
- Persistent state mounted on host volumes
- Object storage can be external or self-hosted
- Execution workers separated from API service
- Future scheduler migration should not require domain model changes
## Recommended Runtime Layout
### Edge
- Nginx or equivalent reverse proxy
- HTTPS termination
- Static web delivery or web upstream routing
### Application
- `web`
- `api`
- `worker`
### Data
- `mongo`
- `minio` optional
## Object Storage Strategy
The product should support both:
- Cloud object storage such as BOS or S3-compatible services
- Self-hosted MinIO for development, demos, or private deployment
The application should expose a unified storage abstraction instead of embedding provider-specific logic across modules.
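As a sketch, that abstraction could be a small provider interface behind which MinIO, BOS, or S3-compatible clients sit. Everything below is illustrative; the method names are assumptions, not the platform's actual API:

```python
from typing import Protocol


class StorageProvider(Protocol):
    """Provider-agnostic object storage interface (names are assumptions)."""

    def put_object(self, key: str, data: bytes) -> None: ...
    def get_object(self, key: str) -> bytes: ...
    def presign_url(self, key: str, expires_seconds: int = 3600) -> str: ...


class InMemoryStorage:
    """Toy provider showing the interface shape; a real provider would wrap
    a MinIO client or an S3/BOS SDK behind the same methods."""

    def __init__(self) -> None:
        self._objects: dict[str, bytes] = {}

    def put_object(self, key: str, data: bytes) -> None:
        self._objects[key] = data

    def get_object(self, key: str) -> bytes:
        return self._objects[key]

    def presign_url(self, key: str, expires_seconds: int = 3600) -> str:
        return f"memory://{key}?expires={expires_seconds}"
```

Application modules depend only on `StorageProvider`, so switching between cloud storage and self-hosted MinIO becomes a configuration choice rather than a code change.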
## Local Scheduler In V1
V1 should use a local scheduler. Worker processes execute tasks on the same deployment host.
Design constraints:
- RuntimeSpec must already exist
- Scheduler abstraction must already exist
- Docker executor must already be scheduler-compatible
This keeps future migration to Kubernetes or Volcano feasible.
## Host-Level Persistent Directories
Recommended host directories:
- application config
- mongodb data
- minio data
- uploaded file staging
- execution temp workspace
- logs
- backup data
## Execution Isolation
The host Docker runtime serves two different purposes:
- Running the platform deployment stack
- Running task execution containers
These must be treated as separate concerns in configuration and security design.
## Future Migration Path
### Stage 1
- Single-host deployment
- Local scheduler
- Docker executor
### Stage 2
- Kubernetes-based service deployment
- Kubernetes scheduler adapter for workflow tasks
### Stage 3
- Volcano scheduler adapter
- Better support for large batch jobs and training-adjacent workloads
## Operational Baseline
V1 should provide basic operational support for:
- health checks
- service restart
- execution failure visibility
- disk space monitoring
- object storage connectivity checks
- MongoDB backup and restore procedures
- worker online status

@@ -0,0 +1,200 @@
# EmboFlow System Architecture
## Architecture Style
EmboFlow V1 is a browser/server platform built as:
- Web frontend
- Modular backend control plane
- Independent worker runtime
- MongoDB as the only database
- Object storage abstraction over cloud object storage or MinIO
- Local scheduler in V1 with future migration path to Kubernetes and Volcano
The architecture should preserve clear service boundaries even if V1 is implemented as a modular monolith plus workers.
## High-Level Layers
### Frontend Layer
- Asset workspace
- Canvas workspace
- Explore workspace
- Label workspace
- Admin workspace
### Control Plane
- Identity and authorization
- Workspace and project management
- Asset and dataset metadata
- Workflow definition management
- Plugin registry and activation
- Run orchestration API
- Artifact indexing
### Execution Plane
- Workflow DAG compilation
- Task queue dispatch
- Worker execution
- Executor routing
- Log and artifact collection
### Storage Layer
- MongoDB for metadata and run state
- Object storage for files and large outputs
- Temporary local working directories for execution
## Core Domain Objects
- User
- Workspace
- Project
- Asset
- Dataset
- DatasetVersion
- WorkflowDefinition
- WorkflowVersion
- WorkflowRun
- RunTask
- Artifact
- AnnotationTask
- Annotation
- Plugin
- StorageConnection
## Raw Asset And Canonical Dataset Model
The platform must distinguish between:
- Raw Asset View
- Canonical Dataset View
Raw assets preserve source structure, file paths, metadata layout, and original naming. Canonical datasets provide a normalized semantic layer for workflow nodes and export logic.
Visualization may read raw assets directly. Conversion, orchestration, and export should primarily target canonical semantics.
## Workflow Model
Workflow definitions are versioned and contain:
- Visual graph state
- Logical node and edge graph
- Runtime configuration
- Plugin references
Workflow execution produces immutable workflow runs. A run snapshots:
- Workflow version
- Node configuration
- Injected code
- Executor settings
- Input bindings
Runs compile into task DAGs.
## Node And Plugin Model
### Node Categories
- Source
- Transform
- Inspect
- Annotate
- Export
- Utility
### Node Definition Contract
Each node definition includes:
- Metadata
- Input schema
- Output schema
- Config schema
- UI schema
- Executor type
- Runtime limits
- Optional code hook contract
### Plugin Types
- Node plugins
- Reader/writer plugins
- Renderer plugins
- Executor plugins
- Integration plugins
## Execution Architecture
### Executors
- Python executor
- Docker executor
- HTTP executor
V1 should prioritize the Python and Docker executors. The HTTP executor is useful for integrating external services.
### Schedulers
- Local scheduler in V1
- Kubernetes scheduler later
- Volcano scheduler later
Executors and schedulers are separate abstractions:
- Executor defines how logic runs
- Scheduler defines where and under what scheduling policy it runs
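The separation can be sketched as a toy in-process pairing. This is illustrative only: real dispatch is queue-based, and the names here are assumptions:

```python
def python_executor(task: dict) -> dict:
    """An executor decides *how* logic runs; here, directly in-process Python."""
    return {"taskId": task["id"], "status": "success"}


class LocalScheduler:
    """A scheduler decides *where and under what policy* tasks run. This toy
    local scheduler runs each submitted task immediately on the current host,
    which is the V1 posture; a Kubernetes or Volcano adapter would implement
    the same submit() surface without touching executor logic."""

    def __init__(self, executor):
        self.executor = executor

    def submit(self, task: dict) -> dict:
        return self.executor(task)


result = LocalScheduler(python_executor).submit({"id": "t1"})
```

Because the two abstractions only meet at `submit()`, swapping the scheduler later should not force changes to how any executor runs its logic.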
## Storage Architecture
### MongoDB Collections
Recommended primary collections:
- users
- workspaces
- projects
- memberships
- assets
- asset_probe_reports
- datasets
- dataset_versions
- workflow_definitions
- workflow_definition_versions
- workflow_runs
- run_tasks
- artifacts
- annotation_tasks
- annotations
- plugins
- storage_connections
- audit_logs
### Object Storage Content
- Raw uploads
- Imported archives
- Normalized export packages
- Training config packages
- Preview resources
- Logs and attachments
- Large manifests and file indexes
## Security Model
User-injected code is low-trust code and must not run in web or API processes.
V1 runtime policy:
- Built-in trusted nodes may use Python executor
- Plugin code should run in controlled runtimes
- User-injected code should default to Docker executor
- Network access should be denied by default for user code
- Input and output paths should be explicitly mounted
## Deployment Direction
V1 deployment target is a single public server using containerized application services. The architecture must still preserve future migration to multi-node environments.

@@ -0,0 +1,316 @@
# EmboFlow Workflow Execution Model
## Goal
Define how EmboFlow represents, validates, executes, and observes canvas workflows.
The workflow system is the product core. The canvas is only the editing surface. The real system of record is the versioned workflow definition and its immutable run snapshots.
## Core Objects
- `WorkflowDefinition`
Logical workflow identity under a project
- `WorkflowVersion`
Immutable snapshot of nodes, edges, runtime defaults, and plugin references
- `NodeInstance`
Concrete node on a workflow graph
- `WorkflowRun`
One execution of one workflow version
- `RunTask`
Executable unit derived from a node during one run
- `Artifact`
Managed output from a task or run
## Workflow Layers
Each workflow version contains three layers.
### Visual Layer
Used only by the editor:
- node positions
- collapsed state
- groups
- zoom defaults
- viewport metadata
### Logic Layer
Used for graph semantics:
- nodes
- edges
- input/output ports
- branch conditions
- merge semantics
- dependency graph
### Runtime Layer
Used for execution:
- node config values
- executor settings
- runtime resource limits
- retry policy
- code hooks
- cache policy
Visual changes must not change workflow semantics. Runtime changes must produce a new workflow version.
## Node Categories
V1 node categories:
- `Source`
- `Transform`
- `Inspect`
- `Annotate`
- `Export`
- `Utility`
### V1 Built-In Node Families
- asset upload/import
- archive extract
- folder rename
- directory validation
- metadata validation
- video quality inspection
- dataset readers for RLDS, LeRobot, HDF5, Rosbag
- canonical mapping nodes
- dataset writers and exporters
- training config export
- Python processing node
## Node Definition Contract
Each node definition must expose:
- `id`
- `name`
- `category`
- `version`
- `description`
- `inputSchema`
- `outputSchema`
- `configSchema`
- `uiSchema`
- `executorType`
- `runtimeDefaults`
- `permissions`
- `capabilities`
- `codeHookSpec`
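For illustration, a built-in node registration satisfying this contract might look like the following. All field values are assumptions for a hypothetical folder-rename node, not actual registry entries:

```python
# Hypothetical registry entry shaped by the node definition contract above.
FOLDER_RENAME_NODE = {
    "id": "builtin.folder_rename",
    "name": "Folder Rename",
    "category": "Transform",
    "version": "1.0.0",
    "description": "Rename top-level folders in a delivery package.",
    "inputSchema": {"type": "object", "properties": {"input": {"type": "string"}}},
    "outputSchema": {"type": "object", "properties": {"output": {"type": "string"}}},
    "configSchema": {"type": "object", "properties": {"mapping": {"type": "object"}}},
    "uiSchema": {"mapping": {"widget": "key_value_editor"}},
    "executorType": "python",
    "runtimeDefaults": {"timeoutSeconds": 600, "maxRetries": 1},
    "permissions": [],
    "capabilities": ["cacheable"],
    "codeHookSpec": None,  # built-in node: no user code hook exposed
}
```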
### Code Hook Spec
V1 supports user code hooks only on:
- `Transform`
- `Inspect`
- `Utility`
Hooks must use a constrained entrypoint instead of arbitrary script structure.
Example:
```python
def process(input_data, context):
return input_data
```
This keeps serialization, logging, and runtime control predictable.
## Data Flow Contract
Tasks should exchange managed references, not loose file paths.
V1 reference types:
- `assetRef`
- `datasetVersionRef`
- `artifactRef`
- `annotationTaskRef`
- `inlineConfig`
Executors may materialize files internally, but the platform-level contract must remain reference-based.
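A sketch of how an executor shim might resolve these references into storage keys before materializing files. The reference shapes and key layout are assumptions; `inlineConfig` and `annotationTaskRef` are omitted for brevity:

```python
def resolve_reference(ref: dict) -> str:
    """Map a platform reference to a storage key an executor can materialize.
    Field names are illustrative, not the platform's actual schema."""
    kind = ref["kind"]
    if kind == "artifactRef":
        return f"artifacts/{ref['artifactId']}"
    if kind == "assetRef":
        return f"assets/{ref['assetId']}"
    if kind == "datasetVersionRef":
        return f"datasets/{ref['datasetId']}/versions/{ref['versionId']}"
    raise ValueError(f"unsupported reference kind: {kind}")
```

Keeping resolution in one place preserves the reference-based contract: nodes never pass loose file paths, so lineage and access control stay enforceable.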
## Validation Stages
Workflow execution must validate in this order:
1. workflow version exists
2. referenced plugins exist and are enabled
3. node schemas are valid
4. edge connections are schema-compatible
5. runtime configuration is complete
6. referenced assets and datasets are accessible
7. code hooks pass static validation
8. executor and scheduler requirements are satisfiable
Validation failure must block run creation.
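The ordering requirement can be sketched as a staged validator; the stage wiring is illustrative:

```python
from typing import Callable


def validate_for_run(stages: list) -> list:
    """Run (name, check) validation stages in their declared order and collect
    the names of failed stages. Any non-empty result must block run creation."""
    return [name for name, check in stages if not check()]


# Usage sketch with the first three stages stubbed out:
errors = validate_for_run([
    ("workflow_version_exists", lambda: True),
    ("plugins_enabled", lambda: False),
    ("schemas_valid", lambda: True),
])
# errors == ["plugins_enabled"], so run creation is blocked
```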
## Run Lifecycle
When a user executes a workflow:
1. resolve workflow version
2. snapshot all runtime-relevant inputs
3. resolve plugin versions
4. freeze node config and code hooks
5. compile graph into a DAG
6. create `WorkflowRun`
7. create `RunTask` entries
8. enqueue ready tasks
9. collect outputs, logs, and task state
10. finalize run status and summary
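Step 8 depends on dependency resolution over the compiled DAG. A minimal sketch, where `dag` maps each task id to its upstream task ids (shape is an assumption):

```python
def ready_tasks(dag: dict, done: set) -> list:
    """A task is ready to enqueue when it has not completed and every one of
    its upstream dependencies has completed."""
    return [t for t, deps in dag.items()
            if t not in done and all(d in done for d in deps)]


# Usage sketch on a linear delivery pipeline:
dag = {"extract": [], "rename": ["extract"], "validate": ["rename"]}
first = ready_tasks(dag, set())          # ["extract"]
second = ready_tasks(dag, {"extract"})   # ["rename"]
```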
## Run State Model
### WorkflowRun Status
- `pending`
- `queued`
- `running`
- `success`
- `failed`
- `cancelled`
- `partial_success`
### RunTask Status
- `pending`
- `queued`
- `running`
- `success`
- `failed`
- `cancelled`
- `skipped`
`partial_success` is used for workflows where non-blocking nodes fail but the run still produces valid outputs.
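These states map directly onto enums. The terminal-state grouping below is an assumption about which statuses end a run:

```python
from enum import Enum


class RunStatus(str, Enum):
    PENDING = "pending"
    QUEUED = "queued"
    RUNNING = "running"
    SUCCESS = "success"
    FAILED = "failed"
    CANCELLED = "cancelled"
    PARTIAL_SUCCESS = "partial_success"


# Assumed terminal set: once a run reaches one of these, it does not transition again.
TERMINAL_STATUSES = {
    RunStatus.SUCCESS,
    RunStatus.FAILED,
    RunStatus.CANCELLED,
    RunStatus.PARTIAL_SUCCESS,
}
```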
## Retry And Failure Policy
Each node instance may define:
- retry count
- retry backoff policy
- fail-fast behavior
- continue-on-error behavior
- manual retry eligibility
V1 should support:
- `fail_fast`
- `continue_on_error`
- `retry_n_times`
- `manual_retry`
## Cache Model
V1 should support node-level cache reuse.
Recommended cache key inputs:
- workflow version
- node id
- upstream reference summary
- config summary
- code hook digest
- plugin version
- executor version
Cache hit behavior:
- reuse output artifact refs
- reuse output summaries
- retain previous logs reference
- mark task as cache-resolved in metadata
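A deterministic cache key over the recommended inputs can be sketched as follows; field names are assumptions, and the canonical-JSON-then-hash approach is one reasonable choice:

```python
import hashlib
import json


def cache_key(workflow_version: str, node_id: str, upstream_summary: dict,
              config: dict, code_hook: str, plugin_version: str,
              executor_version: str) -> str:
    """Derive a node-level cache key from the inputs listed above.
    sort_keys canonicalizes the JSON so equal inputs always hash equally."""
    payload = json.dumps({
        "workflowVersion": workflow_version,
        "nodeId": node_id,
        "upstream": upstream_summary,
        "config": config,
        "codeHookDigest": hashlib.sha256(code_hook.encode()).hexdigest(),
        "pluginVersion": plugin_version,
        "executorVersion": executor_version,
    }, sort_keys=True)
    return hashlib.sha256(payload.encode()).hexdigest()
```

Hashing the code hook separately keeps large hook bodies out of the payload while still invalidating the cache on any code change.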
## Execution Context
Each task receives a normalized execution context containing:
- workspace id
- project id
- workflow run id
- task id
- actor id
- node config
- code hook content
- input references
- storage context
- temp working directory
- runtime resource limits
This context must be available across Python, Docker, and HTTP executors.
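A minimal sketch of the context as an immutable structure that any executor can serialize and hand to its runtime; field names are assumptions:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class ExecutionContext:
    """Normalized per-task context. Frozen so a task cannot mutate its own
    identity or limits mid-run; field names are illustrative."""
    workspace_id: str
    project_id: str
    run_id: str
    task_id: str
    actor_id: str
    node_config: dict
    code_hook: Optional[str]
    input_refs: tuple
    storage: dict
    temp_dir: str
    resource_limits: dict
```

For the Docker executor this structure would typically be serialized to JSON and mounted or injected into the container; for the Python executor it can be passed in-process.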
## Observability Requirements
Each task must emit:
- status transitions
- start time and finish time
- duration
- executor metadata
- resource request metadata
- stdout/stderr log stream
- structured task summary
- artifact refs
The UI must allow:
- graph-level run status
- node-level log inspection
- node-level artifact browsing
- task retry entrypoint
- direct navigation from a node to preview output
## Canvas Interaction Rules
V1 editor behavior should enforce:
- port-level connection rules
- incompatible edge blocking
- dirty-state detection
- explicit save before publish/run if graph changed
- per-node validation badges
- run from latest saved version, not unsaved draft
## Example V1 Pipelines
### Delivery Normalization
```text
Raw Folder Import
-> Archive Extract
-> Folder Rename
-> Directory Validation
-> Metadata Validation
-> Video Quality Check
-> Delivery Export
```
### Dataset Conversion
```text
Rosbag Reader
-> Canonical Mapping
-> Frame Filter
-> Metadata Normalize
-> LeRobot Writer
-> Training Config Export
```
## V1 Non-Goals
The V1 workflow engine does not need:
- loop semantics
- streaming execution
- unbounded dynamic fan-out
- event-driven triggers
- advanced distributed DAG partitioning
The V1 goal is a stable, observable DAG executor for data engineering workflows.

1
design/04-ui-ux/.gitkeep Normal file
@@ -0,0 +1 @@

@@ -0,0 +1,296 @@
# EmboFlow Information Architecture And Key Screens
## Goal
Define the primary navigation model, main screens, and key interaction patterns for EmboFlow V1.
The UI should feel like a serious data workflow product, not a generic low-code canvas. The most important interaction is the relationship between assets, workflows, runs, and outputs.
## Information Architecture
Top-level product areas:
- Workspace switcher
- Project selector
- Asset Workspace
- Canvas Workspace
- Explore Workspace
- Label Workspace
- Admin Workspace
## Navigation Model
### Global Header
Recommended global header content:
- workspace switcher
- project switcher
- search entry
- run notifications
- user menu
### Primary Sidebar
Recommended primary navigation:
- Assets
- Workflows
- Runs
- Explore
- Labels
- Admin
This keeps the product model explicit:
- assets are inputs
- workflows define transformation logic
- runs represent execution history
- explore is where users inspect outputs and raw inputs
## Screen 1: Workspace And Project Entry
Purpose:
- choose personal or team workspace
- choose or create project
- view recent projects and recent workflow runs
V1 should emphasize project-level organization because all major resources are project-scoped.
## Screen 2: Asset Workspace
Purpose:
- upload or import raw assets
- inspect asset type and status
- review probe summary
- launch preview or workflow entrypoint
Core regions:
- asset list with filters
- import actions
- asset status and source type
- probe summary card
- recommended next actions
Key actions:
- upload file
- upload archive
- import object storage prefix
- register storage path
- open preview
- create workflow from asset
## Screen 3: Asset Detail / Explore Entry
Purpose:
- inspect one asset deeply
- browse folder structure
- inspect metadata and detected format
- preview representative files
Suggested panels:
- left: file tree or asset structure
- center: preview surface
- right: metadata, probe report, warnings, recommended nodes
This screen should support both:
- raw asset view
- canonical dataset summary view when available
## Screen 4: Canvas Workspace
This is the core authoring surface.
### Layout
Recommended layout, aligned with the Xspark reference pattern:
- left: node library and workflow tools
- center: canvas
- right: node configuration panel
### Left Panel
Contains:
- source nodes
- transform nodes
- inspect nodes
- annotate nodes
- export nodes
- utility nodes
- search/filter
### Center Canvas
Supports:
- drag-and-drop node placement
- edge creation
- zoom and pan
- mini-map
- node badges for validation status
- run-state overlays when viewing an executed version
### Right Configuration Panel
The right panel is schema-driven.
It should render:
- node title
- node description
- config fields
- input/output schema summary
- executor selection
- runtime policy
- code hook editor if supported
- validation errors
This panel is critical. It should feel like a structured system console, not a generic form dump.
## Screen 5: Workflow Run Detail
Purpose:
- inspect execution state
- view DAG progress
- open task logs
- inspect task outputs
- retry failed nodes
Recommended layout:
- top: run summary and status
- center: workflow graph with execution overlays
- bottom or side drawer: logs and artifacts for selected node
## Screen 6: Explore Workspace
Purpose:
- inspect raw or processed outputs outside the canvas authoring context
- compare source and transformed outputs
- validate whether a run produced expected results
V1 renderer set:
- directory tree renderer
- JSON renderer
- video renderer
- dataset summary renderer
- quality report renderer
This workspace should open from:
- asset detail
- workflow node output
- artifact detail
## Screen 7: Label Workspace
Purpose:
- process annotation tasks
- review results
- attach annotations to data outputs
V1 should keep this lightweight:
- frame labels
- clip labels
- temporal segment labels
- quality tags
The label workspace should be able to open from an artifact or dataset version, not only from a workflow node.
## Screen 8: Admin Workspace
Purpose:
- manage members
- manage storage connections
- manage plugin enablement
- inspect audit and runtime settings
Suggested sections:
- members and roles
- workspace settings
- storage connections
- plugin registry
- executor policies
- audit log viewer
## Key UX Principles
### 1. Separate authoring from inspection
Do not overload the canvas with deep preview or annotation workflows. The canvas configures process. Explore and Label workspaces handle dense interaction.
### 2. Keep lineage visible
Users should be able to move across:
- asset
- workflow
- run
- task
- artifact
- annotation
without losing context.
### 3. Prefer explicit system terminology
Use consistent object names in the UI:
- Asset
- Dataset
- Workflow
- Run
- Task
- Artifact
- Plugin
Do not rename the same concept differently across pages.
### 4. Make validation obvious before execution
Before users run a workflow, the editor should visibly show:
- missing config
- invalid schema connections
- unsupported executor choices
- permission or plugin issues
### 5. Keep the product usable on standard screens
The canvas and right configuration panel must work on laptop-sized displays. On narrower screens, the right panel may collapse into a drawer.
## V1 Visual Direction
The UI should communicate:
- precision
- observability
- traceability
- strong operator control
It should feel closer to a workflow control console than a consumer productivity app.
## V1 Non-Goals
V1 UI does not need:
- real-time multi-user cursor collaboration
- advanced canvas commenting systems
- highly customized renderer marketplace UX
- heavy design polish ahead of workflow clarity

1
design/05-data/.gitkeep Normal file
@@ -0,0 +1 @@

@@ -0,0 +1,521 @@
# EmboFlow MongoDB Data Model
## Goal
Define the MongoDB-only persistence model for EmboFlow V1.
The database must support:
- user and workspace isolation
- raw asset tracking
- canonical dataset versions
- workflow versioning
- workflow execution history
- plugin registration
- auditability
## Storage Principles
- MongoDB stores metadata and execution state
- Object storage stores large binary files and large derived bundles
- MongoDB documents should have clear aggregate boundaries
- Large, fast-growing arrays should be split into separate collections
- Platform contracts should use references, not embedded file blobs
## Primary Collections
- `users`
- `workspaces`
- `projects`
- `memberships`
- `assets`
- `asset_probe_reports`
- `datasets`
- `dataset_versions`
- `workflow_definitions`
- `workflow_definition_versions`
- `workflow_runs`
- `run_tasks`
- `artifacts`
- `annotation_tasks`
- `annotations`
- `plugins`
- `storage_connections`
- `audit_logs`
## Collection Design
### users
Purpose:
- account identity
- profile
- login metadata
Core fields:
- `_id`
- `email`
- `displayName`
- `avatarUrl`
- `status`
- `lastLoginAt`
- `createdAt`
- `updatedAt`
### workspaces
Purpose:
- resource ownership boundary
Core fields:
- `_id`
- `type` as `personal` or `team`
- `name`
- `slug`
- `ownerId`
- `status`
- `settings`
- `createdAt`
- `updatedAt`
### memberships
Purpose:
- workspace and project role mapping
Core fields:
- `_id`
- `workspaceId`
- `projectId` optional
- `userId`
- `role`
- `status`
- `createdAt`
- `updatedAt`
This collection should stay independent instead of embedding large member arrays on every resource.
### projects
Purpose:
- project-scoped grouping for assets, workflows, runs, and outputs
Core fields:
- `_id`
- `workspaceId`
- `name`
- `slug`
- `description`
- `status`
- `createdBy`
- `createdAt`
- `updatedAt`
### assets
Purpose:
- represent raw uploaded or imported inputs
Supported asset types:
- `raw_file`
- `archive`
- `folder`
- `video_collection`
- `standard_dataset`
- `rosbag`
- `hdf5_dataset`
- `object_storage_prefix`
Core fields:
- `_id`
- `workspaceId`
- `projectId`
- `type`
- `sourceType`
- `displayName`
- `status`
- `storageRef`
- `sizeBytes`
- `fileCount`
- `topLevelPaths`
- `detectedFormats`
- `summary`
- `createdBy`
- `createdAt`
- `updatedAt`
Do not embed full large file listings in this document.
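As a sketch of what this looks like in practice (field names follow the list above; the nested `storageRef` shape and sample values are assumptions), an `assets` document keeps references and a bounded path sample rather than a full listing:

```typescript
// Illustrative asset document shape; nested storageRef and sample values are assumed.
type AssetDoc = {
  _id: string;
  workspaceId: string;
  projectId: string;
  type: string;
  displayName: string;
  status: string;
  storageRef: { bucket: string; key: string };
  sizeBytes: number;
  fileCount: number;
  // Bounded sample only; the full listing lives in object storage or a probe report.
  topLevelPaths: string[];
  createdAt: string;
};

const example: AssetDoc = {
  _id: "asset_01",
  workspaceId: "ws_01",
  projectId: "proj_01",
  type: "archive",
  displayName: "delivery-batch-03.tar.gz",
  status: "ready",
  storageRef: { bucket: "emboflow-raw", key: "ws_01/proj_01/asset_01/delivery-batch-03.tar.gz" },
  sizeBytes: 7_340_032,
  fileCount: 412,
  topLevelPaths: ["episodes/", "meta.json"],
  createdAt: new Date(0).toISOString(),
};
```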
### asset_probe_reports
Purpose:
- retain richer structure-detection and validation output
Core fields:
- `_id`
- `assetId`
- `reportVersion`
- `detectedFormatCandidates`
- `structureSummary`
- `warnings`
- `recommendedNextNodes`
- `rawReport`
- `createdAt`
### datasets
Purpose:
- represent logical dataset identity
Core fields:
- `_id`
- `workspaceId`
- `projectId`
- `name`
- `type`
- `status`
- `latestVersionId`
- `summary`
- `createdBy`
- `createdAt`
- `updatedAt`
### dataset_versions
Purpose:
- represent immutable dataset snapshots
Core fields:
- `_id`
- `datasetId`
- `workspaceId`
- `projectId`
- `sourceAssetId`
- `parentVersionId`
- `versionTag`
- `canonicalSchemaVersion`
- `manifestRef`
- `stats`
- `summary`
- `status`
- `createdBy`
- `createdAt`
This collection is separated because versions will grow over time.
### workflow_definitions
Purpose:
- represent logical workflow identity
Core fields:
- `_id`
- `workspaceId`
- `projectId`
- `name`
- `slug`
- `status`
- `latestVersionNumber`
- `publishedVersionNumber`
- `createdBy`
- `createdAt`
- `updatedAt`
### workflow_definition_versions
Purpose:
- represent immutable workflow snapshots
Core fields:
- `_id`
- `workflowDefinitionId`
- `workspaceId`
- `projectId`
- `versionNumber`
- `visualGraph`
- `logicGraph`
- `runtimeGraph`
- `pluginRefs`
- `summary`
- `createdBy`
- `createdAt`
Splitting versions from workflow head metadata avoids oversized documents and simplifies history queries.
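A minimal sketch of the head/version split (field names come from the lists above; the append logic is an assumption about how the service might behave, not the final implementation):

```typescript
// Sketch: immutable version snapshots appended alongside a small mutable head record.
type WorkflowDefinition = {
  _id: string;
  name: string;
  latestVersionNumber: number;
  publishedVersionNumber: number | null;
};

type WorkflowDefinitionVersion = {
  _id: string;
  workflowDefinitionId: string;
  versionNumber: number;
  logicGraph: unknown;
};

function appendVersion(
  head: WorkflowDefinition,
  versions: WorkflowDefinitionVersion[],
  logicGraph: unknown,
): WorkflowDefinitionVersion {
  const versionNumber = head.latestVersionNumber + 1;
  const version: WorkflowDefinitionVersion = {
    _id: `${head._id}_v${versionNumber}`,
    workflowDefinitionId: head._id,
    versionNumber,
    logicGraph,
  };
  versions.push(version); // the versions collection grows without bound
  head.latestVersionNumber = versionNumber; // only the small head document mutates
  return version;
}
```

History queries then scan only `workflow_definition_versions`, and the head stays a constant-size document.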
### workflow_runs
Purpose:
- store execution runs
Core fields:
- `_id`
- `workflowDefinitionId`
- `workflowVersionId`
- `workspaceId`
- `projectId`
- `triggeredBy`
- `status`
- `runtimeSnapshot`
- `summary`
- `startedAt`
- `finishedAt`
- `createdAt`
### run_tasks
Purpose:
- store one execution unit per node per run
Core fields:
- `_id`
- `workflowRunId`
- `workflowVersionId`
- `nodeId`
- `nodeType`
- `status`
- `attempt`
- `executor`
- `scheduler`
- `inputRefs`
- `outputRefs`
- `logRef`
- `cacheKey`
- `cacheHit`
- `errorSummary`
- `startedAt`
- `finishedAt`
- `createdAt`
This collection should remain separate from `workflow_runs` because task volume grows quickly.
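One way the `cacheKey` field could be derived is a stable hash over node identity, normalized config, and input references. This is a sketch; the exact hash inputs are an assumption, not a platform contract:

```typescript
import { createHash } from "node:crypto";

// Sketch: a deterministic cache key from node type, config, and input refs.
function computeCacheKey(
  nodeType: string,
  config: Record<string, unknown>,
  inputRefs: string[],
): string {
  const payload = JSON.stringify({
    nodeType,
    // Sort keys so semantically equal configs hash identically.
    config: Object.fromEntries(Object.entries(config).sort(([a], [b]) => a.localeCompare(b))),
    inputRefs: [...inputRefs].sort(),
  });
  return createHash("sha256").update(payload).digest("hex");
}
```

`cacheHit` would then be set when a prior successful task with the same key exists.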
### artifacts
Purpose:
- store managed outputs and previews
Artifact types may include:
- preview bundle
- quality report
- normalized dataset package
- delivery package
- training config package
- intermediate task output
Core fields:
- `_id`
- `workspaceId`
- `projectId`
- `type`
- `producerType`
- `producerId`
- `storageRef`
- `previewable`
- `summary`
- `lineage`
- `createdBy`
- `createdAt`
### annotation_tasks
Purpose:
- track assignment and state of manual labeling work
Core fields:
- `_id`
- `workspaceId`
- `projectId`
- `targetType`
- `targetRef`
- `labelType`
- `status`
- `assigneeIds`
- `reviewerIds`
- `createdBy`
- `createdAt`
- `updatedAt`
### annotations
Purpose:
- persist annotation outputs
Core fields:
- `_id`
- `annotationTaskId`
- `workspaceId`
- `projectId`
- `targetRef`
- `payload`
- `status`
- `createdBy`
- `createdAt`
- `updatedAt`
### plugins
Purpose:
- track installable and enabled plugin versions
Core fields:
- `_id`
- `workspaceId` optional for workspace-scoped plugins
- `scope` as `platform` or `workspace`
- `name`
- `status`
- `currentVersion`
- `versions`
- `permissions`
- `metadata`
- `createdAt`
- `updatedAt`
If plugin version payloads become large, split versions into a separate collection later. V1 can keep them nested if bounded.
### storage_connections
Purpose:
- store object storage and path registration configuration
Core fields:
- `_id`
- `workspaceId`
- `type`
- `provider`
- `name`
- `status`
- `config`
- `secretRef`
- `createdBy`
- `createdAt`
- `updatedAt`
Where possible, keep secrets in an external secret store and reference them through `secretRef`, rather than storing plaintext credentials in the `config` field.
### audit_logs
Purpose:
- append-only history of sensitive actions
Core fields:
- `_id`
- `workspaceId`
- `projectId`
- `actorId`
- `resourceType`
- `resourceId`
- `action`
- `beforeSummary`
- `afterSummary`
- `metadata`
- `createdAt`
## Reference Strategy
Use stable ids between collections.
References should be explicit:
- asset to probe report
- dataset to dataset versions
- workflow definition to workflow versions
- workflow run to run tasks
- task to artifact
- annotation task to annotations
Do not depend on implicit path-based linkage.
## Index Recommendations
### Always index
- `workspaceId`
- `projectId`
- `status`
- `createdAt`
### Important compound indexes
- `memberships.workspaceId + memberships.userId`
- `projects.workspaceId + projects.slug`
- `assets.projectId + assets.type + assets.createdAt`
- `datasets.projectId + datasets.name`
- `dataset_versions.datasetId + dataset_versions.createdAt`
- `workflow_definitions.projectId + workflow_definitions.slug`
- `workflow_definition_versions.workflowDefinitionId + versionNumber`
- `workflow_runs.projectId + createdAt`
- `workflow_runs.workflowDefinitionId + status`
- `run_tasks.workflowRunId + nodeId`
- `artifacts.producerType + producerId`
- `annotation_tasks.projectId + status`
- `audit_logs.workspaceId + createdAt`
## Object Storage References
MongoDB should store references such as:
- bucket
- key
- uri
- checksum
- content type
- size
It should not store:
- large binary file payloads
- full raw video content
- giant archive contents
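A sketch of the reference shape and a guard against embedded payloads (the field names follow the list above; the size cap is an arbitrary illustrative number, not a platform limit):

```typescript
// Illustrative storage reference; MongoDB keeps pointers, object storage keeps bytes.
type StorageRef = {
  bucket: string;
  key: string;
  uri: string;
  checksum: string;
  contentType: string;
  sizeBytes: number;
};

// Guard: reject documents that try to smuggle binary payloads into metadata fields.
function assertReferenceOnly(doc: Record<string, unknown>): void {
  for (const [field, value] of Object.entries(doc)) {
    if (typeof value === "string" && value.length > 64 * 1024) {
      throw new Error(`Field "${field}" looks like an embedded payload, not a reference`);
    }
  }
}
```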
## V1 Constraints
- MongoDB is the only database
- No relational sidecar is assumed
- No GridFS-first strategy is assumed
- Large manifests may live in object storage and be referenced from MongoDB
## V1 Non-Goals
The V1 model does not need:
- cross-region data distribution
- advanced event sourcing
- fully normalized analytics warehouse modeling
- high-volume search indexing inside MongoDB itself

1
design/06-api/.gitkeep Normal file
@ -0,0 +1 @@

@ -0,0 +1,45 @@
# ADR-0001: Separate Raw Assets From Canonical Datasets
## Status
Accepted
## Context
EmboFlow must support both structured embodied dataset formats and unstructured or semi-structured delivery-style raw assets, including:
- RLDS
- LeRobot v2/v3
- HDF5
- Rosbag
- Raw video directories
- Archive packages
If the platform treats every input as an already-standardized dataset, ingestion and delivery workflows become awkward and lossy.
## Decision
The platform will model:
- Raw assets as first-class resources
- Canonical datasets as derived semantic resources
Raw assets preserve original structure, paths, naming, and metadata layout. Canonical datasets provide normalized semantics for conversion, workflow execution, and export logic.
## Consequences
### Positive
- Supports customer delivery package workflows
- Supports embodied dataset conversion workflows
- Preserves original structure for inspection and debugging
- Avoids forcing visualization to depend on a lossy normalized format
### Negative
- Adds one more layer to the object model
- Requires readers and mappers instead of direct format-to-format conversion
## Notes
Visualization may operate on raw assets directly. Processing and export should primarily operate on canonical semantics where possible.

@ -0,0 +1,56 @@
# ADR-0002: Separate Executors From Schedulers
## Status
Accepted
## Context
EmboFlow needs to support multiple runtime modes now and later:
- direct Python execution
- Docker-isolated execution
- HTTP-based execution
- local scheduling
- future Kubernetes scheduling
- future Volcano scheduling
If execution logic and scheduling logic are coupled together, migration from single-host operation to cluster operation becomes costly.
## Decision
The architecture will separate:
- Executor: how node logic runs
- Scheduler: where and under what dispatch policy tasks run
V1 executors:
- Python
- Docker
- HTTP
V1 scheduler:
- Local
Reserved future schedulers:
- Kubernetes
- Volcano
## Consequences
### Positive
- Cleaner evolution path
- Better runtime abstraction
- Less refactoring required for cluster migration
### Negative
- Slightly more abstraction in V1 than the immediate deployment requires
## Notes
User-injected code should default to Docker execution, while trusted platform logic may use Python execution.
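The separation can be sketched as two independent contracts (names are illustrative, not the final interfaces): the executor owns "how a node runs", the scheduler owns "where and under what policy".

```typescript
// Sketch: Executor ("how it runs") kept separate from Scheduler ("where/when it runs").
type TaskSpec = { taskId: string; runtime: "python" | "docker" | "http" };
type TaskResult = { taskId: string; status: "success" | "failed" };

interface Executor {
  runtime: TaskSpec["runtime"];
  execute(task: TaskSpec): TaskResult;
}

interface Scheduler {
  dispatch(tasks: TaskSpec[], executors: Executor[]): TaskResult[];
}

// V1: a local scheduler with a simple sequential dispatch policy.
class LocalScheduler implements Scheduler {
  dispatch(tasks: TaskSpec[], executors: Executor[]): TaskResult[] {
    return tasks.map((task) => {
      const executor = executors.find((e) => e.runtime === task.runtime);
      if (!executor) throw new Error(`No executor for runtime "${task.runtime}"`);
      return executor.execute(task);
    });
  }
}
```

A future Kubernetes or Volcano scheduler would replace only the `Scheduler` implementation; the executor contracts stay unchanged.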

21
design/README.md Normal file
@ -0,0 +1,21 @@
# EmboFlow Design Workspace
This directory stores project design materials before or alongside implementation.
## Structure
- `00-overview`: project goals, scope, milestones
- `01-product`: requirements, user stories, feature definitions
- `02-architecture`: system architecture, modules, technical constraints
- `03-workflows`: business flows, sequence diagrams, operational flows
- `04-ui-ux`: wireframes, interaction notes, UX decisions
- `05-data`: data model, entities, schema drafts
- `06-api`: API contracts, request/response drafts, integration notes
- `07-research`: competitive analysis, references, discovery notes
- `08-decisions`: ADRs and major tradeoff records
- `09-assets`: diagrams, exported images, attachments
- `templates`: reusable design document templates
## Suggested usage
Keep design artifacts in Markdown where possible so they diff cleanly in Git.

62
docker-compose.yml Normal file
@ -0,0 +1,62 @@
services:
  web:
    image: node:20-alpine
    working_dir: /workspace
    command: ["sh", "-c", "sleep infinity"]
    ports:
      - "${WEB_PORT:-3000}:3000"
    volumes:
      - .:/workspace
    depends_on:
      - api
  api:
    image: node:20-alpine
    working_dir: /workspace
    command: ["sh", "-c", "sleep infinity"]
    ports:
      - "${API_PORT:-3001}:3001"
    volumes:
      - .:/workspace
    depends_on:
      - mongo
  worker:
    image: node:20-alpine
    working_dir: /workspace
    command: ["sh", "-c", "sleep infinity"]
    ports:
      - "${WORKER_PORT:-3002}:3002"
    volumes:
      - .:/workspace
    depends_on:
      - mongo
      - minio
  mongo:
    image: mongo:7
    restart: unless-stopped
    ports:
      - "${MONGO_PORT:-27017}:27017"
    environment:
      MONGO_INITDB_ROOT_USERNAME: "${MONGO_ROOT_USERNAME:-emboflow}"
      MONGO_INITDB_ROOT_PASSWORD: "${MONGO_ROOT_PASSWORD:-emboflow}"
    volumes:
      - mongo-data:/data/db
  minio:
    image: minio/minio:RELEASE.2024-10-29T16-01-48Z
    restart: unless-stopped
    command: ["server", "/data", "--console-address", ":9001"]
    ports:
      - "${MINIO_PORT:-9000}:9000"
      - "${MINIO_CONSOLE_PORT:-9001}:9001"
    environment:
      MINIO_ROOT_USER: "${MINIO_ROOT_USER:-emboflow}"
      MINIO_ROOT_PASSWORD: "${MINIO_ROOT_PASSWORD:-emboflow123}"
    volumes:
      - minio-data:/data

volumes:
  mongo-data:
  minio-data:

@ -0,0 +1,96 @@
# EmboFlow Development Workflow
## Goal
Keep repository design artifacts and implementation changes aligned as EmboFlow evolves.
## Working Agreement
EmboFlow is being developed from explicit design documents under `design/`. Development should follow a doc-aware workflow instead of letting code drift ahead without recorded decisions.
## Standard Change Flow
### 1. Read Before Editing
Before changing code, review the design files that define the affected area:
- product scope
- architecture boundaries
- workflow model
- data model
- deployment model
- accepted ADRs
### 2. Identify Impact
Decide whether the change affects:
- product behavior
- object model
- workflow/run/task semantics
- node or plugin contract
- storage assumptions
- user or permission behavior
- deployment/runtime assumptions
If yes, the matching design files must be updated.
### 3. Change Code And Docs Together
Do not defer the design update. Treat design edits as part of the implementation, not follow-up cleanup.
### 4. Run The Consistency Check
From the repo root:
```bash
python3 scripts/check_doc_code_sync.py . --strict
```
Interpret warnings manually. The script is a guardrail, not a replacement for judgment.
### 5. Use The Local Hooks
Install local hooks once per clone:
```bash
bash scripts/install_hooks.sh
```
This enables:
- `commit-msg`: require English-only gitmoji commit messages
- `pre-commit`: block staged code/config drift without doc updates
- `pre-push`: run commit-message validation, doc/code sync checks, and repository tests
### 6. Close With Explicit Status
Every implementation summary should state one of:
- `Aligned`
- `Partially aligned`
- `Doc-first`
and name the exact design files that were reviewed or updated.
## EmboFlow-Specific Review Checklist
Before closing a non-trivial change, confirm whether any of these need updates:
- raw asset vs canonical dataset model
- workflow definition vs workflow run model
- node schema and plugin contract
- executor vs scheduler separation
- MongoDB collection or document shape
- workspace/project/user boundary
- deployment topology or storage assumptions
## Automation
This repository now uses both local and remote guardrails:
- local git hooks from `.githooks/`
- commit message validation
- CI checks in `.github/workflows/guardrails.yml`
These checks are intended to keep design documents, code changes, and commit history coherent.

@ -0,0 +1,621 @@
# EmboFlow V1 Foundation And MVP Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
**Goal:** Build the first usable EmboFlow increment: workspace-aware raw asset ingestion, workflow definition/versioning, local workflow execution, and the first web workflow authoring surfaces.
**Architecture:** Use a TypeScript monorepo with a React web app, a Node.js API control plane, and a separate Node.js worker. Use MongoDB as the only database, object storage abstraction for cloud storage or MinIO, and a local scheduler with Python and Docker executor contracts.
**Tech Stack:** pnpm workspace, React, TypeScript, React Flow, NestJS, Mongoose, MongoDB, Docker Compose, Python runtime hooks, unittest/Vitest/Jest-compatible project tests
---
### Task 1: Bootstrap The Monorepo And Runtime Skeleton
**Files:**
- Create: `package.json`
- Create: `pnpm-workspace.yaml`
- Create: `tsconfig.base.json`
- Create: `apps/web/package.json`
- Create: `apps/api/package.json`
- Create: `apps/worker/package.json`
- Create: `docker-compose.yml`
- Create: `.env.example`
- Test: `tests/test_repo_structure.py`
**Step 1: Write the failing test**
Create `tests/test_repo_structure.py` to assert the repository contains the expected top-level app folders and root workspace files.
**Step 2: Run test to verify it fails**
Run:
```bash
python3 -m unittest tests/test_repo_structure.py -v
```
Expected: FAIL because the monorepo files and app folders do not exist yet.
**Step 3: Write minimal implementation**
Create the pnpm workspace root, app package manifests, root TypeScript config, `.env.example`, and `docker-compose.yml` with services for:
- `web`
- `api`
- `worker`
- `mongo`
- `minio`
Keep the first version minimal. Do not add extra infra services that are not required by the design.
**Step 4: Run test to verify it passes**
Run:
```bash
python3 -m unittest tests/test_repo_structure.py -v
```
Expected: PASS
**Step 5: Commit**
```bash
git add package.json pnpm-workspace.yaml tsconfig.base.json apps docker-compose.yml .env.example tests/test_repo_structure.py
git commit -m ":tada: bootstrap workspace and runtime skeleton"
```
### Task 2: Create Shared Domain Contracts And Mongo Setup
**Files:**
- Create: `packages/contracts/package.json`
- Create: `packages/contracts/src/domain.ts`
- Create: `apps/api/src/common/mongo/mongo.module.ts`
- Create: `apps/api/src/common/mongo/schemas/workspace.schema.ts`
- Create: `apps/api/src/common/mongo/schemas/project.schema.ts`
- Create: `apps/api/src/common/mongo/schemas/asset.schema.ts`
- Create: `apps/api/src/common/mongo/schemas/workflow.schema.ts`
- Test: `apps/api/test/domain-contracts.spec.ts`
**Step 1: Write the failing test**
Create `apps/api/test/domain-contracts.spec.ts` asserting:
- workspace types include `personal` and `team`
- asset types include raw and dataset-style sources
- workflow status values match the design docs
**Step 2: Run test to verify it fails**
Run:
```bash
pnpm --filter api test domain-contracts.spec.ts
```
Expected: FAIL because contracts and schemas are missing.
**Step 3: Write minimal implementation**
Create shared domain enums and base Mongo schema definitions for:
- workspaces
- projects
- assets
- workflow definitions
Add a minimal Mongo module in the API app using environment-based connection config.
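A sketch of what `packages/contracts/src/domain.ts` might export. The workspace and asset values come from the design docs above; the exact run-status member list is an assumption for V1:

```typescript
// Illustrative shared contracts; member lists mirror the data-model design doc.
export const WORKSPACE_TYPES = ["personal", "team"] as const;
export type WorkspaceType = (typeof WORKSPACE_TYPES)[number];

export const ASSET_TYPES = [
  "raw_file",
  "archive",
  "folder",
  "video_collection",
  "standard_dataset",
  "rosbag",
  "hdf5_dataset",
  "object_storage_prefix",
] as const;
export type AssetType = (typeof ASSET_TYPES)[number];

// Workflow run lifecycle states (assumed set for V1).
export const RUN_STATUSES = ["pending", "running", "succeeded", "failed", "canceled"] as const;
export type RunStatus = (typeof RUN_STATUSES)[number];
```

The `as const` tuples let both the Mongoose schemas and the web app derive enums from one source.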
**Step 4: Run test to verify it passes**
Run:
```bash
pnpm --filter api test domain-contracts.spec.ts
```
Expected: PASS
**Step 5: Commit**
```bash
git add packages/contracts apps/api/src/common apps/api/test/domain-contracts.spec.ts
git commit -m ":sparkles: add shared domain contracts and mongo setup"
```
### Task 3: Implement Identity, Workspace, And Project APIs
**Files:**
- Create: `apps/api/src/modules/auth/auth.module.ts`
- Create: `apps/api/src/modules/auth/auth.controller.ts`
- Create: `apps/api/src/modules/workspaces/workspaces.module.ts`
- Create: `apps/api/src/modules/workspaces/workspaces.controller.ts`
- Create: `apps/api/src/modules/projects/projects.module.ts`
- Create: `apps/api/src/modules/projects/projects.controller.ts`
- Create: `apps/api/src/modules/projects/projects.service.ts`
- Test: `apps/api/test/projects.e2e-spec.ts`
**Step 1: Write the failing test**
Create `apps/api/test/projects.e2e-spec.ts` covering:
- create personal workspace bootstrap flow
- create project under a workspace
- reject project creation without a workspace id
**Step 2: Run test to verify it fails**
Run:
```bash
pnpm --filter api test projects.e2e-spec.ts
```
Expected: FAIL because the modules and endpoints do not exist yet.
**Step 3: Write minimal implementation**
Implement:
- development-safe auth stub or local auth module
- workspace creation and listing
- project creation and listing
- basic membership checks sufficient for V1 local development
Do not build a full production auth stack before the API shape is stable.
**Step 4: Run test to verify it passes**
Run:
```bash
pnpm --filter api test projects.e2e-spec.ts
```
Expected: PASS
**Step 5: Commit**
```bash
git add apps/api/src/modules/auth apps/api/src/modules/workspaces apps/api/src/modules/projects apps/api/test/projects.e2e-spec.ts
git commit -m ":sparkles: add workspace and project APIs"
```
### Task 4: Implement Asset Ingestion, Storage Abstraction, And Probe Metadata
**Files:**
- Create: `apps/api/src/modules/storage/storage.module.ts`
- Create: `apps/api/src/modules/storage/storage.service.ts`
- Create: `apps/api/src/modules/assets/assets.module.ts`
- Create: `apps/api/src/modules/assets/assets.controller.ts`
- Create: `apps/api/src/modules/assets/assets.service.ts`
- Create: `apps/api/src/modules/assets/probe/probe.service.ts`
- Create: `apps/api/src/common/mongo/schemas/asset-probe-report.schema.ts`
- Test: `apps/api/test/assets.e2e-spec.ts`
**Step 1: Write the failing test**
Create `apps/api/test/assets.e2e-spec.ts` covering:
- register an uploaded asset record
- create a probe report for a raw asset
- return recommended next actions from probe metadata
**Step 2: Run test to verify it fails**
Run:
```bash
pnpm --filter api test assets.e2e-spec.ts
```
Expected: FAIL because asset ingestion and probe services are missing.
**Step 3: Write minimal implementation**
Implement:
- storage abstraction interface
- MinIO/S3-compatible config contract
- asset create/list/detail endpoints
- probe-report persistence
- placeholder probe logic for directory and archive summaries
Do not build full binary upload optimization yet. First make the metadata contract stable.
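The placeholder probe could be as simple as summarizing a flat path listing. This is a sketch only; the extension-to-format mapping is an assumption, not the real detection logic:

```typescript
// Sketch: derive a coarse structure summary from a flat path listing.
const FORMAT_HINTS: Record<string, string> = {
  ".bag": "rosbag",
  ".h5": "hdf5",
  ".hdf5": "hdf5",
  ".mp4": "video",
  ".json": "json_metadata",
};

function probePaths(paths: string[]): {
  fileCount: number;
  topLevelPaths: string[];
  detectedFormats: string[];
} {
  const topLevel = new Set<string>();
  const formats = new Set<string>();
  for (const path of paths) {
    topLevel.add(path.split("/")[0]); // first segment only
    const dot = path.lastIndexOf(".");
    if (dot >= 0) {
      const hint = FORMAT_HINTS[path.slice(dot)];
      if (hint) formats.add(hint);
    }
  }
  return {
    fileCount: paths.length,
    topLevelPaths: [...topLevel].sort(),
    detectedFormats: [...formats].sort(),
  };
}
```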
**Step 4: Run test to verify it passes**
Run:
```bash
pnpm --filter api test assets.e2e-spec.ts
```
Expected: PASS
**Step 5: Commit**
```bash
git add apps/api/src/modules/storage apps/api/src/modules/assets apps/api/src/common/mongo/schemas/asset-probe-report.schema.ts apps/api/test/assets.e2e-spec.ts
git commit -m ":truck: add asset ingestion and probe metadata flow"
```
### Task 5: Implement Workflow Definitions, Versions, Runs, And Tasks
**Files:**
- Create: `apps/api/src/modules/workflows/workflows.module.ts`
- Create: `apps/api/src/modules/workflows/workflows.controller.ts`
- Create: `apps/api/src/modules/workflows/workflows.service.ts`
- Create: `apps/api/src/modules/runs/runs.module.ts`
- Create: `apps/api/src/modules/runs/runs.controller.ts`
- Create: `apps/api/src/modules/runs/runs.service.ts`
- Create: `apps/api/src/common/mongo/schemas/workflow-definition-version.schema.ts`
- Create: `apps/api/src/common/mongo/schemas/workflow-run.schema.ts`
- Create: `apps/api/src/common/mongo/schemas/run-task.schema.ts`
- Test: `apps/api/test/workflow-runs.e2e-spec.ts`
**Step 1: Write the failing test**
Create `apps/api/test/workflow-runs.e2e-spec.ts` covering:
- create workflow definition
- save workflow version
- create workflow run from saved version
- generate initial run tasks for ready nodes
**Step 2: Run test to verify it fails**
Run:
```bash
pnpm --filter api test workflow-runs.e2e-spec.ts
```
Expected: FAIL because workflow versioning and run creation do not exist yet.
**Step 3: Write minimal implementation**
Implement:
- workflow definition head record
- immutable workflow version snapshots
- run creation from a workflow version
- initial DAG compilation for simple source-to-transform chains
- run task persistence
Keep V1 graph compilation simple. Support sequential edges first, then one-level branching.
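The initial task generation can be sketched as: a node is ready when every upstream node it depends on has completed. A minimal sketch, assuming a flat edge list for the compiled graph:

```typescript
// Sketch: find nodes whose upstream dependencies are all satisfied.
type Edge = { from: string; to: string };

function readyNodes(nodeIds: string[], edges: Edge[], completed: Set<string>): string[] {
  return nodeIds.filter((id) => {
    if (completed.has(id)) return false; // already done
    // Ready only when every incoming edge comes from a completed node.
    return edges.filter((e) => e.to === id).every((e) => completed.has(e.from));
  });
}
```

On run creation, `completed` is empty, so only source nodes produce initial run tasks; the worker re-evaluates readiness as tasks finish.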
**Step 4: Run test to verify it passes**
Run:
```bash
pnpm --filter api test workflow-runs.e2e-spec.ts
```
Expected: PASS
**Step 5: Commit**
```bash
git add apps/api/src/modules/workflows apps/api/src/modules/runs apps/api/src/common/mongo/schemas/workflow-definition-version.schema.ts apps/api/src/common/mongo/schemas/workflow-run.schema.ts apps/api/src/common/mongo/schemas/run-task.schema.ts apps/api/test/workflow-runs.e2e-spec.ts
git commit -m ":sparkles: add workflow versioning and run records"
```
### Task 6: Add The Worker, Local Scheduler, And Executor Contracts
**Files:**
- Create: `apps/worker/src/main.ts`
- Create: `apps/worker/src/runner/task-runner.ts`
- Create: `apps/worker/src/scheduler/local-scheduler.ts`
- Create: `apps/worker/src/executors/python-executor.ts`
- Create: `apps/worker/src/executors/docker-executor.ts`
- Create: `apps/worker/src/executors/http-executor.ts`
- Create: `apps/worker/src/contracts/execution-context.ts`
- Test: `apps/worker/test/task-runner.spec.ts`
**Step 1: Write the failing test**
Create `apps/worker/test/task-runner.spec.ts` covering:
- worker loads pending tasks
- worker marks task running then success
- worker chooses executor based on node runtime config
**Step 2: Run test to verify it fails**
Run:
```bash
pnpm --filter worker test task-runner.spec.ts
```
Expected: FAIL because the worker runtime does not exist yet.
**Step 3: Write minimal implementation**
Implement:
- worker bootstrap
- polling or queue-backed local scheduler
- execution context builder
- stub Python, Docker, and HTTP executors
- task status transitions
Do not implement full Docker isolation logic in one step. First lock the runtime interfaces and transitions.
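The task status transitions in Step 3 can be sketched as a small legal-transition table (a sketch under the assumption that V1 uses a pending/running/terminal lifecycle; retries would re-enter via a new attempt):

```typescript
// Sketch: enforce the pending -> running -> succeeded/failed lifecycle.
const TRANSITIONS: Record<string, readonly string[]> = {
  pending: ["running"],
  running: ["succeeded", "failed"],
  succeeded: [], // terminal
  failed: [],    // terminal (a retry creates a new attempt, not a transition)
};

function transition(current: string, next: string): string {
  if (!(TRANSITIONS[current] ?? []).includes(next)) {
    throw new Error(`Illegal task transition: ${current} -> ${next}`);
  }
  return next;
}
```

Centralizing this guard keeps the scheduler, executors, and API from disagreeing about which status writes are legal.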
**Step 4: Run test to verify it passes**
Run:
```bash
pnpm --filter worker test task-runner.spec.ts
```
Expected: PASS
**Step 5: Commit**
```bash
git add apps/worker apps/api/src/modules/runs apps/worker/test/task-runner.spec.ts
git commit -m ":construction_worker: add local worker and executor contracts"
```
### Task 7: Build The Web Shell, Workspace Flow, And Asset Workspace
**Files:**
- Create: `apps/web/src/main.tsx`
- Create: `apps/web/src/app/router.tsx`
- Create: `apps/web/src/features/layout/app-shell.tsx`
- Create: `apps/web/src/features/workspaces/workspace-switcher.tsx`
- Create: `apps/web/src/features/projects/project-selector.tsx`
- Create: `apps/web/src/features/assets/assets-page.tsx`
- Create: `apps/web/src/features/assets/asset-detail-page.tsx`
- Create: `apps/web/src/features/assets/components/asset-list.tsx`
- Create: `apps/web/src/features/assets/components/asset-summary-panel.tsx`
- Test: `apps/web/src/features/assets/assets-page.test.tsx`
**Step 1: Write the failing test**
Create `apps/web/src/features/assets/assets-page.test.tsx` covering:
- app shell renders primary navigation
- assets page renders asset rows from API data
- asset detail page renders probe summary
**Step 2: Run test to verify it fails**
Run:
```bash
pnpm --filter web test assets-page.test.tsx
```
Expected: FAIL because the web app shell and pages do not exist yet.
**Step 3: Write minimal implementation**
Implement:
- web app bootstrap
- primary navigation matching the design docs
- workspace/project header controls
- asset list page
- asset detail page with summary and action buttons
Defer advanced preview renderers. Start with structured metadata and simple detail views.
**Step 4: Run test to verify it passes**
Run:
```bash
pnpm --filter web test assets-page.test.tsx
```
Expected: PASS
**Step 5: Commit**
```bash
git add apps/web apps/web/src/features/assets/assets-page.test.tsx
git commit -m ":sparkles: add web shell and asset workspace"
```
### Task 8: Build Canvas Authoring, Run Detail, And First Workflow Actions
**Files:**
- Create: `apps/web/src/features/workflows/workflows-page.tsx`
- Create: `apps/web/src/features/workflows/workflow-editor-page.tsx`
- Create: `apps/web/src/features/workflows/components/node-library.tsx`
- Create: `apps/web/src/features/workflows/components/workflow-canvas.tsx`
- Create: `apps/web/src/features/workflows/components/node-config-panel.tsx`
- Create: `apps/web/src/features/runs/run-detail-page.tsx`
- Create: `apps/web/src/features/runs/components/run-graph-view.tsx`
- Create: `apps/web/src/features/runs/components/task-log-panel.tsx`
- Test: `apps/web/src/features/workflows/workflow-editor-page.test.tsx`
**Step 1: Write the failing test**
Create `apps/web/src/features/workflows/workflow-editor-page.test.tsx` covering:
- node library renders categories
- node config panel opens when a node is selected
- run detail view shows node status badges from run data
**Step 2: Run test to verify it fails**
Run:
```bash
pnpm --filter web test workflow-editor-page.test.tsx
```
Expected: FAIL because the workflow editor and run detail pages do not exist yet.
**Step 3: Write minimal implementation**
Implement:
- workflow list page
- workflow editor page using React Flow
- left node library, center canvas, right config panel
- save workflow version action
- trigger workflow run action
- run detail page with graph and selected-node log panel
Keep the first editor scoped to V1 node categories and schema-driven config rendering.
**Step 4: Run test to verify it passes**
Run:
```bash
pnpm --filter web test workflow-editor-page.test.tsx
```
Expected: PASS
**Step 5: Commit**
```bash
git add apps/web/src/features/workflows apps/web/src/features/runs apps/web/src/features/workflows/workflow-editor-page.test.tsx
git commit -m ":sparkles: add canvas workflow editor and run detail pages"
```
### Task 9: Add Preview Surface, Delivery Nodes, And MVP Integration
**Files:**
- Create: `apps/api/src/modules/artifacts/artifacts.module.ts`
- Create: `apps/api/src/modules/artifacts/artifacts.controller.ts`
- Create: `apps/api/src/modules/artifacts/artifacts.service.ts`
- Create: `apps/web/src/features/explore/explore-page.tsx`
- Create: `apps/web/src/features/explore/renderers/json-renderer.tsx`
- Create: `apps/web/src/features/explore/renderers/video-renderer.tsx`
- Create: `apps/web/src/features/explore/renderers/directory-renderer.tsx`
- Create: `apps/api/src/modules/plugins/builtin/delivery-nodes.ts`
- Test: `apps/api/test/artifacts.e2e-spec.ts`
- Test: `apps/web/src/features/explore/explore-page.test.tsx`
**Step 1: Write the failing tests**
Create:
- `apps/api/test/artifacts.e2e-spec.ts` for artifact retrieval by producer
- `apps/web/src/features/explore/explore-page.test.tsx` for opening and rendering supported artifact types
**Step 2: Run tests to verify they fail**
Run:
```bash
pnpm --filter api test artifacts.e2e-spec.ts
pnpm --filter web test explore-page.test.tsx
```
Expected: FAIL because artifact APIs and explore renderers do not exist yet.
**Step 3: Write minimal implementation**
Implement:
- artifact module and lookup endpoints
- explore page
- JSON, directory, and video renderers
- built-in delivery-normalization node definitions for the V1 business path
Do not implement the full renderer plugin platform yet. Start with built-ins and stable renderer contracts.
**Step 4: Run tests to verify they pass**
Run:
```bash
pnpm --filter api test artifacts.e2e-spec.ts
pnpm --filter web test explore-page.test.tsx
```
Expected: PASS
**Step 5: Commit**
```bash
git add apps/api/src/modules/artifacts apps/api/src/modules/plugins/builtin/delivery-nodes.ts apps/api/test/artifacts.e2e-spec.ts apps/web/src/features/explore apps/web/src/features/explore/explore-page.test.tsx
git commit -m ":package: add explore surface and delivery artifacts"
```
### Task 10: Harden Guardrails, Docs, And Developer Entry Commands
**Files:**
- Modify: `CONTRIBUTING.md`
- Modify: `docs/development-workflow.md`
- Modify: `design/03-workflows/workflow-execution-model.md`
- Modify: `design/05-data/mongodb-data-model.md`
- Create: `Makefile`
- Create: `README.md`
- Test: `tests/test_dev_commands.py`
**Step 1: Write the failing test**
Create `tests/test_dev_commands.py` asserting:
- `Makefile` exposes expected local commands
- `README.md` documents bootstrap, hooks, test, and local run commands
**Step 2: Run test to verify it fails**
Run:
```bash
python3 -m unittest tests/test_dev_commands.py -v
```
Expected: FAIL because developer entry commands are not documented yet.
**Step 3: Write minimal implementation**
Add:
- `make bootstrap`
- `make test`
- `make dev-api`
- `make dev-web`
- `make dev-worker`
- `make guardrails`
Document the developer flow in `README.md` and update design docs if implementation decisions changed during Tasks 1-9.
**Step 4: Run test to verify it passes**
Run:
```bash
python3 -m unittest tests/test_dev_commands.py -v
```
Expected: PASS
**Step 5: Commit**
```bash
git add CONTRIBUTING.md docs/development-workflow.md design/03-workflows/workflow-execution-model.md design/05-data/mongodb-data-model.md Makefile README.md tests/test_dev_commands.py
git commit -m ":memo: add developer entry commands and bootstrap docs"
```
## Exit Criteria
The first implementation pass is complete when:
- a user can create a workspace and project
- a raw asset can be registered and probed
- a workflow can be created, versioned, and executed locally
- run tasks produce observable status and artifacts
- the web app exposes assets, workflows, runs, and basic explore views
- guardrails for docs, hooks, commit messages, and CI remain green
## Notes
- Keep commits small and use the repository gitmoji + English commit policy.
- Update design files in the same task where behavior or architecture changes.
- Do not add training execution before the V1 data workflow loop is stable.

package.json Normal file

@@ -0,0 +1,9 @@
{
  "name": "emboflow",
  "private": true,
  "version": "0.1.0",
  "packageManager": "pnpm@9.12.3",
  "scripts": {
    "test": "python3 -m unittest discover -s tests -p 'test_*.py'"
  }
}

pnpm-workspace.yaml Normal file

@@ -0,0 +1,3 @@
packages:
  - "apps/*"
  - "packages/*"

scripts/check_commit_message.py Executable file

@@ -0,0 +1,126 @@
#!/usr/bin/env python3
import argparse
import re
import subprocess
from pathlib import Path

SHORTCODE_PREFIX = re.compile(r"^:[a-z0-9_+-]+:\s+")
EMOJI_PREFIX = re.compile(r"^[\u2600-\u27BF\U0001F300-\U0001FAFF]\s+")


def strip_prefix(message: str) -> str:
    if SHORTCODE_PREFIX.match(message):
        return SHORTCODE_PREFIX.sub("", message, count=1)
    if EMOJI_PREFIX.match(message):
        return EMOJI_PREFIX.sub("", message, count=1)
    return message


def validate_message(message: str) -> list[str]:
    lines = [line.rstrip("\n") for line in message.splitlines()]
    cleaned_lines = [line for line in lines if line and not line.startswith("#")]
    if not cleaned_lines:
        return ["Commit message must not be empty."]
    subject = cleaned_lines[0]
    errors: list[str] = []
    if not SHORTCODE_PREFIX.match(subject) and not EMOJI_PREFIX.match(subject):
        errors.append("Commit subject must start with a gitmoji shortcode or emoji.")
    body = "\n".join(cleaned_lines)
    normalized = strip_prefix(subject) + ("\n" + "\n".join(cleaned_lines[1:]) if len(cleaned_lines) > 1 else "")
    try:
        normalized.encode("ascii")
    except UnicodeEncodeError:
        errors.append("Commit message must be written in English ASCII text after the gitmoji prefix.")
    if not strip_prefix(subject).strip():
        errors.append("Commit subject must include an English summary after the gitmoji prefix.")
    if re.search(r"[\u4e00-\u9fff]", body):
        errors.append("Commit message must not contain Chinese characters.")
    return errors


def read_message_file(path: Path) -> str:
    return path.read_text(encoding="utf-8")


def run_git(*args: str) -> list[str]:
    result = subprocess.run(
        ["git", *args],
        capture_output=True,
        text=True,
        check=False,
    )
    if result.returncode != 0:
        raise RuntimeError(result.stderr.strip() or "git command failed")
    return [line.strip() for line in result.stdout.splitlines() if line.strip()]


def commit_messages_from_range(rev_range: str) -> list[tuple[str, str]]:
    if ".." in rev_range:
        shas = run_git("rev-list", rev_range)
    else:
        shas = [rev_range]
    messages: list[tuple[str, str]] = []
    for sha in shas:
        message = subprocess.run(
            ["git", "log", "--format=%B", "-n", "1", sha],
            capture_output=True,
            text=True,
            check=False,
        )
        if message.returncode != 0:
            raise RuntimeError(message.stderr.strip() or "git log failed")
        messages.append((sha, message.stdout.strip()))
    return messages


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Validate commit message format.")
    parser.add_argument("--file", help="path to commit message file")
    parser.add_argument("--rev-range", help="git revision range or single commit")
    return parser.parse_args()


def main() -> int:
    args = parse_args()
    if bool(args.file) == bool(args.rev_range):
        print("Use exactly one of --file or --rev-range.")
        return 2
    failures: list[str] = []
    if args.file:
        message = read_message_file(Path(args.file))
        errors = validate_message(message)
        if errors:
            failures.extend(errors)
    else:
        for sha, message in commit_messages_from_range(args.rev_range):
            errors = validate_message(message)
            for error in errors:
                failures.append(f"{sha[:12]}: {error}")
    if failures:
        print("Commit message validation failed:")
        for failure in failures:
            print(f"  - {failure}")
        print("\nExpected format example:")
        print("  :sparkles: add hook templates and CI guardrails")
        return 1
    print("Commit message validation passed.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

scripts/check_doc_code_sync.py Executable file

@@ -0,0 +1,194 @@
#!/usr/bin/env python3
import argparse
import subprocess
from pathlib import Path

DOC_PATTERNS = (
    "design/",
    "docs/",
    "adr",
    "architecture",
    "prd",
    "spec",
    "plan",
)
CODE_SUFFIXES = {
    ".py",
    ".ts",
    ".tsx",
    ".js",
    ".jsx",
    ".java",
    ".go",
    ".rs",
    ".rb",
    ".php",
    ".kt",
    ".swift",
    ".scala",
    ".sh",
}
CODE_HINTS = ("apps/", "packages/", "scripts/")
TEST_HINTS = ("test", "spec", "__tests__", "tests/")
CONFIG_SUFFIXES = {".yml", ".yaml", ".json", ".toml", ".ini", ".env"}
CONFIG_HINTS = ("docker", "compose", "k8s", "helm", "terraform", ".github/", ".githooks/", ".env")


def run_git(repo: Path, *args: str) -> list[str]:
    result = subprocess.run(
        ["git", "-C", str(repo), *args],
        capture_output=True,
        text=True,
        check=False,
    )
    if result.returncode != 0:
        raise RuntimeError(result.stderr.strip() or "git command failed")
    return [line.strip() for line in result.stdout.splitlines() if line.strip()]


def classify(path_text: str) -> str:
    lower = path_text.lower()
    path = Path(path_text)
    if any(token in lower for token in DOC_PATTERNS) or path.suffix == ".md":
        return "docs"
    if any(token in lower for token in TEST_HINTS):
        return "tests"
    if any(token in lower for token in CODE_HINTS):
        return "code"
    if path.suffix in CODE_SUFFIXES:
        return "code"
    if path.suffix in CONFIG_SUFFIXES or any(token in lower for token in CONFIG_HINTS):
        return "config"
    return "other"


def print_group(title: str, items: list[str]) -> None:
    print(f"\n{title}:")
    if not items:
        print("  - none")
        return
    for item in items:
        print(f"  - {item}")


def assess_changes(
    docs: list[str],
    code: list[str],
    tests: list[str],
    config: list[str],
    other: list[str],
    strict: bool,
) -> dict:
    warnings: list[str] = []
    blockers: list[str] = []
    if code and not docs:
        message = "Code changed but no design/doc files changed."
        warnings.append(message)
        if strict:
            blockers.append(message)
    if config and not docs:
        message = "Config or deployment files changed without any doc updates."
        warnings.append(message)
        if strict:
            blockers.append(message)
    if docs and not code and not config and not tests:
        warnings.append(
            "Docs changed without code changes. This may be intentional, but verify they still match the repository."
        )
    if code and not tests:
        warnings.append(
            "Code changed without any test-file changes. Verify whether tests should change."
        )
    if other:
        warnings.append(
            "Unclassified files changed. Confirm they do not affect documented behavior or runtime assumptions."
        )
    return {
        "warnings": warnings,
        "blockers": blockers,
        "blocking": bool(blockers),
    }


def collect_paths(repo: Path, args: argparse.Namespace) -> list[str]:
    if args.staged:
        return run_git(repo, "diff", "--cached", "--name-only", "--diff-filter=ACMR")
    if args.base_ref:
        return run_git(repo, "diff", "--name-only", "--diff-filter=ACMR", f"{args.base_ref}...HEAD")
    if args.rev_range:
        if ".." in args.rev_range:
            return run_git(repo, "diff", "--name-only", "--diff-filter=ACMR", args.rev_range)
        return run_git(repo, "diff-tree", "--no-commit-id", "--name-only", "-r", args.rev_range)
    changed = run_git(repo, "status", "--short")
    # run_git already strips leading whitespace, so split on the status prefix
    # instead of slicing a fixed three-character column.
    return sorted({line.split(maxsplit=1)[1] for line in changed if " " in line})


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Check whether doc changes track code changes.")
    parser.add_argument("repo", nargs="?", default=".", help="git repository path")
    parser.add_argument("--strict", action="store_true", help="fail on blocking drift")
    parser.add_argument("--staged", action="store_true", help="inspect staged files only")
    parser.add_argument("--base-ref", help="compare changes from base ref to HEAD")
    parser.add_argument("--rev-range", help="inspect a git revision range or a single commit")
    return parser.parse_args()


def main() -> int:
    args = parse_args()
    repo = Path(args.repo).expanduser().resolve()
    if not (repo / ".git").exists():
        print(f"Not a git repository: {repo}")
        return 2
    paths = sorted(set(collect_paths(repo, args)))
    docs = [p for p in paths if classify(p) == "docs"]
    code = [p for p in paths if classify(p) == "code"]
    tests = [p for p in paths if classify(p) == "tests"]
    config = [p for p in paths if classify(p) == "config"]
    other = [p for p in paths if classify(p) == "other"]
    assessment = assess_changes(docs, code, tests, config, other, args.strict)
    print(f"Repository: {repo}")
    print(f"Changed files: {len(paths)}")
    print_group("Design and doc files", docs)
    print_group("Code files", code)
    print_group("Test files", tests)
    print_group("Config and infra files", config)
    print_group("Other files", other)
    print("\nAssessment:")
    if not assessment["warnings"]:
        print("  - No obvious doc/code drift detected from changed-file classification.")
    else:
        for warning in assessment["warnings"]:
            print(f"  - {warning}")
    print("\nNext actions:")
    if code and not docs:
        print("  - Review design/ or docs/ and update affected architecture, workflow, or API notes.")
    if docs:
        print("  - Confirm each changed doc still matches the actual implementation.")
    if code:
        print("  - Confirm changed code paths match documented workflow, schema, and runtime assumptions.")
    if other:
        print("  - Review unclassified paths and decide whether docs or tests should be updated.")
    if assessment["blocking"]:
        print("\nResult: blocking drift detected.")
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

scripts/install_hooks.sh Normal file

@@ -0,0 +1,12 @@
#!/usr/bin/env bash
set -euo pipefail
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
git -C "$repo_root" config core.hooksPath .githooks
chmod +x "$repo_root"/.githooks/*
chmod +x "$repo_root"/scripts/check_doc_code_sync.py
chmod +x "$repo_root"/scripts/check_commit_message.py
echo "Installed local git hooks from .githooks"
echo "Active hooks path: $(git -C "$repo_root" config core.hooksPath)"


@@ -0,0 +1,40 @@
import importlib.util
from pathlib import Path
import unittest


def load_module(module_name: str, path: Path):
    spec = importlib.util.spec_from_file_location(module_name, path)
    module = importlib.util.module_from_spec(spec)
    assert spec.loader is not None
    spec.loader.exec_module(module)
    return module


REPO_ROOT = Path(__file__).resolve().parents[1]
MODULE = load_module(
    "check_commit_message",
    REPO_ROOT / "scripts" / "check_commit_message.py",
)


class CommitMessageValidationTests(unittest.TestCase):
    def test_accepts_gitmoji_shortcode_with_english_message(self):
        errors = MODULE.validate_message(":sparkles: add local hook templates")
        self.assertEqual(errors, [])

    def test_accepts_unicode_gitmoji_with_english_message(self):
        errors = MODULE.validate_message("✨ add ci validation for hooks")
        self.assertEqual(errors, [])

    def test_rejects_message_without_gitmoji_prefix(self):
        errors = MODULE.validate_message("add local hook templates")
        self.assertTrue(any("gitmoji" in error.lower() for error in errors))

    def test_rejects_non_english_message(self):
        errors = MODULE.validate_message(":sparkles: 添加本地 hook")
        self.assertTrue(any("english" in error.lower() for error in errors))


if __name__ == "__main__":
    unittest.main()


@@ -0,0 +1,55 @@
import importlib.util
from pathlib import Path
import unittest


def load_module(module_name: str, path: Path):
    spec = importlib.util.spec_from_file_location(module_name, path)
    module = importlib.util.module_from_spec(spec)
    assert spec.loader is not None
    spec.loader.exec_module(module)
    return module


REPO_ROOT = Path(__file__).resolve().parents[1]
MODULE = load_module(
    "check_doc_code_sync",
    REPO_ROOT / "scripts" / "check_doc_code_sync.py",
)


class DocCodeSyncAssessmentTests(unittest.TestCase):
    def test_classifies_python_scripts_as_code(self):
        self.assertEqual(MODULE.classify("scripts/check_doc_code_sync.py"), "code")

    def test_classifies_app_paths_as_code(self):
        self.assertEqual(MODULE.classify("apps/web/package.json"), "code")

    def test_classifies_env_example_as_config(self):
        self.assertEqual(MODULE.classify(".env.example"), "config")

    def test_strict_mode_blocks_code_without_doc_updates(self):
        assessment = MODULE.assess_changes(
            docs=[],
            code=["src/app.ts"],
            tests=[],
            config=[],
            other=[],
            strict=True,
        )
        self.assertTrue(assessment["blocking"])

    def test_doc_and_code_changes_together_do_not_block(self):
        assessment = MODULE.assess_changes(
            docs=["design/02-architecture/system-architecture.md"],
            code=["src/app.ts"],
            tests=[],
            config=[],
            other=[],
            strict=True,
        )
        self.assertFalse(assessment["blocking"])


if __name__ == "__main__":
    unittest.main()


@@ -0,0 +1,35 @@
from pathlib import Path
import unittest

REPO_ROOT = Path(__file__).resolve().parents[1]


class RepoStructureTests(unittest.TestCase):
    def test_root_workspace_files_exist(self):
        required_files = [
            "package.json",
            "pnpm-workspace.yaml",
            "tsconfig.base.json",
            "docker-compose.yml",
            ".env.example",
        ]
        for relative_path in required_files:
            with self.subTest(path=relative_path):
                self.assertTrue((REPO_ROOT / relative_path).is_file())

    def test_app_package_manifests_exist(self):
        required_files = [
            "apps/web/package.json",
            "apps/api/package.json",
            "apps/worker/package.json",
        ]
        for relative_path in required_files:
            with self.subTest(path=relative_path):
                self.assertTrue((REPO_ROOT / relative_path).is_file())


if __name__ == "__main__":
    unittest.main()

tsconfig.base.json Normal file

@@ -0,0 +1,12 @@
{
  "compilerOptions": {
    "target": "ES2022",
    "module": "ESNext",
    "moduleResolution": "Bundler",
    "strict": true,
    "esModuleInterop": true,
    "resolveJsonModule": true,
    "skipLibCheck": true,
    "baseUrl": "."
  }
}