🎉 feat: initialize foundation docs guardrails and workspace skeleton
This commit is contained in:
commit
f41816bbd9
17
.env.example
Normal file
17
.env.example
Normal file
@ -0,0 +1,17 @@
|
||||
NODE_ENV=development
|
||||
|
||||
WEB_PORT=3000
|
||||
API_PORT=3001
|
||||
WORKER_PORT=3002
|
||||
|
||||
MONGO_PORT=27017
|
||||
MONGO_DB=emboflow
|
||||
MONGO_ROOT_USERNAME=emboflow
|
||||
MONGO_ROOT_PASSWORD=emboflow
|
||||
|
||||
MINIO_PORT=9000
|
||||
MINIO_CONSOLE_PORT=9001
|
||||
MINIO_ROOT_USER=emboflow
|
||||
MINIO_ROOT_PASSWORD=emboflow123
|
||||
|
||||
STORAGE_PROVIDER=minio
|
||||
7
.githooks/commit-msg
Executable file
7
.githooks/commit-msg
Executable file
@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
repo_root="$(git rev-parse --show-toplevel)"
|
||||
cd "$repo_root"
|
||||
|
||||
python3 scripts/check_commit_message.py --file "$1"
|
||||
7
.githooks/pre-commit
Executable file
7
.githooks/pre-commit
Executable file
@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
repo_root="$(git rev-parse --show-toplevel)"
|
||||
cd "$repo_root"
|
||||
|
||||
python3 scripts/check_doc_code_sync.py . --staged --strict
|
||||
19
.githooks/pre-push
Executable file
19
.githooks/pre-push
Executable file
@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
repo_root="$(git rev-parse --show-toplevel)"
|
||||
cd "$repo_root"
|
||||
|
||||
if git rev-parse --abbrev-ref --symbolic-full-name "@{upstream}" >/dev/null 2>&1; then
|
||||
base_ref="$(git rev-parse --abbrev-ref --symbolic-full-name "@{upstream}")"
|
||||
python3 scripts/check_doc_code_sync.py . --base-ref "$base_ref" --strict
|
||||
python3 scripts/check_commit_message.py --rev-range "$base_ref..HEAD"
|
||||
elif git rev-parse HEAD~1 >/dev/null 2>&1; then
|
||||
python3 scripts/check_doc_code_sync.py . --base-ref HEAD~1 --strict
|
||||
python3 scripts/check_commit_message.py --rev-range "HEAD~1..HEAD"
|
||||
else
|
||||
python3 scripts/check_doc_code_sync.py . --rev-range HEAD --strict
|
||||
python3 scripts/check_commit_message.py --rev-range HEAD
|
||||
fi
|
||||
|
||||
python3 -m unittest discover -s tests -p 'test_*.py'
|
||||
40
.github/pull_request_template.md
vendored
Normal file
40
.github/pull_request_template.md
vendored
Normal file
@ -0,0 +1,40 @@
|
||||
# Summary
|
||||
|
||||
- Describe the change in clear English.
|
||||
- Explain the user-visible or system-level impact.
|
||||
|
||||
# Design Sync
|
||||
|
||||
- [ ] I reviewed the relevant files under `design/` before implementing.
|
||||
- [ ] I updated the affected design or docs files in the same change set, or I confirmed no design update was required.
|
||||
- [ ] I ran `python3 scripts/check_doc_code_sync.py . --strict`.
|
||||
|
||||
Design files reviewed or updated:
|
||||
|
||||
- ``
|
||||
|
||||
If design and code are not fully aligned yet, explain the gap:
|
||||
|
||||
-
|
||||
|
||||
# Validation
|
||||
|
||||
- [ ] I ran local checks relevant to this change.
|
||||
- [ ] I ran `bash scripts/install_hooks.sh` in this clone or already had the repo hooks installed.
|
||||
- [ ] My commit messages in this PR are English-only and use a gitmoji prefix.
|
||||
|
||||
Commands run:
|
||||
|
||||
```bash
|
||||
# paste commands here
|
||||
```
|
||||
|
||||
# Scope Checklist
|
||||
|
||||
- [ ] This PR updates behavior, contracts, or runtime assumptions intentionally.
|
||||
- [ ] This PR does not silently break documented architecture or workflow assumptions.
|
||||
- [ ] This PR includes tests if behavior changed, or I confirmed tests were not required.
|
||||
|
||||
# Notes For Reviewers
|
||||
|
||||
- Call out any risky areas, follow-up work, or unresolved assumptions.
|
||||
45
.github/workflows/guardrails.yml
vendored
Normal file
45
.github/workflows/guardrails.yml
vendored
Normal file
@ -0,0 +1,45 @@
|
||||
name: Guardrails
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
|
||||
jobs:
|
||||
repository-guardrails:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Check out repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Compute git range
|
||||
id: git_range
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${GITHUB_EVENT_NAME}" = "pull_request" ]; then
|
||||
RANGE="${{ github.event.pull_request.base.sha }}..${{ github.sha }}"
|
||||
elif [ "${{ github.event.before }}" != "0000000000000000000000000000000000000000" ]; then
|
||||
RANGE="${{ github.event.before }}..${{ github.sha }}"
|
||||
else
|
||||
RANGE="${{ github.sha }}"
|
||||
fi
|
||||
echo "range=${RANGE}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Validate commit messages
|
||||
run: |
|
||||
python3 scripts/check_commit_message.py --rev-range "${{ steps.git_range.outputs.range }}"
|
||||
|
||||
- name: Validate design and code sync
|
||||
run: |
|
||||
python3 scripts/check_doc_code_sync.py . --rev-range "${{ steps.git_range.outputs.range }}" --strict
|
||||
|
||||
- name: Run repository tests
|
||||
run: |
|
||||
python3 -m unittest discover -s tests -p 'test_*.py'
|
||||
93
CONTRIBUTING.md
Normal file
93
CONTRIBUTING.md
Normal file
@ -0,0 +1,93 @@
|
||||
# Contributing To EmboFlow
|
||||
|
||||
## Core Rule
|
||||
|
||||
Keep `design/` and implementation aligned in the same change set.
|
||||
|
||||
Do not treat design files as background notes. If a code change affects product behavior, workflow behavior, data models, contracts, runtime assumptions, permissions, or deployment assumptions, update the corresponding design documents before closing the task.
|
||||
|
||||
## Required Workflow
|
||||
|
||||
1. Read the relevant files under `design/` before implementing.
|
||||
2. Summarize the intended contract you are changing.
|
||||
3. Implement the code change.
|
||||
4. Update the affected design files in the same work session.
|
||||
5. Install the local git hooks once per clone:
|
||||
|
||||
```bash
|
||||
bash scripts/install_hooks.sh
|
||||
```
|
||||
|
||||
6. Use English-only commit messages with a gitmoji prefix, for example:
|
||||
|
||||
```text
|
||||
:sparkles: add workflow guardrails and CI checks
|
||||
```
|
||||
|
||||
7. Run the local sync check when needed:
|
||||
|
||||
```bash
|
||||
python3 scripts/check_doc_code_sync.py . --strict
|
||||
```
|
||||
|
||||
8. If design and code still diverge, document that explicitly in your final summary.
|
||||
|
||||
## When Design Updates Are Required
|
||||
|
||||
Update design files when a change affects:
|
||||
|
||||
- user-visible behavior
|
||||
- workflow nodes or execution paths
|
||||
- data model or storage structure
|
||||
- API or schema contracts
|
||||
- plugin or executor behavior
|
||||
- workspace, project, or permission rules
|
||||
- deployment or runtime assumptions
|
||||
|
||||
## When Design Updates May Be Skipped
|
||||
|
||||
Design updates are usually not required for:
|
||||
|
||||
- pure refactors with no behavior change
|
||||
- test-only changes
|
||||
- formatting, comments, and naming cleanup
|
||||
|
||||
Even in those cases, verify that no documented statement became false indirectly.
|
||||
|
||||
## Primary Design Locations
|
||||
|
||||
- `design/00-overview/`
|
||||
- `design/01-product/`
|
||||
- `design/02-architecture/`
|
||||
- `design/03-workflows/`
|
||||
- `design/05-data/`
|
||||
- `design/08-decisions/`
|
||||
|
||||
## Local Tooling
|
||||
|
||||
This repository includes:
|
||||
|
||||
- git hook templates under `.githooks/`
|
||||
- a hook installer:
|
||||
|
||||
```bash
|
||||
bash scripts/install_hooks.sh
|
||||
```
|
||||
|
||||
- a design/code sync checker:
|
||||
|
||||
```bash
|
||||
python3 scripts/check_doc_code_sync.py . --strict
|
||||
```
|
||||
|
||||
- a commit message validator:
|
||||
|
||||
```bash
|
||||
python3 scripts/check_commit_message.py --rev-range HEAD
|
||||
```
|
||||
|
||||
The hooks and CI enforce:
|
||||
|
||||
- English-only commit messages with a gitmoji prefix
|
||||
- design/code consistency checks
|
||||
- repository unit tests before push
|
||||
8
apps/api/package.json
Normal file
8
apps/api/package.json
Normal file
@ -0,0 +1,8 @@
|
||||
{
|
||||
"name": "@emboflow/api",
|
||||
"private": true,
|
||||
"version": "0.1.0",
|
||||
"scripts": {
|
||||
"dev": "echo 'api app scaffold pending'"
|
||||
}
|
||||
}
|
||||
8
apps/web/package.json
Normal file
8
apps/web/package.json
Normal file
@ -0,0 +1,8 @@
|
||||
{
|
||||
"name": "@emboflow/web",
|
||||
"private": true,
|
||||
"version": "0.1.0",
|
||||
"scripts": {
|
||||
"dev": "echo 'web app scaffold pending'"
|
||||
}
|
||||
}
|
||||
8
apps/worker/package.json
Normal file
8
apps/worker/package.json
Normal file
@ -0,0 +1,8 @@
|
||||
{
|
||||
"name": "@emboflow/worker",
|
||||
"private": true,
|
||||
"version": "0.1.0",
|
||||
"scripts": {
|
||||
"dev": "echo 'worker app scaffold pending'"
|
||||
}
|
||||
}
|
||||
1
design/00-overview/.gitkeep
Normal file
1
design/00-overview/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
70
design/00-overview/emboflow-platform-overview.md
Normal file
70
design/00-overview/emboflow-platform-overview.md
Normal file
@ -0,0 +1,70 @@
|
||||
# EmboFlow Platform Overview
|
||||
|
||||
## Positioning
|
||||
|
||||
EmboFlow is a browser-based embodied data engineering platform for ingesting raw assets, organizing dataset workflows on a visual canvas, processing and converting data, annotating and inspecting results, exporting normalized artifacts, and generating downstream training configurations.
|
||||
|
||||
The platform is designed around plugin-based extensibility, but the first version should deliver a stable built-in core before opening broader extension surfaces.
|
||||
|
||||
## Primary Users
|
||||
|
||||
- Individual engineers building embodied datasets
|
||||
- Team operators managing collection, preprocessing, delivery, and annotation workflows
|
||||
- Data engineering teams that need repeatable conversion and packaging pipelines
|
||||
- Teams preparing datasets for external training systems
|
||||
|
||||
## V1 Product Goal
|
||||
|
||||
Build a usable end-to-end platform that allows users to:
|
||||
|
||||
1. Log into a personal or team workspace
|
||||
2. Create a project
|
||||
3. Upload or import raw embodied data assets
|
||||
4. Auto-detect asset structure and generate preview summaries
|
||||
5. Compose processing pipelines on a canvas
|
||||
6. Configure node parameters and inject code into processing nodes
|
||||
7. Execute workflows asynchronously and inspect logs and outputs
|
||||
8. Export normalized delivery packages, training datasets, or training config files
|
||||
|
||||
## Supported Input Formats in V1
|
||||
|
||||
- RLDS
|
||||
- LeRobot v2/v3
|
||||
- HDF5
|
||||
- Rosbag
|
||||
- Raw video folders and delivery-style directory packages
|
||||
- Compressed archives containing the above
|
||||
|
||||
## Core Product Principles
|
||||
|
||||
- Raw assets are first-class objects
|
||||
- Canonical semantic datasets are derived, not assumed
|
||||
- Visualization can operate directly on raw assets
|
||||
- Workflow execution is asynchronous and traceable
|
||||
- Plugins are versioned and managed
|
||||
- User-injected code is supported with strict runtime boundaries
|
||||
- Training execution is out of scope for V1, but training handoff is in scope
|
||||
|
||||
## Major Workspaces
|
||||
|
||||
- Asset Workspace: upload, import, scan, probe, browse
|
||||
- Canvas Workspace: build and run workflows
|
||||
- Explore Workspace: inspect raw assets and processed outputs
|
||||
- Label Workspace: create and review annotation tasks
|
||||
- Admin Workspace: users, workspaces, plugins, storage, runtime settings
|
||||
|
||||
## V1 Output Types
|
||||
|
||||
- Standardized embodied dataset exports
|
||||
- Customer delivery packages
|
||||
- Validation and quality reports
|
||||
- Annotation artifacts
|
||||
- Training configuration packages for downstream training systems
|
||||
|
||||
## Non-Goals for V1
|
||||
|
||||
- Built-in training execution orchestration
|
||||
- Real-time collaborative editing on the same canvas
|
||||
- Public plugin marketplace
|
||||
- Fully generalized MLOps lifecycle management
|
||||
- Advanced distributed scheduling in the first deployment
|
||||
1
design/01-product/.gitkeep
Normal file
1
design/01-product/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
90
design/01-product/v1-scope-and-mvp.md
Normal file
90
design/01-product/v1-scope-and-mvp.md
Normal file
@ -0,0 +1,90 @@
|
||||
# EmboFlow V1 Scope And MVP
|
||||
|
||||
## MVP Definition
|
||||
|
||||
The first release should prove that EmboFlow can turn raw embodied data assets into structured outputs through a visual workflow engine.
|
||||
|
||||
### MVP Success Path
|
||||
|
||||
1. A user signs into a workspace
|
||||
2. The user creates a project
|
||||
3. The user uploads or imports a raw asset
|
||||
4. The platform probes the asset and generates a structure summary
|
||||
5. The user previews the asset
|
||||
6. The user composes a canvas workflow
|
||||
7. The workflow executes asynchronously
|
||||
8. The user reviews logs, outputs, and generated artifacts
|
||||
9. The user exports a normalized dataset, delivery package, or training config
|
||||
|
||||
## In Scope For V1
|
||||
|
||||
- User login and workspace model
|
||||
- Personal and team workspaces
|
||||
- Project resource isolation
|
||||
- Raw asset upload and import
|
||||
- Object storage integration
|
||||
- Asset probing and structure detection
|
||||
- Raw asset preview
|
||||
- Canvas workflow editor
|
||||
- Built-in node library for ingest, transform, inspect, export
|
||||
- Node configuration through schema-driven forms
|
||||
- Code injection for processing nodes
|
||||
- Workflow run orchestration
|
||||
- Logs, status, retries, and artifact tracking
|
||||
- Dataset conversion and delivery-package normalization
|
||||
- Training config export
|
||||
- Plugin registration skeleton
|
||||
|
||||
## Important Business Scenarios
|
||||
|
||||
### Embodied Dataset Conversion
|
||||
|
||||
- Import RLDS, LeRobot, HDF5, or Rosbag
|
||||
- Map to canonical semantics
|
||||
- Export to target dataset format
|
||||
|
||||
### Delivery Package Normalization
|
||||
|
||||
- Import customer-provided raw directory or archive
|
||||
- Rename top-level folders
|
||||
- Validate required file structure
|
||||
- Validate metadata files
|
||||
- Check video file quality and naming
|
||||
- Export or upload normalized package
|
||||
|
||||
### Data Processing Workflow Authoring
|
||||
|
||||
- Drag nodes onto canvas
|
||||
- Connect nodes into DAG
|
||||
- Tune parameters
|
||||
- Inject code into processing nodes
|
||||
- Re-run pipeline with traceable history
|
||||
|
||||
## V1 Modules To Build Deeply
|
||||
|
||||
- Identity and workspace management
|
||||
- Asset ingestion and probing
|
||||
- Workflow editor and node model
|
||||
- Execution engine
|
||||
- Built-in dataset conversion nodes
|
||||
- Built-in delivery normalization nodes
|
||||
- Preview and inspection
|
||||
- Artifact export
|
||||
|
||||
## V1 Modules To Keep Lightweight
|
||||
|
||||
- Annotation
|
||||
- Collaboration
|
||||
- Plugin lifecycle UX
|
||||
- Advanced analytics
|
||||
- Kubernetes and Volcano scheduling adapters
|
||||
- Advanced multi-sensor synchronized visual analytics
|
||||
|
||||
## Explicit V1 Exclusions
|
||||
|
||||
- Platform-managed training execution
|
||||
- Real-time multi-user canvas co-editing
|
||||
- Full marketplace for third-party plugins
|
||||
- Complex enterprise approval workflows
|
||||
- Streaming data processing
|
||||
- Large-scale distributed execution as a deployment requirement
|
||||
1
design/02-architecture/.gitkeep
Normal file
1
design/02-architecture/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
115
design/02-architecture/deployment-architecture.md
Normal file
115
design/02-architecture/deployment-architecture.md
Normal file
@ -0,0 +1,115 @@
|
||||
# EmboFlow Deployment Architecture
|
||||
|
||||
## V1 Deployment Target
|
||||
|
||||
The first deployment target is a single public server. The platform should be deployed in a way that is operationally simple now and migration-friendly later.
|
||||
|
||||
## Recommended V1 Deployment Topology
|
||||
|
||||
- Reverse proxy
|
||||
- Web frontend service
|
||||
- API service
|
||||
- Worker service
|
||||
- MongoDB
|
||||
- Optional MinIO
|
||||
- Host Docker runtime for execution containers
|
||||
|
||||
## Deployment Principles
|
||||
|
||||
- Single-host deployment first
|
||||
- All major services containerized
|
||||
- Persistent state mounted on host volumes
|
||||
- Object storage can be external or self-hosted
|
||||
- Execution workers separated from API service
|
||||
- Future scheduler migration should not require domain model changes
|
||||
|
||||
## Recommended Runtime Layout
|
||||
|
||||
### Edge
|
||||
|
||||
- Nginx or equivalent reverse proxy
|
||||
- HTTPS termination
|
||||
- Static web delivery or web upstream routing
|
||||
|
||||
### Application
|
||||
|
||||
- `web`
|
||||
- `api`
|
||||
- `worker`
|
||||
|
||||
### Data
|
||||
|
||||
- `mongo`
|
||||
- `minio` optional
|
||||
|
||||
## Object Storage Strategy
|
||||
|
||||
The product should support both:
|
||||
|
||||
- Cloud object storage such as BOS or S3-compatible services
|
||||
- Self-hosted MinIO for development, demos, or private deployment
|
||||
|
||||
The application should expose a unified storage abstraction instead of embedding provider-specific logic across modules.
|
||||
|
||||
## Local Scheduler In V1
|
||||
|
||||
V1 should use a local scheduler. Worker processes execute tasks on the same deployment host.
|
||||
|
||||
Design constraints:
|
||||
|
||||
- RuntimeSpec must already exist
|
||||
- Scheduler abstraction must already exist
|
||||
- Docker executor must already be scheduler-compatible
|
||||
|
||||
This keeps future migration to Kubernetes or Volcano feasible.
|
||||
|
||||
## Host-Level Persistent Directories
|
||||
|
||||
Recommended host directories:
|
||||
|
||||
- application config
|
||||
- mongodb data
|
||||
- minio data
|
||||
- uploaded file staging
|
||||
- execution temp workspace
|
||||
- logs
|
||||
- backup data
|
||||
|
||||
## Execution Isolation
|
||||
|
||||
The host Docker runtime serves two different purposes:
|
||||
|
||||
- Running the platform deployment stack
|
||||
- Running task execution containers
|
||||
|
||||
These must be treated as separate concerns in configuration and security design.
|
||||
|
||||
## Future Migration Path
|
||||
|
||||
### Stage 1
|
||||
|
||||
- Single-host deployment
|
||||
- Local scheduler
|
||||
- Docker executor
|
||||
|
||||
### Stage 2
|
||||
|
||||
- Kubernetes-based service deployment
|
||||
- Kubernetes scheduler adapter for workflow tasks
|
||||
|
||||
### Stage 3
|
||||
|
||||
- Volcano scheduler adapter
|
||||
- Better support for large batch jobs and training-adjacent workloads
|
||||
|
||||
## Operational Baseline
|
||||
|
||||
V1 should provide basic operational support for:
|
||||
|
||||
- health checks
|
||||
- service restart
|
||||
- execution failure visibility
|
||||
- disk space monitoring
|
||||
- object storage connectivity checks
|
||||
- MongoDB backup and restore procedures
|
||||
- worker online status
|
||||
200
design/02-architecture/system-architecture.md
Normal file
200
design/02-architecture/system-architecture.md
Normal file
@ -0,0 +1,200 @@
|
||||
# EmboFlow System Architecture
|
||||
|
||||
## Architecture Style
|
||||
|
||||
EmboFlow V1 is a browser/server platform built as:
|
||||
|
||||
- Web frontend
|
||||
- Modular backend control plane
|
||||
- Independent worker runtime
|
||||
- MongoDB as the only database
|
||||
- Object storage abstraction over cloud object storage or MinIO
|
||||
- Local scheduler in V1 with future migration path to Kubernetes and Volcano
|
||||
|
||||
The architecture should preserve clear service boundaries even if V1 is implemented as a modular monolith plus workers.
|
||||
|
||||
## High-Level Layers
|
||||
|
||||
### Frontend Layer
|
||||
|
||||
- Asset workspace
|
||||
- Canvas workspace
|
||||
- Explore workspace
|
||||
- Label workspace
|
||||
- Admin workspace
|
||||
|
||||
### Control Plane
|
||||
|
||||
- Identity and authorization
|
||||
- Workspace and project management
|
||||
- Asset and dataset metadata
|
||||
- Workflow definition management
|
||||
- Plugin registry and activation
|
||||
- Run orchestration API
|
||||
- Artifact indexing
|
||||
|
||||
### Execution Plane
|
||||
|
||||
- Workflow DAG compilation
|
||||
- Task queue dispatch
|
||||
- Worker execution
|
||||
- Executor routing
|
||||
- Log and artifact collection
|
||||
|
||||
### Storage Layer
|
||||
|
||||
- MongoDB for metadata and run state
|
||||
- Object storage for files and large outputs
|
||||
- Temporary local working directories for execution
|
||||
|
||||
## Core Domain Objects
|
||||
|
||||
- User
|
||||
- Workspace
|
||||
- Project
|
||||
- Asset
|
||||
- Dataset
|
||||
- DatasetVersion
|
||||
- WorkflowDefinition
|
||||
- WorkflowVersion
|
||||
- WorkflowRun
|
||||
- RunTask
|
||||
- Artifact
|
||||
- AnnotationTask
|
||||
- Annotation
|
||||
- Plugin
|
||||
- StorageConnection
|
||||
|
||||
## Raw Asset And Canonical Dataset Model
|
||||
|
||||
The platform must distinguish between:
|
||||
|
||||
- Raw Asset View
|
||||
- Canonical Dataset View
|
||||
|
||||
Raw assets preserve source structure, file paths, metadata layout, and original naming. Canonical datasets provide a normalized semantic layer for workflow nodes and export logic.
|
||||
|
||||
Visualization may read raw assets directly. Conversion, orchestration, and export should primarily target canonical semantics.
|
||||
|
||||
## Workflow Model
|
||||
|
||||
Workflow definitions are versioned and contain:
|
||||
|
||||
- Visual graph state
|
||||
- Logical node and edge graph
|
||||
- Runtime configuration
|
||||
- Plugin references
|
||||
|
||||
Workflow execution produces immutable workflow runs. A run snapshots:
|
||||
|
||||
- Workflow version
|
||||
- Node configuration
|
||||
- Injected code
|
||||
- Executor settings
|
||||
- Input bindings
|
||||
|
||||
Runs compile into task DAGs.
|
||||
|
||||
## Node And Plugin Model
|
||||
|
||||
### Node Categories
|
||||
|
||||
- Source
|
||||
- Transform
|
||||
- Inspect
|
||||
- Annotate
|
||||
- Export
|
||||
- Utility
|
||||
|
||||
### Node Definition Contract
|
||||
|
||||
Each node definition includes:
|
||||
|
||||
- Metadata
|
||||
- Input schema
|
||||
- Output schema
|
||||
- Config schema
|
||||
- UI schema
|
||||
- Executor type
|
||||
- Runtime limits
|
||||
- Optional code hook contract
|
||||
|
||||
### Plugin Types
|
||||
|
||||
- Node plugins
|
||||
- Reader/writer plugins
|
||||
- Renderer plugins
|
||||
- Executor plugins
|
||||
- Integration plugins
|
||||
|
||||
## Execution Architecture
|
||||
|
||||
### Executors
|
||||
|
||||
- Python executor
|
||||
- Docker executor
|
||||
- HTTP executor
|
||||
|
||||
V1 should prioritize Python and Docker. HTTP executor is useful for integrating external services.
|
||||
|
||||
### Schedulers
|
||||
|
||||
- Local scheduler in V1
|
||||
- Kubernetes scheduler later
|
||||
- Volcano scheduler later
|
||||
|
||||
Executors and schedulers are separate abstractions:
|
||||
|
||||
- Executor defines how logic runs
|
||||
- Scheduler defines where and under what scheduling policy it runs
|
||||
|
||||
## Storage Architecture
|
||||
|
||||
### MongoDB Collections
|
||||
|
||||
Recommended primary collections:
|
||||
|
||||
- users
|
||||
- workspaces
|
||||
- projects
|
||||
- memberships
|
||||
- assets
|
||||
- asset_probe_reports
|
||||
- datasets
|
||||
- dataset_versions
|
||||
- workflow_definitions
|
||||
- workflow_definition_versions
|
||||
- workflow_runs
|
||||
- run_tasks
|
||||
- artifacts
|
||||
- annotation_tasks
|
||||
- annotations
|
||||
- plugins
|
||||
- storage_connections
|
||||
- audit_logs
|
||||
|
||||
### Object Storage Content
|
||||
|
||||
- Raw uploads
|
||||
- Imported archives
|
||||
- Normalized export packages
|
||||
- Training config packages
|
||||
- Preview resources
|
||||
- Logs and attachments
|
||||
- Large manifests and file indexes
|
||||
|
||||
## Security Model
|
||||
|
||||
User-injected code is low-trust code and must not run in web or API processes.
|
||||
|
||||
V1 runtime policy:
|
||||
|
||||
- Built-in trusted nodes may use Python executor
|
||||
- Plugin code should run in controlled runtimes
|
||||
- User-injected code should default to Docker executor
|
||||
- Network access should be denied by default for user code
|
||||
- Input and output paths should be explicitly mounted
|
||||
|
||||
## Deployment Direction
|
||||
|
||||
V1 deployment target is a single public server using containerized application services. The architecture must still preserve future migration to multi-node environments.
|
||||
1
design/03-workflows/.gitkeep
Normal file
1
design/03-workflows/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
316
design/03-workflows/workflow-execution-model.md
Normal file
316
design/03-workflows/workflow-execution-model.md
Normal file
@ -0,0 +1,316 @@
|
||||
# EmboFlow Workflow Execution Model
|
||||
|
||||
## Goal
|
||||
|
||||
Define how EmboFlow represents, validates, executes, and observes canvas workflows.
|
||||
|
||||
The workflow system is the product core. The canvas is only the editing surface. The real system of record is the versioned workflow definition and its immutable run snapshots.
|
||||
|
||||
## Core Objects
|
||||
|
||||
- `WorkflowDefinition`
|
||||
Logical workflow identity under a project
|
||||
- `WorkflowVersion`
|
||||
Immutable snapshot of nodes, edges, runtime defaults, and plugin references
|
||||
- `NodeInstance`
|
||||
Concrete node on a workflow graph
|
||||
- `WorkflowRun`
|
||||
One execution of one workflow version
|
||||
- `RunTask`
|
||||
Executable unit derived from a node during one run
|
||||
- `Artifact`
|
||||
Managed output from a task or run
|
||||
|
||||
## Workflow Layers
|
||||
|
||||
Each workflow version contains three layers.
|
||||
|
||||
### Visual Layer
|
||||
|
||||
Used only by the editor:
|
||||
|
||||
- node positions
|
||||
- collapsed state
|
||||
- groups
|
||||
- zoom defaults
|
||||
- viewport metadata
|
||||
|
||||
### Logic Layer
|
||||
|
||||
Used for graph semantics:
|
||||
|
||||
- nodes
|
||||
- edges
|
||||
- input/output ports
|
||||
- branch conditions
|
||||
- merge semantics
|
||||
- dependency graph
|
||||
|
||||
### Runtime Layer
|
||||
|
||||
Used for execution:
|
||||
|
||||
- node config values
|
||||
- executor settings
|
||||
- runtime resource limits
|
||||
- retry policy
|
||||
- code hooks
|
||||
- cache policy
|
||||
|
||||
Visual changes must not change workflow semantics. Runtime changes must produce a new workflow version.
|
||||
|
||||
## Node Categories
|
||||
|
||||
V1 node categories:
|
||||
|
||||
- `Source`
|
||||
- `Transform`
|
||||
- `Inspect`
|
||||
- `Annotate`
|
||||
- `Export`
|
||||
- `Utility`
|
||||
|
||||
### V1 Built-In Node Families
|
||||
|
||||
- asset upload/import
|
||||
- archive extract
|
||||
- folder rename
|
||||
- directory validation
|
||||
- metadata validation
|
||||
- video quality inspection
|
||||
- dataset readers for RLDS, LeRobot, HDF5, Rosbag
|
||||
- canonical mapping nodes
|
||||
- dataset writers and exporters
|
||||
- training config export
|
||||
- Python processing node
|
||||
|
||||
## Node Definition Contract
|
||||
|
||||
Each node definition must expose:
|
||||
|
||||
- `id`
|
||||
- `name`
|
||||
- `category`
|
||||
- `version`
|
||||
- `description`
|
||||
- `inputSchema`
|
||||
- `outputSchema`
|
||||
- `configSchema`
|
||||
- `uiSchema`
|
||||
- `executorType`
|
||||
- `runtimeDefaults`
|
||||
- `permissions`
|
||||
- `capabilities`
|
||||
- `codeHookSpec`
|
||||
|
||||
### Code Hook Spec
|
||||
|
||||
V1 supports user code hooks only on:
|
||||
|
||||
- `Transform`
|
||||
- `Inspect`
|
||||
- `Utility`
|
||||
|
||||
Hooks must use a constrained entrypoint instead of arbitrary script structure.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
def process(input_data, context):
|
||||
return input_data
|
||||
```
|
||||
|
||||
This keeps serialization, logging, and runtime control predictable.
|
||||
|
||||
## Data Flow Contract
|
||||
|
||||
Tasks should exchange managed references, not loose file paths.
|
||||
|
||||
V1 reference types:
|
||||
|
||||
- `assetRef`
|
||||
- `datasetVersionRef`
|
||||
- `artifactRef`
|
||||
- `annotationTaskRef`
|
||||
- `inlineConfig`
|
||||
|
||||
Executors may materialize files internally, but the platform-level contract must remain reference-based.
|
||||
|
||||
## Validation Stages
|
||||
|
||||
Workflow execution must validate in this order:
|
||||
|
||||
1. workflow version exists
|
||||
2. referenced plugins exist and are enabled
|
||||
3. node schemas are valid
|
||||
4. edge connections are schema-compatible
|
||||
5. runtime configuration is complete
|
||||
6. referenced assets and datasets are accessible
|
||||
7. code hooks pass static validation
|
||||
8. executor and scheduler requirements are satisfiable
|
||||
|
||||
Validation failure must block run creation.
|
||||
|
||||
## Run Lifecycle
|
||||
|
||||
When a user executes a workflow:
|
||||
|
||||
1. resolve workflow version
|
||||
2. snapshot all runtime-relevant inputs
|
||||
3. resolve plugin versions
|
||||
4. freeze node config and code hooks
|
||||
5. compile graph into a DAG
|
||||
6. create `WorkflowRun`
|
||||
7. create `RunTask` entries
|
||||
8. enqueue ready tasks
|
||||
9. collect outputs, logs, and task state
|
||||
10. finalize run status and summary
|
||||
|
||||
## Run State Model
|
||||
|
||||
### WorkflowRun Status
|
||||
|
||||
- `pending`
|
||||
- `queued`
|
||||
- `running`
|
||||
- `success`
|
||||
- `failed`
|
||||
- `cancelled`
|
||||
- `partial_success`
|
||||
|
||||
### RunTask Status
|
||||
|
||||
- `pending`
|
||||
- `queued`
|
||||
- `running`
|
||||
- `success`
|
||||
- `failed`
|
||||
- `cancelled`
|
||||
- `skipped`
|
||||
|
||||
`partial_success` is used for workflows where non-blocking nodes fail but the run still produces valid outputs.
|
||||
|
||||
## Retry And Failure Policy
|
||||
|
||||
Each node instance may define:
|
||||
|
||||
- retry count
|
||||
- retry backoff policy
|
||||
- fail-fast behavior
|
||||
- continue-on-error behavior
|
||||
- manual retry eligibility
|
||||
|
||||
V1 should support:
|
||||
|
||||
- `fail_fast`
|
||||
- `continue_on_error`
|
||||
- `retry_n_times`
|
||||
- `manual_retry`
|
||||
|
||||
## Cache Model
|
||||
|
||||
V1 should support node-level cache reuse.
|
||||
|
||||
Recommended cache key inputs:
|
||||
|
||||
- workflow version
|
||||
- node id
|
||||
- upstream reference summary
|
||||
- config summary
|
||||
- code hook digest
|
||||
- plugin version
|
||||
- executor version
|
||||
|
||||
Cache hit behavior:
|
||||
|
||||
- reuse output artifact refs
|
||||
- reuse output summaries
|
||||
- retain previous logs reference
|
||||
- mark task as cache-resolved in metadata
|
||||
|
||||
## Execution Context
|
||||
|
||||
Each task receives a normalized execution context containing:
|
||||
|
||||
- workspace id
|
||||
- project id
|
||||
- workflow run id
|
||||
- task id
|
||||
- actor id
|
||||
- node config
|
||||
- code hook content
|
||||
- input references
|
||||
- storage context
|
||||
- temp working directory
|
||||
- runtime resource limits
|
||||
|
||||
This context must be available across Python, Docker, and HTTP executors.
|
||||
|
||||
## Observability Requirements
|
||||
|
||||
Each task must emit:
|
||||
|
||||
- status transitions
|
||||
- start time and finish time
|
||||
- duration
|
||||
- executor metadata
|
||||
- resource request metadata
|
||||
- stdout/stderr log stream
|
||||
- structured task summary
|
||||
- artifact refs
|
||||
|
||||
The UI must allow:
|
||||
|
||||
- graph-level run status
|
||||
- node-level log inspection
|
||||
- node-level artifact browsing
|
||||
- task retry entrypoint
|
||||
- direct navigation from a node to preview output
|
||||
|
||||
## Canvas Interaction Rules
|
||||
|
||||
V1 editor behavior should enforce:
|
||||
|
||||
- port-level connection rules
|
||||
- incompatible edge blocking
|
||||
- dirty-state detection
|
||||
- explicit save before publish/run if graph changed
|
||||
- per-node validation badges
|
||||
- run from latest saved version, not unsaved draft
|
||||
|
||||
## Example V1 Pipelines
|
||||
|
||||
### Delivery Normalization
|
||||
|
||||
```text
|
||||
Raw Folder Import
|
||||
-> Archive Extract
|
||||
-> Folder Rename
|
||||
-> Directory Validation
|
||||
-> Metadata Validation
|
||||
-> Video Quality Check
|
||||
-> Delivery Export
|
||||
```
|
||||
|
||||
### Dataset Conversion
|
||||
|
||||
```text
|
||||
Rosbag Reader
|
||||
-> Canonical Mapping
|
||||
-> Frame Filter
|
||||
-> Metadata Normalize
|
||||
-> LeRobot Writer
|
||||
-> Training Config Export
|
||||
```
|
||||
|
||||
## V1 Non-Goals
|
||||
|
||||
The V1 workflow engine does not need:
|
||||
|
||||
- loop semantics
|
||||
- streaming execution
|
||||
- unbounded dynamic fan-out
|
||||
- event-driven triggers
|
||||
- advanced distributed DAG partitioning
|
||||
|
||||
The V1 goal is a stable, observable DAG executor for data engineering workflows.
|
||||
1
design/04-ui-ux/.gitkeep
Normal file
1
design/04-ui-ux/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
296
design/04-ui-ux/information-architecture-and-key-screens.md
Normal file
296
design/04-ui-ux/information-architecture-and-key-screens.md
Normal file
@ -0,0 +1,296 @@
|
||||
# EmboFlow Information Architecture And Key Screens
|
||||
|
||||
## Goal
|
||||
|
||||
Define the primary navigation model, main screens, and key interaction patterns for EmboFlow V1.
|
||||
|
||||
The UI should feel like a serious data workflow product, not a generic low-code canvas. The most important interaction is the relationship between assets, workflows, runs, and outputs.
|
||||
|
||||
## Information Architecture
|
||||
|
||||
Top-level product areas:
|
||||
|
||||
- Workspace switcher
|
||||
- Project selector
|
||||
- Asset Workspace
|
||||
- Canvas Workspace
|
||||
- Explore Workspace
|
||||
- Label Workspace
|
||||
- Admin Workspace
|
||||
|
||||
## Navigation Model
|
||||
|
||||
### Global Header
|
||||
|
||||
Recommended global header content:
|
||||
|
||||
- workspace switcher
|
||||
- project switcher
|
||||
- search entry
|
||||
- run notifications
|
||||
- user menu
|
||||
|
||||
### Primary Sidebar
|
||||
|
||||
Recommended primary navigation:
|
||||
|
||||
- Assets
|
||||
- Workflows
|
||||
- Runs
|
||||
- Explore
|
||||
- Labels
|
||||
- Admin
|
||||
|
||||
This keeps the product model explicit:
|
||||
|
||||
- assets are inputs
|
||||
- workflows define transformation logic
|
||||
- runs represent execution history
|
||||
- explore is where users inspect outputs and raw inputs
|
||||
|
||||
## Screen 1: Workspace And Project Entry
|
||||
|
||||
Purpose:
|
||||
|
||||
- choose personal or team workspace
|
||||
- choose or create project
|
||||
- view recent projects and recent workflow runs
|
||||
|
||||
V1 should emphasize project-level organization because all major resources are project-scoped.
|
||||
|
||||
## Screen 2: Asset Workspace
|
||||
|
||||
Purpose:
|
||||
|
||||
- upload or import raw assets
|
||||
- inspect asset type and status
|
||||
- review probe summary
|
||||
- launch preview or workflow entrypoint
|
||||
|
||||
Core regions:
|
||||
|
||||
- asset list with filters
|
||||
- import actions
|
||||
- asset status and source type
|
||||
- probe summary card
|
||||
- recommended next actions
|
||||
|
||||
Key actions:
|
||||
|
||||
- upload file
|
||||
- upload archive
|
||||
- import object storage prefix
|
||||
- register storage path
|
||||
- open preview
|
||||
- create workflow from asset
|
||||
|
||||
## Screen 3: Asset Detail / Explore Entry
|
||||
|
||||
Purpose:
|
||||
|
||||
- inspect one asset deeply
|
||||
- browse folder structure
|
||||
- inspect metadata and detected format
|
||||
- preview representative files
|
||||
|
||||
Suggested panels:
|
||||
|
||||
- left: file tree or asset structure
|
||||
- center: preview surface
|
||||
- right: metadata, probe report, warnings, recommended nodes
|
||||
|
||||
This screen should support both:
|
||||
|
||||
- raw asset view
|
||||
- canonical dataset summary view when available
|
||||
|
||||
## Screen 4: Canvas Workspace
|
||||
|
||||
This is the core authoring surface.
|
||||
|
||||
### Layout
|
||||
|
||||
Recommended layout, aligned with the Xspark reference pattern:
|
||||
|
||||
- left: node library and workflow tools
|
||||
- center: canvas
|
||||
- right: node configuration panel
|
||||
|
||||
### Left Panel
|
||||
|
||||
Contains:
|
||||
|
||||
- source nodes
|
||||
- transform nodes
|
||||
- inspect nodes
|
||||
- annotate nodes
|
||||
- export nodes
|
||||
- utility nodes
|
||||
- search/filter
|
||||
|
||||
### Center Canvas
|
||||
|
||||
Supports:
|
||||
|
||||
- drag-and-drop node placement
|
||||
- edge creation
|
||||
- zoom and pan
|
||||
- mini-map
|
||||
- node badges for validation status
|
||||
- run-state overlays when viewing an executed version
|
||||
|
||||
### Right Configuration Panel
|
||||
|
||||
The right panel is schema-driven.
|
||||
|
||||
It should render:
|
||||
|
||||
- node title
|
||||
- node description
|
||||
- config fields
|
||||
- input/output schema summary
|
||||
- executor selection
|
||||
- runtime policy
|
||||
- code hook editor if supported
|
||||
- validation errors
|
||||
|
||||
This panel is critical. It should feel like a structured system console, not a generic form dump.
|
||||
|
||||
## Screen 5: Workflow Run Detail
|
||||
|
||||
Purpose:
|
||||
|
||||
- inspect execution state
|
||||
- view DAG progress
|
||||
- open task logs
|
||||
- inspect task outputs
|
||||
- retry failed nodes
|
||||
|
||||
Recommended layout:
|
||||
|
||||
- top: run summary and status
|
||||
- center: workflow graph with execution overlays
|
||||
- bottom or side drawer: logs and artifacts for selected node
|
||||
|
||||
## Screen 6: Explore Workspace
|
||||
|
||||
Purpose:
|
||||
|
||||
- inspect raw or processed outputs outside the canvas authoring context
|
||||
- compare source and transformed outputs
|
||||
- validate whether a run produced expected results
|
||||
|
||||
V1 renderer set:
|
||||
|
||||
- directory tree renderer
|
||||
- JSON renderer
|
||||
- video renderer
|
||||
- dataset summary renderer
|
||||
- quality report renderer
|
||||
|
||||
This workspace should open from:
|
||||
|
||||
- asset detail
|
||||
- workflow node output
|
||||
- artifact detail
|
||||
|
||||
## Screen 7: Label Workspace
|
||||
|
||||
Purpose:
|
||||
|
||||
- process annotation tasks
|
||||
- review results
|
||||
- attach annotations to data outputs
|
||||
|
||||
V1 should keep this lightweight:
|
||||
|
||||
- frame labels
|
||||
- clip labels
|
||||
- temporal segment labels
|
||||
- quality tags
|
||||
|
||||
The label workspace should be able to open from an artifact or dataset version, not only from a workflow node.
|
||||
|
||||
## Screen 8: Admin Workspace
|
||||
|
||||
Purpose:
|
||||
|
||||
- manage members
|
||||
- manage storage connections
|
||||
- manage plugin enablement
|
||||
- inspect audit and runtime settings
|
||||
|
||||
Suggested sections:
|
||||
|
||||
- members and roles
|
||||
- workspace settings
|
||||
- storage connections
|
||||
- plugin registry
|
||||
- executor policies
|
||||
- audit log viewer
|
||||
|
||||
## Key UX Principles
|
||||
|
||||
### 1. Separate authoring from inspection
|
||||
|
||||
Do not overload the canvas with deep preview or annotation workflows. The canvas configures process. Explore and Label workspaces handle dense interaction.
|
||||
|
||||
### 2. Keep lineage visible
|
||||
|
||||
Users should be able to move across:
|
||||
|
||||
- asset
|
||||
- workflow
|
||||
- run
|
||||
- task
|
||||
- artifact
|
||||
- annotation
|
||||
|
||||
without losing context.
|
||||
|
||||
### 3. Prefer explicit system terminology
|
||||
|
||||
Use consistent object names in the UI:
|
||||
|
||||
- Asset
|
||||
- Dataset
|
||||
- Workflow
|
||||
- Run
|
||||
- Task
|
||||
- Artifact
|
||||
- Plugin
|
||||
|
||||
Do not name the same concept differently across pages.
|
||||
|
||||
### 4. Make validation obvious before execution
|
||||
|
||||
Before users run a workflow, the editor should visibly show:
|
||||
|
||||
- missing config
|
||||
- invalid schema connections
|
||||
- unsupported executor choices
|
||||
- permission or plugin issues
|
||||
|
||||
### 5. Keep the product usable on standard screens
|
||||
|
||||
The canvas and right configuration panel must work on laptop-sized displays. On narrower screens, the right panel may collapse into a drawer.
|
||||
|
||||
## V1 Visual Direction
|
||||
|
||||
The UI should communicate:
|
||||
|
||||
- precision
|
||||
- observability
|
||||
- traceability
|
||||
- strong operator control
|
||||
|
||||
It should feel closer to a workflow control console than a consumer productivity app.
|
||||
|
||||
## V1 Non-Goals
|
||||
|
||||
V1 UI does not need:
|
||||
|
||||
- real-time multi-user cursor collaboration
|
||||
- advanced canvas commenting systems
|
||||
- highly customized renderer marketplace UX
|
||||
- heavy design polish ahead of workflow clarity
|
||||
1
design/05-data/.gitkeep
Normal file
1
design/05-data/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
521
design/05-data/mongodb-data-model.md
Normal file
521
design/05-data/mongodb-data-model.md
Normal file
@ -0,0 +1,521 @@
|
||||
# EmboFlow MongoDB Data Model
|
||||
|
||||
## Goal
|
||||
|
||||
Define the MongoDB-only persistence model for EmboFlow V1.
|
||||
|
||||
The database must support:
|
||||
|
||||
- user and workspace isolation
|
||||
- raw asset tracking
|
||||
- canonical dataset versions
|
||||
- workflow versioning
|
||||
- workflow execution history
|
||||
- plugin registration
|
||||
- auditability
|
||||
|
||||
## Storage Principles
|
||||
|
||||
- MongoDB stores metadata and execution state
|
||||
- Object storage stores large binary files and large derived bundles
|
||||
- MongoDB documents should have clear aggregate boundaries
|
||||
- Large, fast-growing arrays should be split into separate collections
|
||||
- Platform contracts should use references, not embedded file blobs
|
||||
|
||||
## Primary Collections
|
||||
|
||||
- `users`
|
||||
- `workspaces`
|
||||
- `projects`
|
||||
- `memberships`
|
||||
- `assets`
|
||||
- `asset_probe_reports`
|
||||
- `datasets`
|
||||
- `dataset_versions`
|
||||
- `workflow_definitions`
|
||||
- `workflow_definition_versions`
|
||||
- `workflow_runs`
|
||||
- `run_tasks`
|
||||
- `artifacts`
|
||||
- `annotation_tasks`
|
||||
- `annotations`
|
||||
- `plugins`
|
||||
- `storage_connections`
|
||||
- `audit_logs`
|
||||
|
||||
## Collection Design
|
||||
|
||||
### users
|
||||
|
||||
Purpose:
|
||||
|
||||
- account identity
|
||||
- profile
|
||||
- login metadata
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `email`
|
||||
- `displayName`
|
||||
- `avatarUrl`
|
||||
- `status`
|
||||
- `lastLoginAt`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
### workspaces
|
||||
|
||||
Purpose:
|
||||
|
||||
- resource ownership boundary
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `type` as `personal` or `team`
|
||||
- `name`
|
||||
- `slug`
|
||||
- `ownerId`
|
||||
- `status`
|
||||
- `settings`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
### memberships
|
||||
|
||||
Purpose:
|
||||
|
||||
- workspace and project role mapping
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workspaceId`
|
||||
- `projectId` optional
|
||||
- `userId`
|
||||
- `role`
|
||||
- `status`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
Keep this collection independent rather than embedding large member arrays in every resource document.
|
||||
|
||||
### projects
|
||||
|
||||
Purpose:
|
||||
|
||||
- project-scoped grouping for assets, workflows, runs, and outputs
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workspaceId`
|
||||
- `name`
|
||||
- `slug`
|
||||
- `description`
|
||||
- `status`
|
||||
- `createdBy`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
### assets
|
||||
|
||||
Purpose:
|
||||
|
||||
- represent raw uploaded or imported inputs
|
||||
|
||||
Supported asset types:
|
||||
|
||||
- `raw_file`
|
||||
- `archive`
|
||||
- `folder`
|
||||
- `video_collection`
|
||||
- `standard_dataset`
|
||||
- `rosbag`
|
||||
- `hdf5_dataset`
|
||||
- `object_storage_prefix`
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `type`
|
||||
- `sourceType`
|
||||
- `displayName`
|
||||
- `status`
|
||||
- `storageRef`
|
||||
- `sizeBytes`
|
||||
- `fileCount`
|
||||
- `topLevelPaths`
|
||||
- `detectedFormats`
|
||||
- `summary`
|
||||
- `createdBy`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
Do not embed full large file listings in this document.
|
||||
|
||||
### asset_probe_reports
|
||||
|
||||
Purpose:
|
||||
|
||||
- retain richer structure-detection and validation output
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `assetId`
|
||||
- `reportVersion`
|
||||
- `detectedFormatCandidates`
|
||||
- `structureSummary`
|
||||
- `warnings`
|
||||
- `recommendedNextNodes`
|
||||
- `rawReport`
|
||||
- `createdAt`
|
||||
|
||||
### datasets
|
||||
|
||||
Purpose:
|
||||
|
||||
- represent logical dataset identity
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `name`
|
||||
- `type`
|
||||
- `status`
|
||||
- `latestVersionId`
|
||||
- `summary`
|
||||
- `createdBy`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
### dataset_versions
|
||||
|
||||
Purpose:
|
||||
|
||||
- represent immutable dataset snapshots
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `datasetId`
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `sourceAssetId`
|
||||
- `parentVersionId`
|
||||
- `versionTag`
|
||||
- `canonicalSchemaVersion`
|
||||
- `manifestRef`
|
||||
- `stats`
|
||||
- `summary`
|
||||
- `status`
|
||||
- `createdBy`
|
||||
- `createdAt`
|
||||
|
||||
This collection is separated because versions will grow over time.
|
||||
|
||||
### workflow_definitions
|
||||
|
||||
Purpose:
|
||||
|
||||
- represent logical workflow identity
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `name`
|
||||
- `slug`
|
||||
- `status`
|
||||
- `latestVersionNumber`
|
||||
- `publishedVersionNumber`
|
||||
- `createdBy`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
### workflow_definition_versions
|
||||
|
||||
Purpose:
|
||||
|
||||
- represent immutable workflow snapshots
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workflowDefinitionId`
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `versionNumber`
|
||||
- `visualGraph`
|
||||
- `logicGraph`
|
||||
- `runtimeGraph`
|
||||
- `pluginRefs`
|
||||
- `summary`
|
||||
- `createdBy`
|
||||
- `createdAt`
|
||||
|
||||
Splitting versions from workflow head metadata avoids oversized documents and simplifies history queries.
|
||||
|
||||
### workflow_runs
|
||||
|
||||
Purpose:
|
||||
|
||||
- store execution runs
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workflowDefinitionId`
|
||||
- `workflowVersionId`
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `triggeredBy`
|
||||
- `status`
|
||||
- `runtimeSnapshot`
|
||||
- `summary`
|
||||
- `startedAt`
|
||||
- `finishedAt`
|
||||
- `createdAt`
|
||||
|
||||
### run_tasks
|
||||
|
||||
Purpose:
|
||||
|
||||
- store one execution unit per node per run
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workflowRunId`
|
||||
- `workflowVersionId`
|
||||
- `nodeId`
|
||||
- `nodeType`
|
||||
- `status`
|
||||
- `attempt`
|
||||
- `executor`
|
||||
- `scheduler`
|
||||
- `inputRefs`
|
||||
- `outputRefs`
|
||||
- `logRef`
|
||||
- `cacheKey`
|
||||
- `cacheHit`
|
||||
- `errorSummary`
|
||||
- `startedAt`
|
||||
- `finishedAt`
|
||||
- `createdAt`
|
||||
|
||||
This collection should remain separate from `workflow_runs` because task volume grows quickly.
|
||||
|
||||
### artifacts
|
||||
|
||||
Purpose:
|
||||
|
||||
- store managed outputs and previews
|
||||
|
||||
Artifact types may include:
|
||||
|
||||
- preview bundle
|
||||
- quality report
|
||||
- normalized dataset package
|
||||
- delivery package
|
||||
- training config package
|
||||
- intermediate task output
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `type`
|
||||
- `producerType`
|
||||
- `producerId`
|
||||
- `storageRef`
|
||||
- `previewable`
|
||||
- `summary`
|
||||
- `lineage`
|
||||
- `createdBy`
|
||||
- `createdAt`
|
||||
|
||||
### annotation_tasks
|
||||
|
||||
Purpose:
|
||||
|
||||
- track assignment and state of manual labeling work
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `targetType`
|
||||
- `targetRef`
|
||||
- `labelType`
|
||||
- `status`
|
||||
- `assigneeIds`
|
||||
- `reviewerIds`
|
||||
- `createdBy`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
### annotations
|
||||
|
||||
Purpose:
|
||||
|
||||
- persist annotation outputs
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `annotationTaskId`
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `targetRef`
|
||||
- `payload`
|
||||
- `status`
|
||||
- `createdBy`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
### plugins
|
||||
|
||||
Purpose:
|
||||
|
||||
- track installable and enabled plugin versions
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workspaceId` optional for workspace-scoped plugins
|
||||
- `scope` as `platform` or `workspace`
|
||||
- `name`
|
||||
- `status`
|
||||
- `currentVersion`
|
||||
- `versions`
|
||||
- `permissions`
|
||||
- `metadata`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
If plugin version payloads become large, split versions into a separate collection later. V1 can keep them nested if bounded.
|
||||
|
||||
### storage_connections
|
||||
|
||||
Purpose:
|
||||
|
||||
- store object storage and path registration configuration
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workspaceId`
|
||||
- `type`
|
||||
- `provider`
|
||||
- `name`
|
||||
- `status`
|
||||
- `config`
|
||||
- `secretRef`
|
||||
- `createdBy`
|
||||
- `createdAt`
|
||||
- `updatedAt`
|
||||
|
||||
Store secrets outside plaintext document fields where possible.
|
||||
|
||||
### audit_logs
|
||||
|
||||
Purpose:
|
||||
|
||||
- append-only history of sensitive actions
|
||||
|
||||
Core fields:
|
||||
|
||||
- `_id`
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `actorId`
|
||||
- `resourceType`
|
||||
- `resourceId`
|
||||
- `action`
|
||||
- `beforeSummary`
|
||||
- `afterSummary`
|
||||
- `metadata`
|
||||
- `createdAt`
|
||||
|
||||
## Reference Strategy
|
||||
|
||||
Use stable ids between collections.
|
||||
|
||||
References should be explicit:
|
||||
|
||||
- asset to probe report
|
||||
- dataset to dataset versions
|
||||
- workflow definition to workflow versions
|
||||
- workflow run to run tasks
|
||||
- task to artifact
|
||||
- annotation task to annotations
|
||||
|
||||
Do not depend on implicit path-based linkage.
|
||||
|
||||
## Index Recommendations
|
||||
|
||||
### Always index
|
||||
|
||||
- `workspaceId`
|
||||
- `projectId`
|
||||
- `status`
|
||||
- `createdAt`
|
||||
|
||||
### Important compound indexes
|
||||
|
||||
- `memberships.workspaceId + memberships.userId`
|
||||
- `projects.workspaceId + projects.slug`
|
||||
- `assets.projectId + assets.type + assets.createdAt`
|
||||
- `datasets.projectId + datasets.name`
|
||||
- `dataset_versions.datasetId + dataset_versions.createdAt`
|
||||
- `workflow_definitions.projectId + workflow_definitions.slug`
|
||||
- `workflow_definition_versions.workflowDefinitionId + versionNumber`
|
||||
- `workflow_runs.projectId + createdAt`
|
||||
- `workflow_runs.workflowDefinitionId + status`
|
||||
- `run_tasks.workflowRunId + nodeId`
|
||||
- `artifacts.producerType + producerId`
|
||||
- `annotation_tasks.projectId + status`
|
||||
- `audit_logs.workspaceId + createdAt`
|
||||
|
||||
## Object Storage References
|
||||
|
||||
MongoDB should store references such as:
|
||||
|
||||
- bucket
|
||||
- key
|
||||
- uri
|
||||
- checksum
|
||||
- content type
|
||||
- size
|
||||
|
||||
It should not store:
|
||||
|
||||
- large binary file payloads
|
||||
- full raw video content
|
||||
- giant archive contents
|
||||
|
||||
## V1 Constraints
|
||||
|
||||
- MongoDB is the only database
|
||||
- No relational sidecar is assumed
|
||||
- No GridFS-first strategy is assumed
|
||||
- Large manifests may live in object storage and be referenced from MongoDB
|
||||
|
||||
## V1 Non-Goals
|
||||
|
||||
The V1 model does not need:
|
||||
|
||||
- cross-region data distribution
|
||||
- advanced event sourcing
|
||||
- fully normalized analytics warehouse modeling
|
||||
- high-volume search indexing inside MongoDB itself
|
||||
1
design/06-api/.gitkeep
Normal file
1
design/06-api/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
1
design/07-research/.gitkeep
Normal file
1
design/07-research/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
1
design/08-decisions/.gitkeep
Normal file
1
design/08-decisions/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
@ -0,0 +1,45 @@
|
||||
# ADR-0001: Separate Raw Assets From Canonical Datasets
|
||||
|
||||
## Status
|
||||
|
||||
Accepted
|
||||
|
||||
## Context
|
||||
|
||||
EmboFlow must support both structured embodied dataset formats and unstructured or semi-structured delivery-style raw assets, including:
|
||||
|
||||
- RLDS
|
||||
- LeRobot v2/v3
|
||||
- HDF5
|
||||
- Rosbag
|
||||
- Raw video directories
|
||||
- Archive packages
|
||||
|
||||
If the platform treats every input as an already-standardized dataset, ingestion and delivery workflows become awkward and lossy.
|
||||
|
||||
## Decision
|
||||
|
||||
The platform will model:
|
||||
|
||||
- Raw assets as first-class resources
|
||||
- Canonical datasets as derived semantic resources
|
||||
|
||||
Raw assets preserve original structure, paths, naming, and metadata layout. Canonical datasets provide normalized semantics for conversion, workflow execution, and export logic.
|
||||
|
||||
## Consequences
|
||||
|
||||
### Positive
|
||||
|
||||
- Supports customer delivery package workflows
|
||||
- Supports embodied dataset conversion workflows
|
||||
- Preserves original structure for inspection and debugging
|
||||
- Avoids forcing visualization to depend on a lossy normalized format
|
||||
|
||||
### Negative
|
||||
|
||||
- Adds one more layer to the object model
|
||||
- Requires readers and mappers instead of direct format-to-format conversion
|
||||
|
||||
## Notes
|
||||
|
||||
Visualization may operate on raw assets directly. Processing and export should primarily operate on canonical semantics where possible.
|
||||
@ -0,0 +1,56 @@
|
||||
# ADR-0002: Separate Executors From Schedulers
|
||||
|
||||
## Status
|
||||
|
||||
Accepted
|
||||
|
||||
## Context
|
||||
|
||||
EmboFlow needs to support multiple runtime modes now and later:
|
||||
|
||||
- direct Python execution
|
||||
- Docker-isolated execution
|
||||
- HTTP-based execution
|
||||
- local scheduling
|
||||
- future Kubernetes scheduling
|
||||
- future Volcano scheduling
|
||||
|
||||
If execution logic and scheduling logic are coupled together, migration from single-host operation to cluster operation becomes costly.
|
||||
|
||||
## Decision
|
||||
|
||||
The architecture will separate:
|
||||
|
||||
- Executor: how node logic runs
|
||||
- Scheduler: where and under what dispatch policy tasks run
|
||||
|
||||
V1 executors:
|
||||
|
||||
- Python
|
||||
- Docker
|
||||
- HTTP
|
||||
|
||||
V1 scheduler:
|
||||
|
||||
- Local
|
||||
|
||||
Reserved future schedulers:
|
||||
|
||||
- Kubernetes
|
||||
- Volcano
|
||||
|
||||
## Consequences
|
||||
|
||||
### Positive
|
||||
|
||||
- Cleaner evolution path
|
||||
- Better runtime abstraction
|
||||
- Less refactoring required for cluster migration
|
||||
|
||||
### Negative
|
||||
|
||||
- Slightly more abstraction in V1 than the immediate deployment requires
|
||||
|
||||
## Notes
|
||||
|
||||
User-injected code should default to Docker execution, while trusted platform logic may use Python execution.
|
||||
1
design/09-assets/.gitkeep
Normal file
1
design/09-assets/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
21
design/README.md
Normal file
21
design/README.md
Normal file
@ -0,0 +1,21 @@
|
||||
# EmboFlow Design Workspace
|
||||
|
||||
This directory stores project design materials before or alongside implementation.
|
||||
|
||||
## Structure
|
||||
|
||||
- `00-overview`: project goals, scope, milestones
|
||||
- `01-product`: requirements, user stories, feature definitions
|
||||
- `02-architecture`: system architecture, modules, technical constraints
|
||||
- `03-workflows`: business flows, sequence diagrams, operational flows
|
||||
- `04-ui-ux`: wireframes, interaction notes, UX decisions
|
||||
- `05-data`: data model, entities, schema drafts
|
||||
- `06-api`: API contracts, request/response drafts, integration notes
|
||||
- `07-research`: competitive analysis, references, discovery notes
|
||||
- `08-decisions`: ADRs and major tradeoff records
|
||||
- `09-assets`: diagrams, exported images, attachments
|
||||
- `templates`: reusable design document templates
|
||||
|
||||
## Suggested usage
|
||||
|
||||
Keep design artifacts in Markdown where possible so they diff cleanly in Git.
|
||||
1
design/templates/.gitkeep
Normal file
1
design/templates/.gitkeep
Normal file
@ -0,0 +1 @@
|
||||
|
||||
62
docker-compose.yml
Normal file
62
docker-compose.yml
Normal file
@ -0,0 +1,62 @@
|
||||
services:
|
||||
web:
|
||||
image: node:20-alpine
|
||||
working_dir: /workspace
|
||||
command: ["sh", "-c", "sleep infinity"]
|
||||
ports:
|
||||
- "${WEB_PORT:-3000}:3000"
|
||||
volumes:
|
||||
- .:/workspace
|
||||
depends_on:
|
||||
- api
|
||||
|
||||
api:
|
||||
image: node:20-alpine
|
||||
working_dir: /workspace
|
||||
command: ["sh", "-c", "sleep infinity"]
|
||||
ports:
|
||||
- "${API_PORT:-3001}:3001"
|
||||
volumes:
|
||||
- .:/workspace
|
||||
depends_on:
|
||||
- mongo
|
||||
|
||||
worker:
|
||||
image: node:20-alpine
|
||||
working_dir: /workspace
|
||||
command: ["sh", "-c", "sleep infinity"]
|
||||
ports:
|
||||
- "${WORKER_PORT:-3002}:3002"
|
||||
volumes:
|
||||
- .:/workspace
|
||||
depends_on:
|
||||
- mongo
|
||||
- minio
|
||||
|
||||
mongo:
|
||||
image: mongo:7
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "${MONGO_PORT:-27017}:27017"
|
||||
environment:
|
||||
MONGO_INITDB_ROOT_USERNAME: "${MONGO_ROOT_USERNAME:-emboflow}"
|
||||
MONGO_INITDB_ROOT_PASSWORD: "${MONGO_ROOT_PASSWORD:-emboflow}"
|
||||
volumes:
|
||||
- mongo-data:/data/db
|
||||
|
||||
minio:
|
||||
image: minio/minio:RELEASE.2024-10-29T16-01-48Z
|
||||
restart: unless-stopped
|
||||
command: ["server", "/data", "--console-address", ":9001"]
|
||||
ports:
|
||||
- "${MINIO_PORT:-9000}:9000"
|
||||
- "${MINIO_CONSOLE_PORT:-9001}:9001"
|
||||
environment:
|
||||
MINIO_ROOT_USER: "${MINIO_ROOT_USER:-emboflow}"
|
||||
MINIO_ROOT_PASSWORD: "${MINIO_ROOT_PASSWORD:-emboflow123}"
|
||||
volumes:
|
||||
- minio-data:/data
|
||||
|
||||
volumes:
|
||||
mongo-data:
|
||||
minio-data:
|
||||
96
docs/development-workflow.md
Normal file
96
docs/development-workflow.md
Normal file
@ -0,0 +1,96 @@
|
||||
# EmboFlow Development Workflow
|
||||
|
||||
## Goal
|
||||
|
||||
Keep repository design artifacts and implementation changes aligned as EmboFlow evolves.
|
||||
|
||||
## Working Agreement
|
||||
|
||||
EmboFlow is being developed from explicit design documents under `design/`. Development should follow a doc-aware workflow instead of letting code drift ahead without recorded decisions.
|
||||
|
||||
## Standard Change Flow
|
||||
|
||||
### 1. Read Before Editing
|
||||
|
||||
Before changing code, review the design files that define the affected area:
|
||||
|
||||
- product scope
|
||||
- architecture boundaries
|
||||
- workflow model
|
||||
- data model
|
||||
- deployment model
|
||||
- accepted ADRs
|
||||
|
||||
### 2. Identify Impact
|
||||
|
||||
Decide whether the change affects:
|
||||
|
||||
- product behavior
|
||||
- object model
|
||||
- workflow/run/task semantics
|
||||
- node or plugin contract
|
||||
- storage assumptions
|
||||
- user or permission behavior
|
||||
- deployment/runtime assumptions
|
||||
|
||||
If yes, the matching design files must be updated.
|
||||
|
||||
### 3. Change Code And Docs Together
|
||||
|
||||
Do not defer the design update. Treat design edits as part of the implementation, not follow-up cleanup.
|
||||
|
||||
### 4. Run The Consistency Check
|
||||
|
||||
From the repo root:
|
||||
|
||||
```bash
|
||||
python3 scripts/check_doc_code_sync.py . --strict
|
||||
```
|
||||
|
||||
Interpret warnings manually. The script is a guardrail, not a replacement for judgment.
|
||||
|
||||
### 5. Use The Local Hooks
|
||||
|
||||
Install local hooks once per clone:
|
||||
|
||||
```bash
|
||||
bash scripts/install_hooks.sh
|
||||
```
|
||||
|
||||
This enables:
|
||||
|
||||
- `commit-msg`: require English-only gitmoji commit messages
|
||||
- `pre-commit`: block staged code/config drift without doc updates
|
||||
- `pre-push`: run commit-message validation, doc/code sync checks, and repository tests
|
||||
|
||||
### 6. Close With Explicit Status
|
||||
|
||||
Every implementation summary should state one of:
|
||||
|
||||
- `Aligned`
|
||||
- `Partially aligned`
|
||||
- `Doc-first`
|
||||
|
||||
and name the exact design files that were reviewed or updated.
|
||||
|
||||
## EmboFlow-Specific Review Checklist
|
||||
|
||||
Before closing a non-trivial change, confirm whether any of these need updates:
|
||||
|
||||
- raw asset vs canonical dataset model
|
||||
- workflow definition vs workflow run model
|
||||
- node schema and plugin contract
|
||||
- executor vs scheduler separation
|
||||
- MongoDB collection or document shape
|
||||
- workspace/project/user boundary
|
||||
- deployment topology or storage assumptions
|
||||
|
||||
## Automation
|
||||
|
||||
This repository now uses both local and remote guardrails:
|
||||
|
||||
- local git hooks from `.githooks/`
|
||||
- commit message validation
|
||||
- CI checks in `.github/workflows/guardrails.yml`
|
||||
|
||||
These checks are intended to keep design documents, code changes, and commit history coherent.
|
||||
621
docs/plans/2026-03-26-emboflow-v1-foundation-and-mvp.md
Normal file
621
docs/plans/2026-03-26-emboflow-v1-foundation-and-mvp.md
Normal file
@ -0,0 +1,621 @@
|
||||
# EmboFlow V1 Foundation And MVP Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Build the first usable EmboFlow increment: workspace-aware raw asset ingestion, workflow definition/versioning, local workflow execution, and the first web workflow authoring surfaces.
|
||||
|
||||
**Architecture:** Use a TypeScript monorepo with a React web app, a Node.js API control plane, and a separate Node.js worker. Use MongoDB as the only database, object storage abstraction for cloud storage or MinIO, and a local scheduler with Python and Docker executor contracts.
|
||||
|
||||
**Tech Stack:** pnpm workspace, React, TypeScript, React Flow, NestJS, Mongoose, MongoDB, Docker Compose, Python runtime hooks, unittest/Vitest/Jest-compatible project tests
|
||||
|
||||
---
|
||||
|
||||
### Task 1: Bootstrap The Monorepo And Runtime Skeleton
|
||||
|
||||
**Files:**
|
||||
- Create: `package.json`
|
||||
- Create: `pnpm-workspace.yaml`
|
||||
- Create: `tsconfig.base.json`
|
||||
- Create: `apps/web/package.json`
|
||||
- Create: `apps/api/package.json`
|
||||
- Create: `apps/worker/package.json`
|
||||
- Create: `docker-compose.yml`
|
||||
- Create: `.env.example`
|
||||
- Test: `tests/test_repo_structure.py`
|
||||
|
||||
**Step 1: Write the failing test**
|
||||
|
||||
Create `tests/test_repo_structure.py` to assert the repository contains the expected top-level app folders and root workspace files.
|
||||
|
||||
**Step 2: Run test to verify it fails**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
python3 -m unittest tests/test_repo_structure.py -v
|
||||
```
|
||||
|
||||
Expected: FAIL because the monorepo files and app folders do not exist yet.
|
||||
|
||||
**Step 3: Write minimal implementation**
|
||||
|
||||
Create the pnpm workspace root, app package manifests, root TypeScript config, `.env.example`, and `docker-compose.yml` with services for:
|
||||
|
||||
- `web`
|
||||
- `api`
|
||||
- `worker`
|
||||
- `mongo`
|
||||
- `minio`
|
||||
|
||||
Keep the first version minimal. Do not add extra infra services that are not required by the design.
|
||||
|
||||
**Step 4: Run test to verify it passes**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
python3 -m unittest tests/test_repo_structure.py -v
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add package.json pnpm-workspace.yaml tsconfig.base.json apps docker-compose.yml .env.example tests/test_repo_structure.py
|
||||
git commit -m ":tada: bootstrap workspace and runtime skeleton"
|
||||
```
|
||||
|
||||
### Task 2: Create Shared Domain Contracts And Mongo Setup
|
||||
|
||||
**Files:**
|
||||
- Create: `packages/contracts/package.json`
|
||||
- Create: `packages/contracts/src/domain.ts`
|
||||
- Create: `apps/api/src/common/mongo/mongo.module.ts`
|
||||
- Create: `apps/api/src/common/mongo/schemas/workspace.schema.ts`
|
||||
- Create: `apps/api/src/common/mongo/schemas/project.schema.ts`
|
||||
- Create: `apps/api/src/common/mongo/schemas/asset.schema.ts`
|
||||
- Create: `apps/api/src/common/mongo/schemas/workflow.schema.ts`
|
||||
- Test: `apps/api/test/domain-contracts.spec.ts`
|
||||
|
||||
**Step 1: Write the failing test**
|
||||
|
||||
Create `apps/api/test/domain-contracts.spec.ts` asserting:
|
||||
|
||||
- workspace types include `personal` and `team`
|
||||
- asset types include raw and dataset-style sources
|
||||
- workflow status values match the design docs
|
||||
|
||||
**Step 2: Run test to verify it fails**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter api test domain-contracts.spec.ts
|
||||
```
|
||||
|
||||
Expected: FAIL because contracts and schemas are missing.
|
||||
|
||||
**Step 3: Write minimal implementation**
|
||||
|
||||
Create shared domain enums and base Mongo schema definitions for:
|
||||
|
||||
- workspaces
|
||||
- projects
|
||||
- assets
|
||||
- workflow definitions
|
||||
|
||||
Add a minimal Mongo module in the API app using environment-based connection config.
|
||||
|
||||
**Step 4: Run test to verify it passes**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter api test domain-contracts.spec.ts
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add packages/contracts apps/api/src/common apps/api/test/domain-contracts.spec.ts
|
||||
git commit -m ":sparkles: add shared domain contracts and mongo setup"
|
||||
```
|
||||
|
||||
### Task 3: Implement Identity, Workspace, And Project APIs
|
||||
|
||||
**Files:**
|
||||
- Create: `apps/api/src/modules/auth/auth.module.ts`
|
||||
- Create: `apps/api/src/modules/auth/auth.controller.ts`
|
||||
- Create: `apps/api/src/modules/workspaces/workspaces.module.ts`
|
||||
- Create: `apps/api/src/modules/workspaces/workspaces.controller.ts`
|
||||
- Create: `apps/api/src/modules/projects/projects.module.ts`
|
||||
- Create: `apps/api/src/modules/projects/projects.controller.ts`
|
||||
- Create: `apps/api/src/modules/projects/projects.service.ts`
|
||||
- Test: `apps/api/test/projects.e2e-spec.ts`
|
||||
|
||||
**Step 1: Write the failing test**
|
||||
|
||||
Create `apps/api/test/projects.e2e-spec.ts` covering:
|
||||
|
||||
- create personal workspace bootstrap flow
|
||||
- create project under a workspace
|
||||
- reject project creation without a workspace id
|
||||
|
||||
**Step 2: Run test to verify it fails**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter api test projects.e2e-spec.ts
|
||||
```
|
||||
|
||||
Expected: FAIL because the modules and endpoints do not exist yet.
|
||||
|
||||
**Step 3: Write minimal implementation**
|
||||
|
||||
Implement:
|
||||
|
||||
- development-safe auth stub or local auth module
|
||||
- workspace creation and listing
|
||||
- project creation and listing
|
||||
- basic membership checks sufficient for V1 local development
|
||||
|
||||
Do not build a full production auth stack before the API shape is stable.
|
||||
|
||||
**Step 4: Run test to verify it passes**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter api test projects.e2e-spec.ts
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add apps/api/src/modules/auth apps/api/src/modules/workspaces apps/api/src/modules/projects apps/api/test/projects.e2e-spec.ts
|
||||
git commit -m ":sparkles: add workspace and project APIs"
|
||||
```
|
||||
|
||||
### Task 4: Implement Asset Ingestion, Storage Abstraction, And Probe Metadata
|
||||
|
||||
**Files:**
|
||||
- Create: `apps/api/src/modules/storage/storage.module.ts`
|
||||
- Create: `apps/api/src/modules/storage/storage.service.ts`
|
||||
- Create: `apps/api/src/modules/assets/assets.module.ts`
|
||||
- Create: `apps/api/src/modules/assets/assets.controller.ts`
|
||||
- Create: `apps/api/src/modules/assets/assets.service.ts`
|
||||
- Create: `apps/api/src/modules/assets/probe/probe.service.ts`
|
||||
- Create: `apps/api/src/common/mongo/schemas/asset-probe-report.schema.ts`
|
||||
- Test: `apps/api/test/assets.e2e-spec.ts`
|
||||
|
||||
**Step 1: Write the failing test**
|
||||
|
||||
Create `apps/api/test/assets.e2e-spec.ts` covering:
|
||||
|
||||
- register an uploaded asset record
|
||||
- create a probe report for a raw asset
|
||||
- return recommended next actions from probe metadata
|
||||
|
||||
**Step 2: Run test to verify it fails**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter api test assets.e2e-spec.ts
|
||||
```
|
||||
|
||||
Expected: FAIL because asset ingestion and probe services are missing.
|
||||
|
||||
**Step 3: Write minimal implementation**
|
||||
|
||||
Implement:
|
||||
|
||||
- storage abstraction interface
|
||||
- MinIO/S3-compatible config contract
|
||||
- asset create/list/detail endpoints
|
||||
- probe-report persistence
|
||||
- placeholder probe logic for directory and archive summaries
|
||||
|
||||
Do not build full binary upload optimization yet. First make the metadata contract stable.
|
||||
|
||||
**Step 4: Run test to verify it passes**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter api test assets.e2e-spec.ts
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add apps/api/src/modules/storage apps/api/src/modules/assets apps/api/src/common/mongo/schemas/asset-probe-report.schema.ts apps/api/test/assets.e2e-spec.ts
|
||||
git commit -m ":truck: add asset ingestion and probe metadata flow"
|
||||
```
|
||||
|
||||
### Task 5: Implement Workflow Definitions, Versions, Runs, And Tasks
|
||||
|
||||
**Files:**
|
||||
- Create: `apps/api/src/modules/workflows/workflows.module.ts`
|
||||
- Create: `apps/api/src/modules/workflows/workflows.controller.ts`
|
||||
- Create: `apps/api/src/modules/workflows/workflows.service.ts`
|
||||
- Create: `apps/api/src/modules/runs/runs.module.ts`
|
||||
- Create: `apps/api/src/modules/runs/runs.controller.ts`
|
||||
- Create: `apps/api/src/modules/runs/runs.service.ts`
|
||||
- Create: `apps/api/src/common/mongo/schemas/workflow-definition-version.schema.ts`
|
||||
- Create: `apps/api/src/common/mongo/schemas/workflow-run.schema.ts`
|
||||
- Create: `apps/api/src/common/mongo/schemas/run-task.schema.ts`
|
||||
- Test: `apps/api/test/workflow-runs.e2e-spec.ts`
|
||||
|
||||
**Step 1: Write the failing test**
|
||||
|
||||
Create `apps/api/test/workflow-runs.e2e-spec.ts` covering:
|
||||
|
||||
- create workflow definition
|
||||
- save workflow version
|
||||
- create workflow run from saved version
|
||||
- generate initial run tasks for ready nodes
|
||||
|
||||
**Step 2: Run test to verify it fails**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter api test workflow-runs.e2e-spec.ts
|
||||
```
|
||||
|
||||
Expected: FAIL because workflow versioning and run creation do not exist yet.
|
||||
|
||||
**Step 3: Write minimal implementation**
|
||||
|
||||
Implement:
|
||||
|
||||
- workflow definition head record
|
||||
- immutable workflow version snapshots
|
||||
- run creation from a workflow version
|
||||
- initial DAG compilation for simple source-to-transform chains
|
||||
- run task persistence
|
||||
|
||||
Keep V1 graph compilation simple. Support sequential edges first, then one-level branching.
|
||||
|
||||
**Step 4: Run test to verify it passes**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter api test workflow-runs.e2e-spec.ts
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add apps/api/src/modules/workflows apps/api/src/modules/runs apps/api/src/common/mongo/schemas/workflow-definition-version.schema.ts apps/api/src/common/mongo/schemas/workflow-run.schema.ts apps/api/src/common/mongo/schemas/run-task.schema.ts apps/api/test/workflow-runs.e2e-spec.ts
|
||||
git commit -m ":sparkles: add workflow versioning and run records"
|
||||
```
|
||||
|
||||
### Task 6: Add The Worker, Local Scheduler, And Executor Contracts
|
||||
|
||||
**Files:**
|
||||
- Create: `apps/worker/src/main.ts`
|
||||
- Create: `apps/worker/src/runner/task-runner.ts`
|
||||
- Create: `apps/worker/src/scheduler/local-scheduler.ts`
|
||||
- Create: `apps/worker/src/executors/python-executor.ts`
|
||||
- Create: `apps/worker/src/executors/docker-executor.ts`
|
||||
- Create: `apps/worker/src/executors/http-executor.ts`
|
||||
- Create: `apps/worker/src/contracts/execution-context.ts`
|
||||
- Test: `apps/worker/test/task-runner.spec.ts`
|
||||
|
||||
**Step 1: Write the failing test**
|
||||
|
||||
Create `apps/worker/test/task-runner.spec.ts` covering:
|
||||
|
||||
- worker loads pending tasks
|
||||
- worker marks task running then success
|
||||
- worker chooses executor based on node runtime config
|
||||
|
||||
**Step 2: Run test to verify it fails**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter worker test task-runner.spec.ts
|
||||
```
|
||||
|
||||
Expected: FAIL because the worker runtime does not exist yet.
|
||||
|
||||
**Step 3: Write minimal implementation**
|
||||
|
||||
Implement:
|
||||
|
||||
- worker bootstrap
|
||||
- polling or queue-backed local scheduler
|
||||
- execution context builder
|
||||
- stub Python, Docker, and HTTP executors
|
||||
- task status transitions
|
||||
|
||||
Do not implement full Docker isolation logic in one step. First lock the runtime interfaces and transitions.
|
||||
|
||||
**Step 4: Run test to verify it passes**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter worker test task-runner.spec.ts
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add apps/worker apps/api/src/modules/runs apps/worker/test/task-runner.spec.ts
|
||||
git commit -m ":construction_worker: add local worker and executor contracts"
|
||||
```
|
||||
|
||||
### Task 7: Build The Web Shell, Workspace Flow, And Asset Workspace
|
||||
|
||||
**Files:**
|
||||
- Create: `apps/web/src/main.tsx`
|
||||
- Create: `apps/web/src/app/router.tsx`
|
||||
- Create: `apps/web/src/features/layout/app-shell.tsx`
|
||||
- Create: `apps/web/src/features/workspaces/workspace-switcher.tsx`
|
||||
- Create: `apps/web/src/features/projects/project-selector.tsx`
|
||||
- Create: `apps/web/src/features/assets/assets-page.tsx`
|
||||
- Create: `apps/web/src/features/assets/asset-detail-page.tsx`
|
||||
- Create: `apps/web/src/features/assets/components/asset-list.tsx`
|
||||
- Create: `apps/web/src/features/assets/components/asset-summary-panel.tsx`
|
||||
- Test: `apps/web/src/features/assets/assets-page.test.tsx`
|
||||
|
||||
**Step 1: Write the failing test**
|
||||
|
||||
Create `apps/web/src/features/assets/assets-page.test.tsx` covering:
|
||||
|
||||
- app shell renders primary navigation
|
||||
- assets page renders asset rows from API data
|
||||
- asset detail page renders probe summary
|
||||
|
||||
**Step 2: Run test to verify it fails**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter web test assets-page.test.tsx
|
||||
```
|
||||
|
||||
Expected: FAIL because the web app shell and pages do not exist yet.
|
||||
|
||||
**Step 3: Write minimal implementation**
|
||||
|
||||
Implement:
|
||||
|
||||
- web app bootstrap
|
||||
- primary navigation matching the design docs
|
||||
- workspace/project header controls
|
||||
- asset list page
|
||||
- asset detail page with summary and action buttons
|
||||
|
||||
Defer advanced preview renderers. Start with structured metadata and simple detail views.
|
||||
|
||||
**Step 4: Run test to verify it passes**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter web test assets-page.test.tsx
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add apps/web apps/web/src/features/assets/assets-page.test.tsx
|
||||
git commit -m ":sparkles: add web shell and asset workspace"
|
||||
```
|
||||
|
||||
### Task 8: Build Canvas Authoring, Run Detail, And First Workflow Actions
|
||||
|
||||
**Files:**
|
||||
- Create: `apps/web/src/features/workflows/workflows-page.tsx`
|
||||
- Create: `apps/web/src/features/workflows/workflow-editor-page.tsx`
|
||||
- Create: `apps/web/src/features/workflows/components/node-library.tsx`
|
||||
- Create: `apps/web/src/features/workflows/components/workflow-canvas.tsx`
|
||||
- Create: `apps/web/src/features/workflows/components/node-config-panel.tsx`
|
||||
- Create: `apps/web/src/features/runs/run-detail-page.tsx`
|
||||
- Create: `apps/web/src/features/runs/components/run-graph-view.tsx`
|
||||
- Create: `apps/web/src/features/runs/components/task-log-panel.tsx`
|
||||
- Test: `apps/web/src/features/workflows/workflow-editor-page.test.tsx`
|
||||
|
||||
**Step 1: Write the failing test**
|
||||
|
||||
Create `apps/web/src/features/workflows/workflow-editor-page.test.tsx` covering:
|
||||
|
||||
- node library renders categories
|
||||
- node config panel opens when a node is selected
|
||||
- run detail view shows node status badges from run data
|
||||
|
||||
**Step 2: Run test to verify it fails**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter web test workflow-editor-page.test.tsx
|
||||
```
|
||||
|
||||
Expected: FAIL because the workflow editor and run detail pages do not exist yet.
|
||||
|
||||
**Step 3: Write minimal implementation**
|
||||
|
||||
Implement:
|
||||
|
||||
- workflow list page
|
||||
- workflow editor page using React Flow
|
||||
- left node library, center canvas, right config panel
|
||||
- save workflow version action
|
||||
- trigger workflow run action
|
||||
- run detail page with graph and selected-node log panel
|
||||
|
||||
Keep the first editor scoped to V1 node categories and schema-driven config rendering.
|
||||
|
||||
**Step 4: Run test to verify it passes**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter web test workflow-editor-page.test.tsx
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add apps/web/src/features/workflows apps/web/src/features/runs apps/web/src/features/workflows/workflow-editor-page.test.tsx
|
||||
git commit -m ":sparkles: add canvas workflow editor and run detail pages"
|
||||
```
|
||||
|
||||
### Task 9: Add Preview Surface, Delivery Nodes, And MVP Integration
|
||||
|
||||
**Files:**
|
||||
- Create: `apps/api/src/modules/artifacts/artifacts.module.ts`
|
||||
- Create: `apps/api/src/modules/artifacts/artifacts.controller.ts`
|
||||
- Create: `apps/api/src/modules/artifacts/artifacts.service.ts`
|
||||
- Create: `apps/web/src/features/explore/explore-page.tsx`
|
||||
- Create: `apps/web/src/features/explore/renderers/json-renderer.tsx`
|
||||
- Create: `apps/web/src/features/explore/renderers/video-renderer.tsx`
|
||||
- Create: `apps/web/src/features/explore/renderers/directory-renderer.tsx`
|
||||
- Create: `apps/api/src/modules/plugins/builtin/delivery-nodes.ts`
|
||||
- Test: `apps/api/test/artifacts.e2e-spec.ts`
|
||||
- Test: `apps/web/src/features/explore/explore-page.test.tsx`
|
||||
|
||||
**Step 1: Write the failing tests**
|
||||
|
||||
Create:
|
||||
|
||||
- `apps/api/test/artifacts.e2e-spec.ts` for artifact retrieval by producer
|
||||
- `apps/web/src/features/explore/explore-page.test.tsx` for opening and rendering supported artifact types
|
||||
|
||||
**Step 2: Run tests to verify they fail**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter api test artifacts.e2e-spec.ts
|
||||
pnpm --filter web test explore-page.test.tsx
|
||||
```
|
||||
|
||||
Expected: FAIL because artifact APIs and explore renderers do not exist yet.
|
||||
|
||||
**Step 3: Write minimal implementation**
|
||||
|
||||
Implement:
|
||||
|
||||
- artifact module and lookup endpoints
|
||||
- explore page
|
||||
- JSON, directory, and video renderers
|
||||
- built-in delivery-normalization node definitions for the V1 business path
|
||||
|
||||
Do not implement the full renderer plugin platform yet. Start with built-ins and stable renderer contracts.
|
||||
|
||||
**Step 4: Run tests to verify they pass**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
pnpm --filter api test artifacts.e2e-spec.ts
|
||||
pnpm --filter web test explore-page.test.tsx
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add apps/api/src/modules/artifacts apps/api/src/modules/plugins/builtin/delivery-nodes.ts apps/api/test/artifacts.e2e-spec.ts apps/web/src/features/explore apps/web/src/features/explore/explore-page.test.tsx
|
||||
git commit -m ":package: add explore surface and delivery artifacts"
|
||||
```
|
||||
|
||||
### Task 10: Harden Guardrails, Docs, And Developer Entry Commands
|
||||
|
||||
**Files:**
|
||||
- Modify: `CONTRIBUTING.md`
|
||||
- Modify: `docs/development-workflow.md`
|
||||
- Modify: `design/03-workflows/workflow-execution-model.md`
|
||||
- Modify: `design/05-data/mongodb-data-model.md`
|
||||
- Create: `Makefile`
|
||||
- Create: `README.md`
|
||||
- Test: `tests/test_dev_commands.py`
|
||||
|
||||
**Step 1: Write the failing test**
|
||||
|
||||
Create `tests/test_dev_commands.py` asserting:
|
||||
|
||||
- `Makefile` exposes expected local commands
|
||||
- `README.md` documents bootstrap, hooks, test, and local run commands
|
||||
|
||||
**Step 2: Run test to verify it fails**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
python3 -m unittest tests/test_dev_commands.py -v
|
||||
```
|
||||
|
||||
Expected: FAIL because developer entry commands are not documented yet.
|
||||
|
||||
**Step 3: Write minimal implementation**
|
||||
|
||||
Add:
|
||||
|
||||
- `make bootstrap`
|
||||
- `make test`
|
||||
- `make dev-api`
|
||||
- `make dev-web`
|
||||
- `make dev-worker`
|
||||
- `make guardrails`
|
||||
|
||||
Document the developer flow in `README.md` and update design docs if implementation decisions changed during Tasks 1-9.
|
||||
|
||||
**Step 4: Run test to verify it passes**
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
python3 -m unittest tests/test_dev_commands.py -v
|
||||
```
|
||||
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add CONTRIBUTING.md docs/development-workflow.md design/03-workflows/workflow-execution-model.md design/05-data/mongodb-data-model.md Makefile README.md tests/test_dev_commands.py
|
||||
git commit -m ":memo: add developer entry commands and bootstrap docs"
|
||||
```
|
||||
|
||||
## Exit Criteria
|
||||
|
||||
The first implementation pass is complete when:
|
||||
|
||||
- a user can create a workspace and project
|
||||
- a raw asset can be registered and probed
|
||||
- a workflow can be created, versioned, and executed locally
|
||||
- run tasks produce observable status and artifacts
|
||||
- the web app exposes assets, workflows, runs, and basic explore views
|
||||
- guardrails for docs, hooks, commit messages, and CI remain green
|
||||
|
||||
## Notes
|
||||
|
||||
- Keep commits small and use the repository gitmoji + English commit policy.
|
||||
- Update design files in the same task where behavior or architecture changes.
|
||||
- Do not add training execution before the V1 data workflow loop is stable.
|
||||
9
package.json
Normal file
9
package.json
Normal file
@ -0,0 +1,9 @@
|
||||
{
|
||||
"name": "emboflow",
|
||||
"private": true,
|
||||
"version": "0.1.0",
|
||||
"packageManager": "pnpm@9.12.3",
|
||||
"scripts": {
|
||||
"test": "python3 -m unittest discover -s tests -p 'test_*.py'"
|
||||
}
|
||||
}
|
||||
3
pnpm-workspace.yaml
Normal file
3
pnpm-workspace.yaml
Normal file
@ -0,0 +1,3 @@
|
||||
packages:
|
||||
- "apps/*"
|
||||
- "packages/*"
|
||||
126
scripts/check_commit_message.py
Executable file
126
scripts/check_commit_message.py
Executable file
@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# A gitmoji shortcode prefix such as ":sparkles: " — a colon-delimited
# lowercase name followed by whitespace, anchored at the start of the subject.
SHORTCODE_PREFIX = re.compile(r"^:[a-z0-9_+-]+:\s+")
# A single literal emoji character followed by whitespace. The ranges cover
# Miscellaneous Symbols/Dingbats and the main emoji planes.
# NOTE(review): an emoji followed by a variation selector (U+FE0F) would not
# match this pattern — confirm whether such subjects should be accepted.
EMOJI_PREFIX = re.compile(r"^[\u2600-\u27BF\U0001F300-\U0001FAFF]\s+")
|
||||
|
||||
|
||||
def strip_prefix(message: str) -> str:
    """Return *message* with a leading gitmoji prefix removed, if present.

    The shortcode form (":sparkles: ") is tried before the literal-emoji
    form; a message with neither prefix is returned unchanged.
    """
    for pattern in (SHORTCODE_PREFIX, EMOJI_PREFIX):
        if pattern.match(message):
            return pattern.sub("", message, count=1)
    return message
|
||||
|
||||
|
||||
def validate_message(message: str) -> list[str]:
    """Validate one commit message against the gitmoji + English policy.

    Blank lines and lines starting with '#' (git's template comments) are
    ignored. Returns a list of human-readable error strings; an empty list
    means the message is valid.
    """
    content = [line for line in message.splitlines() if line and not line.startswith("#")]
    if not content:
        return ["Commit message must not be empty."]

    subject, rest = content[0], content[1:]
    errors: list[str] = []

    # The subject line must open with a gitmoji (shortcode or literal emoji).
    if not (SHORTCODE_PREFIX.match(subject) or EMOJI_PREFIX.match(subject)):
        errors.append("Commit subject must start with a gitmoji shortcode or emoji.")

    stripped_subject = strip_prefix(subject)
    # Everything after the gitmoji prefix (subject remainder plus body lines)
    # must be plain ASCII English.
    normalized = stripped_subject
    if rest:
        normalized += "\n" + "\n".join(rest)
    try:
        normalized.encode("ascii")
    except UnicodeEncodeError:
        errors.append("Commit message must be written in English ASCII text after the gitmoji prefix.")

    if not stripped_subject.strip():
        errors.append("Commit subject must include an English summary after the gitmoji prefix.")

    # CJK Unified Ideographs get their own targeted, clearer error message.
    if re.search(r"[\u4e00-\u9fff]", "\n".join(content)):
        errors.append("Commit message must not contain Chinese characters.")

    return errors
|
||||
|
||||
|
||||
def read_message_file(path: Path) -> str:
    """Read a commit-message file as UTF-8 text and return its contents."""
    with path.open(encoding="utf-8") as handle:
        return handle.read()
|
||||
|
||||
|
||||
def run_git(*args: str) -> list[str]:
    """Run a git command and return its stripped, non-empty stdout lines.

    Raises:
        RuntimeError: when git exits non-zero, carrying git's stderr text.
    """
    completed = subprocess.run(
        ["git", *args],
        capture_output=True,
        text=True,
        check=False,
    )
    if completed.returncode != 0:
        raise RuntimeError(completed.stderr.strip() or "git command failed")
    stripped = (line.strip() for line in completed.stdout.splitlines())
    return [text for text in stripped if text]
|
||||
|
||||
|
||||
def commit_messages_from_range(rev_range: str) -> list[tuple[str, str]]:
    """Resolve *rev_range* and return (sha, full commit message) pairs.

    A value containing ".." is expanded with ``git rev-list``; anything else
    is treated as a single commit-ish.
    """
    if ".." in rev_range:
        shas = run_git("rev-list", rev_range)
    else:
        shas = [rev_range]

    pairs: list[tuple[str, str]] = []
    for sha in shas:
        # git log is invoked directly (not via run_git) so that the message
        # body is captured whole rather than as per-line-stripped fragments.
        completed = subprocess.run(
            ["git", "log", "--format=%B", "-n", "1", sha],
            capture_output=True,
            text=True,
            check=False,
        )
        if completed.returncode != 0:
            raise RuntimeError(completed.stderr.strip() or "git log failed")
        pairs.append((sha, completed.stdout.strip()))
    return pairs
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse CLI options; callers expect exactly one of --file/--rev-range."""
    parser = argparse.ArgumentParser(description="Validate commit message format.")
    for flag, help_text in (
        ("--file", "path to commit message file"),
        ("--rev-range", "git revision range or single commit"),
    ):
        parser.add_argument(flag, help=help_text)
    return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: validate one message file or a range of commits.

    Returns:
        0 on success, 1 when validation fails, 2 on a usage error.
    """
    args = parse_args()

    # Exactly one input source must be selected.
    if bool(args.file) == bool(args.rev_range):
        print("Use exactly one of --file or --rev-range.")
        return 2

    failures: list[str] = []
    if args.file:
        failures.extend(validate_message(read_message_file(Path(args.file))))
    else:
        for sha, message in commit_messages_from_range(args.rev_range):
            # Prefix each error with a short sha so range output is traceable.
            failures.extend(f"{sha[:12]}: {error}" for error in validate_message(message))

    if not failures:
        print("Commit message validation passed.")
        return 0

    print("Commit message validation failed:")
    for failure in failures:
        print(f"  - {failure}")
    print("\nExpected format example:")
    print("  :sparkles: add hook templates and CI guardrails")
    return 1
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return code as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
194
scripts/check_doc_code_sync.py
Executable file
194
scripts/check_doc_code_sync.py
Executable file
@ -0,0 +1,194 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Substrings (matched case-insensitively anywhere in the path) that mark
# documentation and design files. "spec" and "plan" are deliberately loose
# keywords and overlap with TEST_HINTS below; classify() resolves the overlap
# by checking test hints before these loose keywords.
DOC_PATTERNS = (
    "design/",
    "docs/",
    "adr",
    "architecture",
    "prd",
    "spec",
    "plan",
)

# File extensions treated as source code.
CODE_SUFFIXES = {
    ".py",
    ".ts",
    ".tsx",
    ".js",
    ".jsx",
    ".java",
    ".go",
    ".rs",
    ".rb",
    ".php",
    ".kt",
    ".swift",
    ".scala",
    ".sh",
}

# Directory prefixes that imply application or tooling code.
CODE_HINTS = ("apps/", "packages/", "scripts/")
# Substrings that imply test files.
TEST_HINTS = ("test", "spec", "__tests__", "tests/")
# Extensions and substrings that imply configuration or infrastructure files.
CONFIG_SUFFIXES = {".yml", ".yaml", ".json", ".toml", ".ini", ".env"}
CONFIG_HINTS = ("docker", "compose", "k8s", "helm", "terraform", ".github/", ".githooks/", ".env")


def run_git(repo: Path, *args: str) -> list[str]:
    """Run a git command inside *repo* and return stripped, non-empty stdout lines.

    Raises:
        RuntimeError: when git exits non-zero, carrying git's stderr text.
    """
    result = subprocess.run(
        ["git", "-C", str(repo), *args],
        capture_output=True,
        text=True,
        check=False,
    )
    if result.returncode != 0:
        raise RuntimeError(result.stderr.strip() or "git command failed")
    return [line.strip() for line in result.stdout.splitlines() if line.strip()]


def classify(path_text: str) -> str:
    """Bucket a changed path into "docs", "tests", "code", "config", or "other".

    Unambiguous documentation (markdown files and the dedicated design/ and
    docs/ trees) wins first. Test hints are then checked BEFORE the loose doc
    keywords in DOC_PATTERNS: previously DOC_PATTERNS was checked first, so a
    test file such as ``apps/api/test/projects.e2e-spec.ts`` matched the
    "spec" keyword, was classified as docs, and silently satisfied the
    code-requires-docs drift rule.
    """
    lower = path_text.lower()
    path = Path(path_text)

    # Unambiguous documentation locations.
    if path.suffix == ".md" or "design/" in lower or "docs/" in lower:
        return "docs"
    if any(token in lower for token in TEST_HINTS):
        return "tests"
    # Loose doc keywords (adr, architecture, prd, spec, plan) apply only once
    # test-looking paths have been filtered out.
    if any(token in lower for token in DOC_PATTERNS):
        return "docs"
    if any(token in lower for token in CODE_HINTS):
        return "code"
    if path.suffix in CODE_SUFFIXES:
        return "code"
    if path.suffix in CONFIG_SUFFIXES or any(token in lower for token in CONFIG_HINTS):
        return "config"
    return "other"
|
||||
|
||||
|
||||
def print_group(title: str, items: list[str]) -> None:
|
||||
print(f"\n{title}:")
|
||||
if not items:
|
||||
print(" - none")
|
||||
return
|
||||
for item in items:
|
||||
print(f" - {item}")
|
||||
|
||||
|
||||
def assess_changes(
    docs: list[str],
    code: list[str],
    tests: list[str],
    config: list[str],
    other: list[str],
    strict: bool,
) -> dict:
    """Derive drift warnings (and, in strict mode, blockers) from change sets.

    Each argument is the list of changed paths in one category. Returns a
    dict with "warnings", "blockers", and a boolean "blocking" flag.
    """
    warnings: list[str] = []
    blockers: list[str] = []

    def flag(message: str, *, blocks: bool = False) -> None:
        # Every finding is a warning; only drift-class findings escalate to
        # blockers, and only when strict mode is on.
        warnings.append(message)
        if blocks and strict:
            blockers.append(message)

    if code and not docs:
        flag("Code changed but no design/doc files changed.", blocks=True)
    if config and not docs:
        flag("Config or deployment files changed without any doc updates.", blocks=True)
    if docs and not (code or config or tests):
        flag(
            "Docs changed without code changes. This may be intentional, but verify they still match the repository."
        )
    if code and not tests:
        flag(
            "Code changed without any test-file changes. Verify whether tests should change."
        )
    if other:
        flag(
            "Unclassified files changed. Confirm they do not affect documented behavior or runtime assumptions."
        )

    return {
        "warnings": warnings,
        "blockers": blockers,
        "blocking": bool(blockers),
    }
|
||||
|
||||
|
||||
def collect_paths(repo: Path, args: argparse.Namespace) -> list[str]:
    """Return the changed paths selected by the CLI flags.

    Precedence: --staged, then --base-ref (three-dot diff against HEAD),
    then --rev-range; with no selector, fall back to the working tree's
    ``git status`` output.
    """
    if args.staged:
        return run_git(repo, "diff", "--cached", "--name-only", "--diff-filter=ACMR")
    if args.base_ref:
        return run_git(repo, "diff", "--name-only", "--diff-filter=ACMR", f"{args.base_ref}...HEAD")
    if args.rev_range:
        if ".." in args.rev_range:
            return run_git(repo, "diff", "--name-only", "--diff-filter=ACMR", args.rev_range)
        return run_git(repo, "diff-tree", "--no-commit-id", "--name-only", "-r", args.rev_range)

    # run_git strips each line, so fixed-width slicing of the two-character
    # status prefix (the old ``line[3:]``) ate leading path characters for
    # entries like " M path" (stripped to "M path" -> "ath"). Split on the
    # first whitespace run instead, which works for both "XY path" and
    # stripped "X path" forms.
    paths = set()
    for line in run_git(repo, "status", "--short"):
        parts = line.split(maxsplit=1)
        if len(parts) == 2:
            paths.add(parts[1])
    return sorted(paths)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse CLI options controlling scope and strictness of the drift check."""
    parser = argparse.ArgumentParser(description="Check whether doc changes track code changes.")
    parser.add_argument("repo", nargs="?", default=".", help="git repository path")
    for flag, options in (
        ("--strict", {"action": "store_true", "help": "fail on blocking drift"}),
        ("--staged", {"action": "store_true", "help": "inspect staged files only"}),
        ("--base-ref", {"help": "compare changes from base ref to HEAD"}),
        ("--rev-range", {"help": "inspect a git revision range or a single commit"}),
    ):
        parser.add_argument(flag, **options)
    return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Entry point: classify changed files and report doc/code drift.

    Returns 0 when no blocking drift is found, 1 when strict mode reports
    blocking drift, and 2 when the target path is not a git repository.
    """
    args = parse_args()
    repo = Path(args.repo).expanduser().resolve()

    # `.git` is a directory in normal clones and a plain file in worktrees
    # and submodules; `.exists()` accepts both.
    if not (repo / ".git").exists():
        print(f"Not a git repository: {repo}")
        return 2

    paths = sorted(set(collect_paths(repo, args)))

    # Bucket each path with a single classify() call instead of re-scanning
    # the full list once per category (previously five passes over `paths`).
    buckets: dict[str, list[str]] = {
        "docs": [],
        "code": [],
        "tests": [],
        "config": [],
        "other": [],
    }
    for path in paths:
        # setdefault keeps this safe if classify() ever grows a new label.
        buckets.setdefault(classify(path), []).append(path)

    docs, code, tests = buckets["docs"], buckets["code"], buckets["tests"]
    config, other = buckets["config"], buckets["other"]
    assessment = assess_changes(docs, code, tests, config, other, args.strict)

    print(f"Repository: {repo}")
    print(f"Changed files: {len(paths)}")
    print_group("Design and doc files", docs)
    print_group("Code files", code)
    print_group("Test files", tests)
    print_group("Config and infra files", config)
    print_group("Other files", other)

    print("\nAssessment:")
    if not assessment["warnings"]:
        print(" - No obvious doc/code drift detected from changed-file classification.")
    else:
        for warning in assessment["warnings"]:
            print(f" - {warning}")

    # Actionable follow-ups tailored to which categories actually changed.
    print("\nNext actions:")
    if code and not docs:
        print(" - Review design/ or docs/ and update affected architecture, workflow, or API notes.")
    if docs:
        print(" - Confirm each changed doc still matches the actual implementation.")
    if code:
        print(" - Confirm changed code paths match documented workflow, schema, and runtime assumptions.")
    if other:
        print(" - Review unclassified paths and decide whether docs or tests should be updated.")

    if assessment["blocking"]:
        print("\nResult: blocking drift detected.")
        return 1

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
12
scripts/install_hooks.sh
Normal file
12
scripts/install_hooks.sh
Normal file
@ -0,0 +1,12 @@
|
||||
#!/usr/bin/env bash
# Install the repository's local git hooks: point core.hooksPath at
# .githooks and make the hook and checker scripts executable.
set -euo pipefail

# Resolve the repo root relative to this script's own location.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
repo_root="$(cd "$script_dir/.." && pwd)"

git -C "$repo_root" config core.hooksPath .githooks

chmod +x "$repo_root"/.githooks/*
for helper in check_doc_code_sync.py check_commit_message.py; do
    chmod +x "$repo_root/scripts/$helper"
done

echo "Installed local git hooks from .githooks"
echo "Active hooks path: $(git -C "$repo_root" config core.hooksPath)"
|
||||
40
tests/test_commit_message.py
Normal file
40
tests/test_commit_message.py
Normal file
@ -0,0 +1,40 @@
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
import unittest
|
||||
|
||||
|
||||
def load_module(module_name: str, path: Path):
    """Import the file at *path* as a module named *module_name*.

    Lets the tests exercise scripts that live outside any package and
    are not importable via sys.path.
    """
    loaded_spec = importlib.util.spec_from_file_location(module_name, path)
    loaded = importlib.util.module_from_spec(loaded_spec)
    assert loaded_spec.loader is not None
    loaded_spec.loader.exec_module(loaded)
    return loaded
|
||||
|
||||
|
||||
# Repository root: two levels up from this file (tests/ -> repo root).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Load the hook script under test straight from its file path, since
# scripts/ is not a package on sys.path.
MODULE = load_module(
    "check_commit_message",
    REPO_ROOT / "scripts" / "check_commit_message.py",
)
|
||||
|
||||
|
||||
class CommitMessageValidationTests(unittest.TestCase):
    """Exercise the gitmoji-prefix and English-only commit-message rules."""

    def test_accepts_gitmoji_shortcode_with_english_message(self):
        problems = MODULE.validate_message(":sparkles: add local hook templates")
        self.assertEqual(problems, [])

    def test_accepts_unicode_gitmoji_with_english_message(self):
        problems = MODULE.validate_message("✨ add ci validation for hooks")
        self.assertEqual(problems, [])

    def test_rejects_message_without_gitmoji_prefix(self):
        problems = MODULE.validate_message("add local hook templates")
        gitmoji_errors = [item for item in problems if "gitmoji" in item.lower()]
        self.assertTrue(gitmoji_errors)

    def test_rejects_non_english_message(self):
        problems = MODULE.validate_message(":sparkles: 添加本地 hook")
        english_errors = [item for item in problems if "english" in item.lower()]
        self.assertTrue(english_errors)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Support running this test module directly, outside a test runner.
    unittest.main()
|
||||
55
tests/test_doc_code_sync.py
Normal file
55
tests/test_doc_code_sync.py
Normal file
@ -0,0 +1,55 @@
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
import unittest
|
||||
|
||||
|
||||
def load_module(module_name: str, path: Path):
    """Import the file at *path* under the name *module_name*.

    Used because the scripts under test are plain files, not packages.
    """
    file_spec = importlib.util.spec_from_file_location(module_name, path)
    imported = importlib.util.module_from_spec(file_spec)
    assert file_spec.loader is not None
    file_spec.loader.exec_module(imported)
    return imported
|
||||
|
||||
|
||||
# Repository root: two levels up from this file (tests/ -> repo root).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Load the checker script straight from its file path, since scripts/
# is not an importable package.
MODULE = load_module(
    "check_doc_code_sync",
    REPO_ROOT / "scripts" / "check_doc_code_sync.py",
)
|
||||
|
||||
|
||||
class DocCodeSyncAssessmentTests(unittest.TestCase):
    """Cover path classification and the strict-mode drift assessment."""

    def test_classifies_python_scripts_as_code(self):
        self.assertEqual(MODULE.classify("scripts/check_doc_code_sync.py"), "code")

    def test_classifies_app_paths_as_code(self):
        self.assertEqual(MODULE.classify("apps/web/package.json"), "code")

    def test_classifies_env_example_as_config(self):
        self.assertEqual(MODULE.classify(".env.example"), "config")

    def test_strict_mode_blocks_code_without_doc_updates(self):
        # Code-only change under --strict must be reported as blocking.
        result = MODULE.assess_changes(
            docs=[],
            code=["src/app.ts"],
            tests=[],
            config=[],
            other=[],
            strict=True,
        )
        self.assertTrue(result["blocking"])

    def test_doc_and_code_changes_together_do_not_block(self):
        # Pairing the code change with a doc change clears the blocker.
        result = MODULE.assess_changes(
            docs=["design/02-architecture/system-architecture.md"],
            code=["src/app.ts"],
            tests=[],
            config=[],
            other=[],
            strict=True,
        )
        self.assertFalse(result["blocking"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Support running this test module directly, outside a test runner.
    unittest.main()
|
||||
35
tests/test_repo_structure.py
Normal file
35
tests/test_repo_structure.py
Normal file
@ -0,0 +1,35 @@
|
||||
from pathlib import Path
|
||||
import unittest
|
||||
|
||||
|
||||
# Repository root: two levels up from this file (tests/ -> repo root).
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
class RepoStructureTests(unittest.TestCase):
    """Smoke-check that the documented workspace skeleton exists on disk."""

    def _assert_files_exist(self, relative_paths):
        """Assert each relative path is a regular file under the repo root.

        Shared by both test methods; the original duplicated this loop.
        """
        for relative_path in relative_paths:
            with self.subTest(path=relative_path):
                self.assertTrue((REPO_ROOT / relative_path).is_file())

    def test_root_workspace_files_exist(self):
        # Top-level workspace/config files required by the monorepo layout.
        self._assert_files_exist([
            "package.json",
            "pnpm-workspace.yaml",
            "tsconfig.base.json",
            "docker-compose.yml",
            ".env.example",
        ])

    def test_app_package_manifests_exist(self):
        # One package manifest per application in apps/.
        self._assert_files_exist([
            "apps/web/package.json",
            "apps/api/package.json",
            "apps/worker/package.json",
        ])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Support running this test module directly, outside a test runner.
    unittest.main()
|
||||
12
tsconfig.base.json
Normal file
12
tsconfig.base.json
Normal file
@ -0,0 +1,12 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "Bundler",
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"resolveJsonModule": true,
|
||||
"skipLibCheck": true,
|
||||
"baseUrl": "."
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user