From f41816bbd98855947b5cc2e10ab91055260b758f Mon Sep 17 00:00:00 2001 From: eust-w Date: Thu, 26 Mar 2026 17:18:40 +0800 Subject: [PATCH] :tada: feat: initialize foundation docs guardrails and workspace skeleton --- .env.example | 17 + .githooks/commit-msg | 7 + .githooks/pre-commit | 7 + .githooks/pre-push | 19 + .github/pull_request_template.md | 40 ++ .github/workflows/guardrails.yml | 45 ++ CONTRIBUTING.md | 93 +++ apps/api/package.json | 8 + apps/web/package.json | 8 + apps/worker/package.json | 8 + design/00-overview/.gitkeep | 1 + .../00-overview/emboflow-platform-overview.md | 70 ++ design/01-product/.gitkeep | 1 + design/01-product/v1-scope-and-mvp.md | 90 +++ design/02-architecture/.gitkeep | 1 + .../deployment-architecture.md | 115 ++++ design/02-architecture/system-architecture.md | 200 ++++++ design/03-workflows/.gitkeep | 1 + .../03-workflows/workflow-execution-model.md | 316 +++++++++ design/04-ui-ux/.gitkeep | 1 + ...nformation-architecture-and-key-screens.md | 296 +++++++++ design/05-data/.gitkeep | 1 + design/05-data/mongodb-data-model.md | 521 +++++++++++++++ design/06-api/.gitkeep | 1 + design/07-research/.gitkeep | 1 + design/08-decisions/.gitkeep | 1 + ...dr-0001-raw-asset-and-canonical-dataset.md | 45 ++ ...-0002-executor-and-scheduler-separation.md | 56 ++ design/09-assets/.gitkeep | 1 + design/README.md | 21 + design/templates/.gitkeep | 1 + docker-compose.yml | 62 ++ docs/development-workflow.md | 96 +++ ...26-03-26-emboflow-v1-foundation-and-mvp.md | 621 ++++++++++++++++++ package.json | 9 + pnpm-workspace.yaml | 3 + scripts/check_commit_message.py | 126 ++++ scripts/check_doc_code_sync.py | 194 ++++++ scripts/install_hooks.sh | 12 + tests/test_commit_message.py | 40 ++ tests/test_doc_code_sync.py | 55 ++ tests/test_repo_structure.py | 35 + tsconfig.base.json | 12 + 43 files changed, 3258 insertions(+) create mode 100644 .env.example create mode 100755 .githooks/commit-msg create mode 100755 .githooks/pre-commit create mode 100755 
.githooks/pre-push create mode 100644 .github/pull_request_template.md create mode 100644 .github/workflows/guardrails.yml create mode 100644 CONTRIBUTING.md create mode 100644 apps/api/package.json create mode 100644 apps/web/package.json create mode 100644 apps/worker/package.json create mode 100644 design/00-overview/.gitkeep create mode 100644 design/00-overview/emboflow-platform-overview.md create mode 100644 design/01-product/.gitkeep create mode 100644 design/01-product/v1-scope-and-mvp.md create mode 100644 design/02-architecture/.gitkeep create mode 100644 design/02-architecture/deployment-architecture.md create mode 100644 design/02-architecture/system-architecture.md create mode 100644 design/03-workflows/.gitkeep create mode 100644 design/03-workflows/workflow-execution-model.md create mode 100644 design/04-ui-ux/.gitkeep create mode 100644 design/04-ui-ux/information-architecture-and-key-screens.md create mode 100644 design/05-data/.gitkeep create mode 100644 design/05-data/mongodb-data-model.md create mode 100644 design/06-api/.gitkeep create mode 100644 design/07-research/.gitkeep create mode 100644 design/08-decisions/.gitkeep create mode 100644 design/08-decisions/adr-0001-raw-asset-and-canonical-dataset.md create mode 100644 design/08-decisions/adr-0002-executor-and-scheduler-separation.md create mode 100644 design/09-assets/.gitkeep create mode 100644 design/README.md create mode 100644 design/templates/.gitkeep create mode 100644 docker-compose.yml create mode 100644 docs/development-workflow.md create mode 100644 docs/plans/2026-03-26-emboflow-v1-foundation-and-mvp.md create mode 100644 package.json create mode 100644 pnpm-workspace.yaml create mode 100755 scripts/check_commit_message.py create mode 100755 scripts/check_doc_code_sync.py create mode 100644 scripts/install_hooks.sh create mode 100644 tests/test_commit_message.py create mode 100644 tests/test_doc_code_sync.py create mode 100644 tests/test_repo_structure.py create mode 100644 
tsconfig.base.json diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..d4b13b5 --- /dev/null +++ b/.env.example @@ -0,0 +1,17 @@ +NODE_ENV=development + +WEB_PORT=3000 +API_PORT=3001 +WORKER_PORT=3002 + +MONGO_PORT=27017 +MONGO_DB=emboflow +MONGO_ROOT_USERNAME=emboflow +MONGO_ROOT_PASSWORD=emboflow + +MINIO_PORT=9000 +MINIO_CONSOLE_PORT=9001 +MINIO_ROOT_USER=emboflow +MINIO_ROOT_PASSWORD=emboflow123 + +STORAGE_PROVIDER=minio diff --git a/.githooks/commit-msg b/.githooks/commit-msg new file mode 100755 index 0000000..0b15ab0 --- /dev/null +++ b/.githooks/commit-msg @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +repo_root="$(git rev-parse --show-toplevel)" +cd "$repo_root" + +python3 scripts/check_commit_message.py --file "$1" diff --git a/.githooks/pre-commit b/.githooks/pre-commit new file mode 100755 index 0000000..b594841 --- /dev/null +++ b/.githooks/pre-commit @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +repo_root="$(git rev-parse --show-toplevel)" +cd "$repo_root" + +python3 scripts/check_doc_code_sync.py . --staged --strict diff --git a/.githooks/pre-push b/.githooks/pre-push new file mode 100755 index 0000000..bde3dd4 --- /dev/null +++ b/.githooks/pre-push @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -euo pipefail + +repo_root="$(git rev-parse --show-toplevel)" +cd "$repo_root" + +if git rev-parse --abbrev-ref --symbolic-full-name "@{upstream}" >/dev/null 2>&1; then + base_ref="$(git rev-parse --abbrev-ref --symbolic-full-name "@{upstream}")" + python3 scripts/check_doc_code_sync.py . --base-ref "$base_ref" --strict + python3 scripts/check_commit_message.py --rev-range "$base_ref..HEAD" +elif git rev-parse HEAD~1 >/dev/null 2>&1; then + python3 scripts/check_doc_code_sync.py . --base-ref HEAD~1 --strict + python3 scripts/check_commit_message.py --rev-range "HEAD~1..HEAD" +else + python3 scripts/check_doc_code_sync.py . 
--rev-range HEAD --strict + python3 scripts/check_commit_message.py --rev-range HEAD +fi + +python3 -m unittest discover -s tests -p 'test_*.py' diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..9bf35db --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,40 @@ +# Summary + +- Describe the change in clear English. +- Explain the user-visible or system-level impact. + +# Design Sync + +- [ ] I reviewed the relevant files under `design/` before implementing. +- [ ] I updated the affected design or docs files in the same change set, or I confirmed no design update was required. +- [ ] I ran `python3 scripts/check_doc_code_sync.py . --strict`. + +Design files reviewed or updated: + +- `` + +If design and code are not fully aligned yet, explain the gap: + +- + +# Validation + +- [ ] I ran local checks relevant to this change. +- [ ] I ran `bash scripts/install_hooks.sh` in this clone or already had the repo hooks installed. +- [ ] My commit messages in this PR are English-only and use a gitmoji prefix. + +Commands run: + +```bash +# paste commands here +``` + +# Scope Checklist + +- [ ] This PR updates behavior, contracts, or runtime assumptions intentionally. +- [ ] This PR does not silently break documented architecture or workflow assumptions. +- [ ] This PR includes tests if behavior changed, or I confirmed tests were not required. + +# Notes For Reviewers + +- Call out any risky areas, follow-up work, or unresolved assumptions. 
diff --git a/.github/workflows/guardrails.yml b/.github/workflows/guardrails.yml new file mode 100644 index 0000000..8eb501b --- /dev/null +++ b/.github/workflows/guardrails.yml @@ -0,0 +1,45 @@ +name: Guardrails + +on: + pull_request: + push: + +jobs: + repository-guardrails: + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Compute git range + id: git_range + shell: bash + run: | + if [ "${GITHUB_EVENT_NAME}" = "pull_request" ]; then + RANGE="${{ github.event.pull_request.base.sha }}..${{ github.sha }}" + elif [ "${{ github.event.before }}" != "0000000000000000000000000000000000000000" ]; then + RANGE="${{ github.event.before }}..${{ github.sha }}" + else + RANGE="${{ github.sha }}" + fi + echo "range=${RANGE}" >> "$GITHUB_OUTPUT" + + - name: Validate commit messages + run: | + python3 scripts/check_commit_message.py --rev-range "${{ steps.git_range.outputs.range }}" + + - name: Validate design and code sync + run: | + python3 scripts/check_doc_code_sync.py . --rev-range "${{ steps.git_range.outputs.range }}" --strict + + - name: Run repository tests + run: | + python3 -m unittest discover -s tests -p 'test_*.py' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..427d1a4 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,93 @@ +# Contributing To EmboFlow + +## Core Rule + +Keep `design/` and implementation aligned in the same change set. + +Do not treat design files as background notes. If a code change affects product behavior, workflow behavior, data models, contracts, runtime assumptions, permissions, or deployment assumptions, update the corresponding design documents before closing the task. + +## Required Workflow + +1. Read the relevant files under `design/` before implementing. +2. Summarize the intended contract you are changing. +3. 
Implement the code change. +4. Update the affected design files in the same work session. +5. Install the local git hooks once per clone: + +```bash +bash scripts/install_hooks.sh +``` + +6. Use English-only commit messages with a gitmoji prefix, for example: + +```text +:sparkles: add workflow guardrails and CI checks +``` + +7. Run the local sync check when needed: + +```bash +python3 scripts/check_doc_code_sync.py . --strict +``` + +8. If design and code still diverge, document that explicitly in your final summary. + +## When Design Updates Are Required + +Update design files when a change affects: + +- user-visible behavior +- workflow nodes or execution paths +- data model or storage structure +- API or schema contracts +- plugin or executor behavior +- workspace, project, or permission rules +- deployment or runtime assumptions + +## When Design Updates May Be Skipped + +Design updates are usually not required for: + +- pure refactors with no behavior change +- test-only changes +- formatting, comments, and naming cleanup + +Even in those cases, verify that no documented statement became false indirectly. + +## Primary Design Locations + +- `design/00-overview/` +- `design/01-product/` +- `design/02-architecture/` +- `design/03-workflows/` +- `design/05-data/` +- `design/08-decisions/` + +## Local Tooling + +This repository includes: + +- git hook templates under `.githooks/` +- a hook installer: + +```bash +bash scripts/install_hooks.sh +``` + +- a design/code sync checker: + +```bash +python3 scripts/check_doc_code_sync.py . 
--strict +``` + +- a commit message validator: + +```bash +python3 scripts/check_commit_message.py --rev-range HEAD +``` + +The hooks and CI enforce: + +- English-only commit messages with a gitmoji prefix +- design/code consistency checks +- repository unit tests before push diff --git a/apps/api/package.json b/apps/api/package.json new file mode 100644 index 0000000..1e6f2a6 --- /dev/null +++ b/apps/api/package.json @@ -0,0 +1,8 @@ +{ + "name": "@emboflow/api", + "private": true, + "version": "0.1.0", + "scripts": { + "dev": "echo 'api app scaffold pending'" + } +} diff --git a/apps/web/package.json b/apps/web/package.json new file mode 100644 index 0000000..6ab7aff --- /dev/null +++ b/apps/web/package.json @@ -0,0 +1,8 @@ +{ + "name": "@emboflow/web", + "private": true, + "version": "0.1.0", + "scripts": { + "dev": "echo 'web app scaffold pending'" + } +} diff --git a/apps/worker/package.json b/apps/worker/package.json new file mode 100644 index 0000000..4bec2d2 --- /dev/null +++ b/apps/worker/package.json @@ -0,0 +1,8 @@ +{ + "name": "@emboflow/worker", + "private": true, + "version": "0.1.0", + "scripts": { + "dev": "echo 'worker app scaffold pending'" + } +} diff --git a/design/00-overview/.gitkeep b/design/00-overview/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/00-overview/.gitkeep @@ -0,0 +1 @@ + diff --git a/design/00-overview/emboflow-platform-overview.md b/design/00-overview/emboflow-platform-overview.md new file mode 100644 index 0000000..336a005 --- /dev/null +++ b/design/00-overview/emboflow-platform-overview.md @@ -0,0 +1,70 @@ +# EmboFlow Platform Overview + +## Positioning + +EmboFlow is a browser-based embodied data engineering platform for ingesting raw assets, organizing dataset workflows on a visual canvas, processing and converting data, annotating and inspecting results, exporting normalized artifacts, and generating downstream training configurations. 
+ +The platform is designed around plugin-based extensibility, but the first version should deliver a stable built-in core before opening broader extension surfaces. + +## Primary Users + +- Individual engineers building embodied datasets +- Team operators managing collection, preprocessing, delivery, and annotation workflows +- Data engineering teams that need repeatable conversion and packaging pipelines +- Teams preparing datasets for external training systems + +## V1 Product Goal + +Build a usable end-to-end platform that allows users to: + +1. Log into a personal or team workspace +2. Create a project +3. Upload or import raw embodied data assets +4. Auto-detect asset structure and generate preview summaries +5. Compose processing pipelines on a canvas +6. Configure node parameters and inject code into processing nodes +7. Execute workflows asynchronously and inspect logs and outputs +8. Export normalized delivery packages, training datasets, or training config files + +## Supported Input Formats in V1 + +- RLDS +- LeRobot v2/v3 +- HDF5 +- Rosbag +- Raw video folders and delivery-style directory packages +- Compressed archives containing the above + +## Core Product Principles + +- Raw assets are first-class objects +- Canonical semantic datasets are derived, not assumed +- Visualization can operate directly on raw assets +- Workflow execution is asynchronous and traceable +- Plugins are versioned and managed +- User-injected code is supported with strict runtime boundaries +- Training execution is out of scope for V1, but training handoff is in scope + +## Major Workspaces + +- Asset Workspace: upload, import, scan, probe, browse +- Canvas Workspace: build and run workflows +- Explore Workspace: inspect raw assets and processed outputs +- Label Workspace: create and review annotation tasks +- Admin Workspace: users, workspaces, plugins, storage, runtime settings + +## V1 Output Types + +- Standardized embodied dataset exports +- Customer delivery packages +- 
Validation and quality reports +- Annotation artifacts +- Training configuration packages for downstream training systems + +## Non-Goals for V1 + +- Built-in training execution orchestration +- Real-time collaborative editing on the same canvas +- Public plugin marketplace +- Fully generalized MLOps lifecycle management +- Advanced distributed scheduling in the first deployment diff --git a/design/01-product/.gitkeep b/design/01-product/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/01-product/.gitkeep @@ -0,0 +1 @@ + diff --git a/design/01-product/v1-scope-and-mvp.md b/design/01-product/v1-scope-and-mvp.md new file mode 100644 index 0000000..85ffe44 --- /dev/null +++ b/design/01-product/v1-scope-and-mvp.md @@ -0,0 +1,90 @@ +# EmboFlow V1 Scope And MVP + +## MVP Definition + +The first release should prove that EmboFlow can turn raw embodied data assets into structured outputs through a visual workflow engine. + +### MVP Success Path + +1. A user signs into a workspace +2. The user creates a project +3. The user uploads or imports a raw asset +4. The platform probes the asset and generates a structure summary +5. The user previews the asset +6. The user composes a canvas workflow +7. The workflow executes asynchronously +8. The user reviews logs, outputs, and generated artifacts +9. 
The user exports a normalized dataset, delivery package, or training config + +## In Scope For V1 + +- User login and workspace model +- Personal and team workspaces +- Project resource isolation +- Raw asset upload and import +- Object storage integration +- Asset probing and structure detection +- Raw asset preview +- Canvas workflow editor +- Built-in node library for ingest, transform, inspect, export +- Node configuration through schema-driven forms +- Code injection for processing nodes +- Workflow run orchestration +- Logs, status, retries, and artifact tracking +- Dataset conversion and delivery-package normalization +- Training config export +- Plugin registration skeleton + +## Important Business Scenarios + +### Embodied Dataset Conversion + +- Import RLDS, LeRobot, HDF5, or Rosbag +- Map to canonical semantics +- Export to target dataset format + +### Delivery Package Normalization + +- Import customer-provided raw directory or archive +- Rename top-level folders +- Validate required file structure +- Validate metadata files +- Check video file quality and naming +- Export or upload normalized package + +### Data Processing Workflow Authoring + +- Drag nodes onto canvas +- Connect nodes into DAG +- Tune parameters +- Inject code into processing nodes +- Re-run pipeline with traceable history + +## V1 Modules To Build Deeply + +- Identity and workspace management +- Asset ingestion and probing +- Workflow editor and node model +- Execution engine +- Built-in dataset conversion nodes +- Built-in delivery normalization nodes +- Preview and inspection +- Artifact export + +## V1 Modules To Keep Lightweight + +- Annotation +- Collaboration +- Plugin lifecycle UX +- Advanced analytics +- Kubernetes and Volcano scheduling adapters +- Advanced multi-sensor synchronized visual analytics + +## Explicit V1 Exclusions + +- Platform-managed training execution +- Real-time multi-user canvas co-editing +- Full marketplace for third-party plugins +- Complex enterprise 
approval workflows +- Streaming data processing +- Large-scale distributed execution as a deployment requirement diff --git a/design/02-architecture/.gitkeep b/design/02-architecture/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/02-architecture/.gitkeep @@ -0,0 +1 @@ + diff --git a/design/02-architecture/deployment-architecture.md b/design/02-architecture/deployment-architecture.md new file mode 100644 index 0000000..0d3bb9c --- /dev/null +++ b/design/02-architecture/deployment-architecture.md @@ -0,0 +1,115 @@ +# EmboFlow Deployment Architecture + +## V1 Deployment Target + +The first deployment target is a single public server. The platform should be deployed in a way that is operationally simple now and migration-friendly later. + +## Recommended V1 Deployment Topology + +- Reverse proxy +- Web frontend service +- API service +- Worker service +- MongoDB +- Optional MinIO +- Host Docker runtime for execution containers + +## Deployment Principles + +- Single-host deployment first +- All major services containerized +- Persistent state mounted on host volumes +- Object storage can be external or self-hosted +- Execution workers separated from API service +- Future scheduler migration should not require domain model changes + +## Recommended Runtime Layout + +### Edge + +- Nginx or equivalent reverse proxy +- HTTPS termination +- Static web delivery or web upstream routing + +### Application + +- `web` +- `api` +- `worker` + +### Data + +- `mongo` +- `minio` optional + +## Object Storage Strategy + +The product should support both: + +- Cloud object storage such as BOS or S3-compatible services +- Self-hosted MinIO for development, demos, or private deployment + +The application should expose a unified storage abstraction instead of embedding provider-specific logic across modules. + +## Local Scheduler In V1 + +V1 should use a local scheduler. Worker processes execute tasks on the same deployment host. 
+ +Design constraints: + +- RuntimeSpec must already exist +- Scheduler abstraction must already exist +- Docker executor must already be scheduler-compatible + +This keeps future migration to Kubernetes or Volcano feasible. + +## Host-Level Persistent Directories + +Recommended host directories: + +- application config +- mongodb data +- minio data +- uploaded file staging +- execution temp workspace +- logs +- backup data + +## Execution Isolation + +The host Docker runtime serves two different purposes: + +- Running the platform deployment stack +- Running task execution containers + +These must be treated as separate concerns in configuration and security design. + +## Future Migration Path + +### Stage 1 + +- Single-host deployment +- Local scheduler +- Docker executor + +### Stage 2 + +- Kubernetes-based service deployment +- Kubernetes scheduler adapter for workflow tasks + +### Stage 3 + +- Volcano scheduler adapter +- Better support for large batch jobs and training-adjacent workloads + +## Operational Baseline + +V1 should provide basic operational support for: + +- health checks +- service restart +- execution failure visibility +- disk space monitoring +- object storage connectivity checks +- MongoDB backup and restore procedures +- worker online status diff --git a/design/02-architecture/system-architecture.md b/design/02-architecture/system-architecture.md new file mode 100644 index 0000000..03ae558 --- /dev/null +++ b/design/02-architecture/system-architecture.md @@ -0,0 +1,200 @@ +# EmboFlow System Architecture + +## Architecture Style + +EmboFlow V1 is a browser/server platform built as: + +- Web frontend +- Modular backend control plane +- Independent worker runtime +- MongoDB as the only database +- Object storage abstraction over cloud object storage or MinIO +- Local scheduler in V1 with future migration path to Kubernetes and Volcano + +The architecture should preserve clear service boundaries even if V1 is implemented as a modular monolith 
plus workers. + +## High-Level Layers + +### Frontend Layer + +- Asset workspace +- Canvas workspace +- Explore workspace +- Label workspace +- Admin workspace + +### Control Plane + +- Identity and authorization +- Workspace and project management +- Asset and dataset metadata +- Workflow definition management +- Plugin registry and activation +- Run orchestration API +- Artifact indexing + +### Execution Plane + +- Workflow DAG compilation +- Task queue dispatch +- Worker execution +- Executor routing +- Log and artifact collection + +### Storage Layer + +- MongoDB for metadata and run state +- Object storage for files and large outputs +- Temporary local working directories for execution + +## Core Domain Objects + +- User +- Workspace +- Project +- Asset +- Dataset +- DatasetVersion +- WorkflowDefinition +- WorkflowVersion +- WorkflowRun +- RunTask +- Artifact +- AnnotationTask +- Annotation +- Plugin +- StorageConnection + +## Raw Asset And Canonical Dataset Model + +The platform must distinguish between: + +- Raw Asset View +- Canonical Dataset View + +Raw assets preserve source structure, file paths, metadata layout, and original naming. Canonical datasets provide a normalized semantic layer for workflow nodes and export logic. + +Visualization may read raw assets directly. Conversion, orchestration, and export should primarily target canonical semantics. + +## Workflow Model + +Workflow definitions are versioned and contain: + +- Visual graph state +- Logical node and edge graph +- Runtime configuration +- Plugin references + +Workflow execution produces immutable workflow runs. A run snapshots: + +- Workflow version +- Node configuration +- Injected code +- Executor settings +- Input bindings + +Runs compile into task DAGs. 
+ +## Node And Plugin Model + +### Node Categories + +- Source +- Transform +- Inspect +- Annotate +- Export +- Utility + +### Node Definition Contract + +Each node definition includes: + +- Metadata +- Input schema +- Output schema +- Config schema +- UI schema +- Executor type +- Runtime limits +- Optional code hook contract + +### Plugin Types + +- Node plugins +- Reader/writer plugins +- Renderer plugins +- Executor plugins +- Integration plugins + +## Execution Architecture + +### Executors + +- Python executor +- Docker executor +- HTTP executor + +V1 should prioritize Python and Docker. HTTP executor is useful for integrating external services. + +### Schedulers + +- Local scheduler in V1 +- Kubernetes scheduler later +- Volcano scheduler later + +Executors and schedulers are separate abstractions: + +- Executor defines how logic runs +- Scheduler defines where and under what scheduling policy it runs + +## Storage Architecture + +### MongoDB Collections + +Recommended primary collections: + +- users +- workspaces +- projects +- memberships +- assets +- asset_probe_reports +- datasets +- dataset_versions +- workflow_definitions +- workflow_definition_versions +- workflow_runs +- run_tasks +- artifacts +- annotation_tasks +- annotations +- plugins +- storage_connections +- audit_logs + +### Object Storage Content + +- Raw uploads +- Imported archives +- Normalized export packages +- Training config packages +- Preview resources +- Logs and attachments +- Large manifests and file indexes + +## Security Model + +User-injected code is low-trust code and must not run in web or API processes. 
+ +V1 runtime policy: + +- Built-in trusted nodes may use Python executor +- Plugin code should run in controlled runtimes +- User-injected code should default to Docker executor +- Network access should be denied by default for user code +- Input and output paths should be explicitly mounted + +## Deployment Direction + +V1 deployment target is a single public server using containerized application services. The architecture must still preserve future migration to multi-node environments. diff --git a/design/03-workflows/.gitkeep b/design/03-workflows/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/03-workflows/.gitkeep @@ -0,0 +1 @@ + diff --git a/design/03-workflows/workflow-execution-model.md b/design/03-workflows/workflow-execution-model.md new file mode 100644 index 0000000..f92850c --- /dev/null +++ b/design/03-workflows/workflow-execution-model.md @@ -0,0 +1,316 @@ +# EmboFlow Workflow Execution Model + +## Goal + +Define how EmboFlow represents, validates, executes, and observes canvas workflows. + +The workflow system is the product core. The canvas is only the editing surface. The real system of record is the versioned workflow definition and its immutable run snapshots. + +## Core Objects + +- `WorkflowDefinition` + Logical workflow identity under a project +- `WorkflowVersion` + Immutable snapshot of nodes, edges, runtime defaults, and plugin references +- `NodeInstance` + Concrete node on a workflow graph +- `WorkflowRun` + One execution of one workflow version +- `RunTask` + Executable unit derived from a node during one run +- `Artifact` + Managed output from a task or run + +## Workflow Layers + +Each workflow version contains three layers. 
+ +### Visual Layer + +Used only by the editor: + +- node positions +- collapsed state +- groups +- zoom defaults +- viewport metadata + +### Logic Layer + +Used for graph semantics: + +- nodes +- edges +- input/output ports +- branch conditions +- merge semantics +- dependency graph + +### Runtime Layer + +Used for execution: + +- node config values +- executor settings +- runtime resource limits +- retry policy +- code hooks +- cache policy + +Visual changes must not change workflow semantics. Runtime changes must produce a new workflow version. + +## Node Categories + +V1 node categories: + +- `Source` +- `Transform` +- `Inspect` +- `Annotate` +- `Export` +- `Utility` + +### V1 Built-In Node Families + +- asset upload/import +- archive extract +- folder rename +- directory validation +- metadata validation +- video quality inspection +- dataset readers for RLDS, LeRobot, HDF5, Rosbag +- canonical mapping nodes +- dataset writers and exporters +- training config export +- Python processing node + +## Node Definition Contract + +Each node definition must expose: + +- `id` +- `name` +- `category` +- `version` +- `description` +- `inputSchema` +- `outputSchema` +- `configSchema` +- `uiSchema` +- `executorType` +- `runtimeDefaults` +- `permissions` +- `capabilities` +- `codeHookSpec` + +### Code Hook Spec + +V1 supports user code hooks only on: + +- `Transform` +- `Inspect` +- `Utility` + +Hooks must use a constrained entrypoint instead of arbitrary script structure. + +Example: + +```python +def process(input_data, context): + return input_data +``` + +This keeps serialization, logging, and runtime control predictable. + +## Data Flow Contract + +Tasks should exchange managed references, not loose file paths. + +V1 reference types: + +- `assetRef` +- `datasetVersionRef` +- `artifactRef` +- `annotationTaskRef` +- `inlineConfig` + +Executors may materialize files internally, but the platform-level contract must remain reference-based. 
+ +## Validation Stages + +Workflow execution must validate in this order: + +1. workflow version exists +2. referenced plugins exist and are enabled +3. node schemas are valid +4. edge connections are schema-compatible +5. runtime configuration is complete +6. referenced assets and datasets are accessible +7. code hooks pass static validation +8. executor and scheduler requirements are satisfiable + +Validation failure must block run creation. + +## Run Lifecycle + +When a user executes a workflow: + +1. resolve workflow version +2. snapshot all runtime-relevant inputs +3. resolve plugin versions +4. freeze node config and code hooks +5. compile graph into a DAG +6. create `WorkflowRun` +7. create `RunTask` entries +8. enqueue ready tasks +9. collect outputs, logs, and task state +10. finalize run status and summary + +## Run State Model + +### WorkflowRun Status + +- `pending` +- `queued` +- `running` +- `success` +- `failed` +- `cancelled` +- `partial_success` + +### RunTask Status + +- `pending` +- `queued` +- `running` +- `success` +- `failed` +- `cancelled` +- `skipped` + +`partial_success` is used for workflows where non-blocking nodes fail but the run still produces valid outputs. + +## Retry And Failure Policy + +Each node instance may define: + +- retry count +- retry backoff policy +- fail-fast behavior +- continue-on-error behavior +- manual retry eligibility + +V1 should support: + +- `fail_fast` +- `continue_on_error` +- `retry_n_times` +- `manual_retry` + +## Cache Model + +V1 should support node-level cache reuse. 
+ +Recommended cache key inputs: + +- workflow version +- node id +- upstream reference summary +- config summary +- code hook digest +- plugin version +- executor version + +Cache hit behavior: + +- reuse output artifact refs +- reuse output summaries +- retain previous logs reference +- mark task as cache-resolved in metadata + +## Execution Context + +Each task receives a normalized execution context containing: + +- workspace id +- project id +- workflow run id +- task id +- actor id +- node config +- code hook content +- input references +- storage context +- temp working directory +- runtime resource limits + +This context must be available across Python, Docker, and HTTP executors. + +## Observability Requirements + +Each task must emit: + +- status transitions +- start time and finish time +- duration +- executor metadata +- resource request metadata +- stdout/stderr log stream +- structured task summary +- artifact refs + +The UI must allow: + +- graph-level run status +- node-level log inspection +- node-level artifact browsing +- task retry entrypoint +- direct navigation from a node to preview output + +## Canvas Interaction Rules + +V1 editor behavior should enforce: + +- port-level connection rules +- incompatible edge blocking +- dirty-state detection +- explicit save before publish/run if graph changed +- per-node validation badges +- run from latest saved version, not unsaved draft + +## Example V1 Pipelines + +### Delivery Normalization + +```text +Raw Folder Import + -> Archive Extract + -> Folder Rename + -> Directory Validation + -> Metadata Validation + -> Video Quality Check + -> Delivery Export +``` + +### Dataset Conversion + +```text +Rosbag Reader + -> Canonical Mapping + -> Frame Filter + -> Metadata Normalize + -> LeRobot Writer + -> Training Config Export +``` + +## V1 Non-Goals + +The V1 workflow engine does not need: + +- loop semantics +- streaming execution +- unbounded dynamic fan-out +- event-driven triggers +- advanced 
distributed DAG partitioning + +The V1 goal is a stable, observable DAG executor for data engineering workflows. diff --git a/design/04-ui-ux/.gitkeep b/design/04-ui-ux/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/04-ui-ux/.gitkeep @@ -0,0 +1 @@ + diff --git a/design/04-ui-ux/information-architecture-and-key-screens.md b/design/04-ui-ux/information-architecture-and-key-screens.md new file mode 100644 index 0000000..410e5ee --- /dev/null +++ b/design/04-ui-ux/information-architecture-and-key-screens.md @@ -0,0 +1,296 @@ +# EmboFlow Information Architecture And Key Screens + +## Goal + +Define the primary navigation model, main screens, and key interaction patterns for EmboFlow V1. + +The UI should feel like a serious data workflow product, not a generic low-code canvas. The most important interaction is the relationship between assets, workflows, runs, and outputs. + +## Information Architecture + +Top-level product areas: + +- Workspace switcher +- Project selector +- Asset Workspace +- Canvas Workspace +- Explore Workspace +- Label Workspace +- Admin Workspace + +## Navigation Model + +### Global Header + +Recommended global header content: + +- workspace switcher +- project switcher +- search entry +- run notifications +- user menu + +### Primary Sidebar + +Recommended primary navigation: + +- Assets +- Workflows +- Runs +- Explore +- Labels +- Admin + +This keeps the product model explicit: + +- assets are inputs +- workflows define transformation logic +- runs represent execution history +- explore is where users inspect outputs and raw inputs + +## Screen 1: Workspace And Project Entry + +Purpose: + +- choose personal or team workspace +- choose or create project +- view recent projects and recent workflow runs + +V1 should emphasize project-level organization because all major resources are project-scoped. 
+ +## Screen 2: Asset Workspace + +Purpose: + +- upload or import raw assets +- inspect asset type and status +- review probe summary +- launch preview or workflow entrypoint + +Core regions: + +- asset list with filters +- import actions +- asset status and source type +- probe summary card +- recommended next actions + +Key actions: + +- upload file +- upload archive +- import object storage prefix +- register storage path +- open preview +- create workflow from asset + +## Screen 3: Asset Detail / Explore Entry + +Purpose: + +- inspect one asset deeply +- browse folder structure +- inspect metadata and detected format +- preview representative files + +Suggested panels: + +- left: file tree or asset structure +- center: preview surface +- right: metadata, probe report, warnings, recommended nodes + +This screen should support both: + +- raw asset view +- canonical dataset summary view when available + +## Screen 4: Canvas Workspace + +This is the core authoring surface. + +### Layout + +Recommended layout, aligned with the Xspark reference pattern: + +- left: node library and workflow tools +- center: canvas +- right: node configuration panel + +### Left Panel + +Contains: + +- source nodes +- transform nodes +- inspect nodes +- annotate nodes +- export nodes +- utility nodes +- search/filter + +### Center Canvas + +Supports: + +- drag-and-drop node placement +- edge creation +- zoom and pan +- mini-map +- node badges for validation status +- run-state overlays when viewing an executed version + +### Right Configuration Panel + +The right panel is schema-driven. + +It should render: + +- node title +- node description +- config fields +- input/output schema summary +- executor selection +- runtime policy +- code hook editor if supported +- validation errors + +This panel is critical. It should feel like a structured system console, not a generic form dump. 
+ +## Screen 5: Workflow Run Detail + +Purpose: + +- inspect execution state +- view DAG progress +- open task logs +- inspect task outputs +- retry failed nodes + +Recommended layout: + +- top: run summary and status +- center: workflow graph with execution overlays +- bottom or side drawer: logs and artifacts for selected node + +## Screen 6: Explore Workspace + +Purpose: + +- inspect raw or processed outputs outside the canvas authoring context +- compare source and transformed outputs +- validate whether a run produced expected results + +V1 renderer set: + +- directory tree renderer +- JSON renderer +- video renderer +- dataset summary renderer +- quality report renderer + +This workspace should open from: + +- asset detail +- workflow node output +- artifact detail + +## Screen 7: Label Workspace + +Purpose: + +- process annotation tasks +- review results +- attach annotations to data outputs + +V1 should keep this lightweight: + +- frame labels +- clip labels +- temporal segment labels +- quality tags + +The label workspace should be able to open from an artifact or dataset version, not only from a workflow node. + +## Screen 8: Admin Workspace + +Purpose: + +- manage members +- manage storage connections +- manage plugin enablement +- inspect audit and runtime settings + +Suggested sections: + +- members and roles +- workspace settings +- storage connections +- plugin registry +- executor policies +- audit log viewer + +## Key UX Principles + +### 1. Separate authoring from inspection + +Do not overload the canvas with deep preview or annotation workflows. The canvas configures process. Explore and Label workspaces handle dense interaction. + +### 2. Keep lineage visible + +Users should be able to move across: + +- asset +- workflow +- run +- task +- artifact +- annotation + +without losing context. + +### 3. 
Prefer explicit system terminology + +Use consistent object names in the UI: + +- Asset +- Dataset +- Workflow +- Run +- Task +- Artifact +- Plugin + +Do not rename the same concept differently across pages. + +### 4. Make validation obvious before execution + +Before users run a workflow, the editor should visibly show: + +- missing config +- invalid schema connections +- unsupported executor choices +- permission or plugin issues + +### 5. Keep the product usable on standard screens + +The canvas and right configuration panel must work on laptop-sized displays. On narrower screens, the right panel may collapse into a drawer. + +## V1 Visual Direction + +The UI should communicate: + +- precision +- observability +- traceability +- strong operator control + +It should feel closer to a workflow control console than a consumer productivity app. + +## V1 Non-Goals + +V1 UI does not need: + +- real-time multi-user cursor collaboration +- advanced canvas commenting systems +- highly customized renderer marketplace UX +- heavy design polish ahead of workflow clarity diff --git a/design/05-data/.gitkeep b/design/05-data/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/05-data/.gitkeep @@ -0,0 +1 @@ + diff --git a/design/05-data/mongodb-data-model.md b/design/05-data/mongodb-data-model.md new file mode 100644 index 0000000..1c05940 --- /dev/null +++ b/design/05-data/mongodb-data-model.md @@ -0,0 +1,521 @@ +# EmboFlow MongoDB Data Model + +## Goal + +Define the MongoDB-only persistence model for EmboFlow V1. 
+ +The database must support: + +- user and workspace isolation +- raw asset tracking +- canonical dataset versions +- workflow versioning +- workflow execution history +- plugin registration +- auditability + +## Storage Principles + +- MongoDB stores metadata and execution state +- Object storage stores large binary files and large derived bundles +- MongoDB documents should have clear aggregate boundaries +- Large, fast-growing arrays should be split into separate collections +- Platform contracts should use references, not embedded file blobs + +## Primary Collections + +- `users` +- `workspaces` +- `projects` +- `memberships` +- `assets` +- `asset_probe_reports` +- `datasets` +- `dataset_versions` +- `workflow_definitions` +- `workflow_definition_versions` +- `workflow_runs` +- `run_tasks` +- `artifacts` +- `annotation_tasks` +- `annotations` +- `plugins` +- `storage_connections` +- `audit_logs` + +## Collection Design + +### users + +Purpose: + +- account identity +- profile +- login metadata + +Core fields: + +- `_id` +- `email` +- `displayName` +- `avatarUrl` +- `status` +- `lastLoginAt` +- `createdAt` +- `updatedAt` + +### workspaces + +Purpose: + +- resource ownership boundary + +Core fields: + +- `_id` +- `type` as `personal` or `team` +- `name` +- `slug` +- `ownerId` +- `status` +- `settings` +- `createdAt` +- `updatedAt` + +### memberships + +Purpose: + +- workspace and project role mapping + +Core fields: + +- `_id` +- `workspaceId` +- `projectId` optional +- `userId` +- `role` +- `status` +- `createdAt` +- `updatedAt` + +This collection should stay independent instead of embedding large member arrays on every resource. 
+ +### projects + +Purpose: + +- project-scoped grouping for assets, workflows, runs, and outputs + +Core fields: + +- `_id` +- `workspaceId` +- `name` +- `slug` +- `description` +- `status` +- `createdBy` +- `createdAt` +- `updatedAt` + +### assets + +Purpose: + +- represent raw uploaded or imported inputs + +Supported asset types: + +- `raw_file` +- `archive` +- `folder` +- `video_collection` +- `standard_dataset` +- `rosbag` +- `hdf5_dataset` +- `object_storage_prefix` + +Core fields: + +- `_id` +- `workspaceId` +- `projectId` +- `type` +- `sourceType` +- `displayName` +- `status` +- `storageRef` +- `sizeBytes` +- `fileCount` +- `topLevelPaths` +- `detectedFormats` +- `summary` +- `createdBy` +- `createdAt` +- `updatedAt` + +Do not embed full large file listings in this document. + +### asset_probe_reports + +Purpose: + +- retain richer structure-detection and validation output + +Core fields: + +- `_id` +- `assetId` +- `reportVersion` +- `detectedFormatCandidates` +- `structureSummary` +- `warnings` +- `recommendedNextNodes` +- `rawReport` +- `createdAt` + +### datasets + +Purpose: + +- represent logical dataset identity + +Core fields: + +- `_id` +- `workspaceId` +- `projectId` +- `name` +- `type` +- `status` +- `latestVersionId` +- `summary` +- `createdBy` +- `createdAt` +- `updatedAt` + +### dataset_versions + +Purpose: + +- represent immutable dataset snapshots + +Core fields: + +- `_id` +- `datasetId` +- `workspaceId` +- `projectId` +- `sourceAssetId` +- `parentVersionId` +- `versionTag` +- `canonicalSchemaVersion` +- `manifestRef` +- `stats` +- `summary` +- `status` +- `createdBy` +- `createdAt` + +This collection is separated because versions will grow over time. 
+ +### workflow_definitions + +Purpose: + +- represent logical workflow identity + +Core fields: + +- `_id` +- `workspaceId` +- `projectId` +- `name` +- `slug` +- `status` +- `latestVersionNumber` +- `publishedVersionNumber` +- `createdBy` +- `createdAt` +- `updatedAt` + +### workflow_definition_versions + +Purpose: + +- represent immutable workflow snapshots + +Core fields: + +- `_id` +- `workflowDefinitionId` +- `workspaceId` +- `projectId` +- `versionNumber` +- `visualGraph` +- `logicGraph` +- `runtimeGraph` +- `pluginRefs` +- `summary` +- `createdBy` +- `createdAt` + +Splitting versions from workflow head metadata avoids oversized documents and simplifies history queries. + +### workflow_runs + +Purpose: + +- store execution runs + +Core fields: + +- `_id` +- `workflowDefinitionId` +- `workflowVersionId` +- `workspaceId` +- `projectId` +- `triggeredBy` +- `status` +- `runtimeSnapshot` +- `summary` +- `startedAt` +- `finishedAt` +- `createdAt` + +### run_tasks + +Purpose: + +- store one execution unit per node per run + +Core fields: + +- `_id` +- `workflowRunId` +- `workflowVersionId` +- `nodeId` +- `nodeType` +- `status` +- `attempt` +- `executor` +- `scheduler` +- `inputRefs` +- `outputRefs` +- `logRef` +- `cacheKey` +- `cacheHit` +- `errorSummary` +- `startedAt` +- `finishedAt` +- `createdAt` + +This collection should remain separate from `workflow_runs` because task volume grows quickly. 
+ +### artifacts + +Purpose: + +- store managed outputs and previews + +Artifact types may include: + +- preview bundle +- quality report +- normalized dataset package +- delivery package +- training config package +- intermediate task output + +Core fields: + +- `_id` +- `workspaceId` +- `projectId` +- `type` +- `producerType` +- `producerId` +- `storageRef` +- `previewable` +- `summary` +- `lineage` +- `createdBy` +- `createdAt` + +### annotation_tasks + +Purpose: + +- track assignment and state of manual labeling work + +Core fields: + +- `_id` +- `workspaceId` +- `projectId` +- `targetType` +- `targetRef` +- `labelType` +- `status` +- `assigneeIds` +- `reviewerIds` +- `createdBy` +- `createdAt` +- `updatedAt` + +### annotations + +Purpose: + +- persist annotation outputs + +Core fields: + +- `_id` +- `annotationTaskId` +- `workspaceId` +- `projectId` +- `targetRef` +- `payload` +- `status` +- `createdBy` +- `createdAt` +- `updatedAt` + +### plugins + +Purpose: + +- track installable and enabled plugin versions + +Core fields: + +- `_id` +- `workspaceId` optional for workspace-scoped plugins +- `scope` as `platform` or `workspace` +- `name` +- `status` +- `currentVersion` +- `versions` +- `permissions` +- `metadata` +- `createdAt` +- `updatedAt` + +If plugin version payloads become large, split versions into a separate collection later. V1 can keep them nested if bounded. + +### storage_connections + +Purpose: + +- store object storage and path registration configuration + +Core fields: + +- `_id` +- `workspaceId` +- `type` +- `provider` +- `name` +- `status` +- `config` +- `secretRef` +- `createdBy` +- `createdAt` +- `updatedAt` + +Store secrets outside plaintext document fields where possible. 
+ +### audit_logs + +Purpose: + +- append-only history of sensitive actions + +Core fields: + +- `_id` +- `workspaceId` +- `projectId` +- `actorId` +- `resourceType` +- `resourceId` +- `action` +- `beforeSummary` +- `afterSummary` +- `metadata` +- `createdAt` + +## Reference Strategy + +Use stable ids between collections. + +References should be explicit: + +- asset to probe report +- dataset to dataset versions +- workflow definition to workflow versions +- workflow run to run tasks +- task to artifact +- annotation task to annotations + +Do not depend on implicit path-based linkage. + +## Index Recommendations + +### Always index + +- `workspaceId` +- `projectId` +- `status` +- `createdAt` + +### Important compound indexes + +- `memberships.workspaceId + memberships.userId` +- `projects.workspaceId + projects.slug` +- `assets.projectId + assets.type + assets.createdAt` +- `datasets.projectId + datasets.name` +- `dataset_versions.datasetId + dataset_versions.createdAt` +- `workflow_definitions.projectId + workflow_definitions.slug` +- `workflow_definition_versions.workflowDefinitionId + versionNumber` +- `workflow_runs.projectId + createdAt` +- `workflow_runs.workflowDefinitionId + status` +- `run_tasks.workflowRunId + nodeId` +- `artifacts.producerType + producerId` +- `annotation_tasks.projectId + status` +- `audit_logs.workspaceId + createdAt` + +## Object Storage References + +MongoDB should store references such as: + +- bucket +- key +- uri +- checksum +- content type +- size + +It should not store: + +- large binary file payloads +- full raw video content +- giant archive contents + +## V1 Constraints + +- MongoDB is the only database +- No relational sidecar is assumed +- No GridFS-first strategy is assumed +- Large manifests may live in object storage and be referenced from MongoDB + +## V1 Non-Goals + +The V1 model does not need: + +- cross-region data distribution +- advanced event sourcing +- fully normalized analytics warehouse modeling +- high-volume 
search indexing inside MongoDB itself diff --git a/design/06-api/.gitkeep b/design/06-api/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/06-api/.gitkeep @@ -0,0 +1 @@ + diff --git a/design/07-research/.gitkeep b/design/07-research/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/07-research/.gitkeep @@ -0,0 +1 @@ + diff --git a/design/08-decisions/.gitkeep b/design/08-decisions/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/08-decisions/.gitkeep @@ -0,0 +1 @@ + diff --git a/design/08-decisions/adr-0001-raw-asset-and-canonical-dataset.md b/design/08-decisions/adr-0001-raw-asset-and-canonical-dataset.md new file mode 100644 index 0000000..3622432 --- /dev/null +++ b/design/08-decisions/adr-0001-raw-asset-and-canonical-dataset.md @@ -0,0 +1,45 @@ +# ADR-0001: Separate Raw Assets From Canonical Datasets + +## Status + +Accepted + +## Context + +EmboFlow must support both structured embodied dataset formats and unstructured or semi-structured delivery-style raw assets, including: + +- RLDS +- LeRobot v2/v3 +- HDF5 +- Rosbag +- Raw video directories +- Archive packages + +If the platform treats every input as an already-standardized dataset, ingestion and delivery workflows become awkward and lossy. + +## Decision + +The platform will model: + +- Raw assets as first-class resources +- Canonical datasets as derived semantic resources + +Raw assets preserve original structure, paths, naming, and metadata layout. Canonical datasets provide normalized semantics for conversion, workflow execution, and export logic. 
+ +## Consequences + +### Positive + +- Supports customer delivery package workflows +- Supports embodied dataset conversion workflows +- Preserves original structure for inspection and debugging +- Avoids forcing visualization to depend on a lossy normalized format + +### Negative + +- Adds one more layer to the object model +- Requires readers and mappers instead of direct format-to-format conversion + +## Notes + +Visualization may operate on raw assets directly. Processing and export should primarily operate on canonical semantics where possible. diff --git a/design/08-decisions/adr-0002-executor-and-scheduler-separation.md b/design/08-decisions/adr-0002-executor-and-scheduler-separation.md new file mode 100644 index 0000000..6a3a330 --- /dev/null +++ b/design/08-decisions/adr-0002-executor-and-scheduler-separation.md @@ -0,0 +1,56 @@ +# ADR-0002: Separate Executors From Schedulers + +## Status + +Accepted + +## Context + +EmboFlow needs to support multiple runtime modes now and later: + +- direct Python execution +- Docker-isolated execution +- HTTP-based execution +- local scheduling +- future Kubernetes scheduling +- future Volcano scheduling + +If execution logic and scheduling logic are coupled together, migration from single-host operation to cluster operation becomes costly. + +## Decision + +The architecture will separate: + +- Executor: how node logic runs +- Scheduler: where and under what dispatch policy tasks run + +V1 executors: + +- Python +- Docker +- HTTP + +V1 scheduler: + +- Local + +Reserved future schedulers: + +- Kubernetes +- Volcano + +## Consequences + +### Positive + +- Cleaner evolution path +- Better runtime abstraction +- Less refactoring required for cluster migration + +### Negative + +- Slightly more abstraction in V1 than the immediate deployment requires + +## Notes + +User-injected code should default to Docker execution, while trusted platform logic may use Python execution. 
diff --git a/design/09-assets/.gitkeep b/design/09-assets/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/09-assets/.gitkeep @@ -0,0 +1 @@ + diff --git a/design/README.md b/design/README.md new file mode 100644 index 0000000..712cfdb --- /dev/null +++ b/design/README.md @@ -0,0 +1,21 @@ +# EmboFlow Design Workspace + +This directory stores project design materials before or alongside implementation. + +## Structure + +- `00-overview`: project goals, scope, milestones +- `01-product`: requirements, user stories, feature definitions +- `02-architecture`: system architecture, modules, technical constraints +- `03-workflows`: business flows, sequence diagrams, operational flows +- `04-ui-ux`: wireframes, interaction notes, UX decisions +- `05-data`: data model, entities, schema drafts +- `06-api`: API contracts, request/response drafts, integration notes +- `07-research`: competitive analysis, references, discovery notes +- `08-decisions`: ADRs and major tradeoff records +- `09-assets`: diagrams, exported images, attachments +- `templates`: reusable design document templates + +## Suggested usage + +Keep design artifacts in Markdown where possible so they diff cleanly in Git. 
diff --git a/design/templates/.gitkeep b/design/templates/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/design/templates/.gitkeep @@ -0,0 +1 @@ + diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..1f48acd --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,62 @@ +services: + web: + image: node:20-alpine + working_dir: /workspace + command: ["sh", "-c", "sleep infinity"] + ports: + - "${WEB_PORT:-3000}:3000" + volumes: + - .:/workspace + depends_on: + - api + + api: + image: node:20-alpine + working_dir: /workspace + command: ["sh", "-c", "sleep infinity"] + ports: + - "${API_PORT:-3001}:3001" + volumes: + - .:/workspace + depends_on: + - mongo + + worker: + image: node:20-alpine + working_dir: /workspace + command: ["sh", "-c", "sleep infinity"] + ports: + - "${WORKER_PORT:-3002}:3002" + volumes: + - .:/workspace + depends_on: + - mongo + - minio + + mongo: + image: mongo:7 + restart: unless-stopped + ports: + - "${MONGO_PORT:-27017}:27017" + environment: + MONGO_INITDB_ROOT_USERNAME: "${MONGO_ROOT_USERNAME:-emboflow}" + MONGO_INITDB_ROOT_PASSWORD: "${MONGO_ROOT_PASSWORD:-emboflow}" + volumes: + - mongo-data:/data/db + + minio: + image: minio/minio:RELEASE.2024-10-29T16-01-48Z + restart: unless-stopped + command: ["server", "/data", "--console-address", ":9001"] + ports: + - "${MINIO_PORT:-9000}:9000" + - "${MINIO_CONSOLE_PORT:-9001}:9001" + environment: + MINIO_ROOT_USER: "${MINIO_ROOT_USER:-emboflow}" + MINIO_ROOT_PASSWORD: "${MINIO_ROOT_PASSWORD:-emboflow123}" + volumes: + - minio-data:/data + +volumes: + mongo-data: + minio-data: diff --git a/docs/development-workflow.md b/docs/development-workflow.md new file mode 100644 index 0000000..e51f75f --- /dev/null +++ b/docs/development-workflow.md @@ -0,0 +1,96 @@ +# EmboFlow Development Workflow + +## Goal + +Keep repository design artifacts and implementation changes aligned as EmboFlow evolves. 
+ +## Working Agreement + +EmboFlow is being developed from explicit design documents under `design/`. Development should follow a doc-aware workflow instead of letting code drift ahead without recorded decisions. + +## Standard Change Flow + +### 1. Read Before Editing + +Before changing code, review the design files that define the affected area: + +- product scope +- architecture boundaries +- workflow model +- data model +- deployment model +- accepted ADRs + +### 2. Identify Impact + +Decide whether the change affects: + +- product behavior +- object model +- workflow/run/task semantics +- node or plugin contract +- storage assumptions +- user or permission behavior +- deployment/runtime assumptions + +If yes, the matching design files must be updated. + +### 3. Change Code And Docs Together + +Do not defer the design update. Treat design edits as part of the implementation, not follow-up cleanup. + +### 4. Run The Consistency Check + +From the repo root: + +```bash +python3 scripts/check_doc_code_sync.py . --strict +``` + +Interpret warnings manually. The script is a guardrail, not a replacement for judgment. + +### 5. Use The Local Hooks + +Install local hooks once per clone: + +```bash +bash scripts/install_hooks.sh +``` + +This enables: + +- `commit-msg`: require English-only gitmoji commit messages +- `pre-commit`: block staged code/config drift without doc updates +- `pre-push`: run commit-message validation, doc/code sync checks, and repository tests + +### 6. Close With Explicit Status + +Every implementation summary should state one of: + +- `Aligned` +- `Partially aligned` +- `Doc-first` + +and name the exact design files that were reviewed or updated. 
+ +## EmboFlow-Specific Review Checklist + +Before closing a non-trivial change, confirm whether any of these need updates: + +- raw asset vs canonical dataset model +- workflow definition vs workflow run model +- node schema and plugin contract +- executor vs scheduler separation +- MongoDB collection or document shape +- workspace/project/user boundary +- deployment topology or storage assumptions + +## Automation + +This repository now uses both local and remote guardrails: + +- local git hooks from `.githooks/` +- commit message validation +- CI checks in `.github/workflows/guardrails.yml` + +These checks are intended to keep design documents, code changes, and commit history coherent. diff --git a/docs/plans/2026-03-26-emboflow-v1-foundation-and-mvp.md b/docs/plans/2026-03-26-emboflow-v1-foundation-and-mvp.md new file mode 100644 index 0000000..fd9f2b0 --- /dev/null +++ b/docs/plans/2026-03-26-emboflow-v1-foundation-and-mvp.md @@ -0,0 +1,621 @@ +# EmboFlow V1 Foundation And MVP Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Build the first usable EmboFlow increment: workspace-aware raw asset ingestion, workflow definition/versioning, local workflow execution, and the first web workflow authoring surfaces. + +**Architecture:** Use a TypeScript monorepo with a React web app, a Node.js API control plane, and a separate Node.js worker. Use MongoDB as the only database, object storage abstraction for cloud storage or MinIO, and a local scheduler with Python and Docker executor contracts. 
+ +**Tech Stack:** pnpm workspace, React, TypeScript, React Flow, NestJS, Mongoose, MongoDB, Docker Compose, Python runtime hooks, unittest/Vitest/Jest-compatible project tests + +--- + +### Task 1: Bootstrap The Monorepo And Runtime Skeleton + +**Files:** +- Create: `package.json` +- Create: `pnpm-workspace.yaml` +- Create: `tsconfig.base.json` +- Create: `apps/web/package.json` +- Create: `apps/api/package.json` +- Create: `apps/worker/package.json` +- Create: `docker-compose.yml` +- Create: `.env.example` +- Test: `tests/test_repo_structure.py` + +**Step 1: Write the failing test** + +Create `tests/test_repo_structure.py` to assert the repository contains the expected top-level app folders and root workspace files. + +**Step 2: Run test to verify it fails** + +Run: + +```bash +python3 -m unittest tests/test_repo_structure.py -v +``` + +Expected: FAIL because the monorepo files and app folders do not exist yet. + +**Step 3: Write minimal implementation** + +Create the pnpm workspace root, app package manifests, root TypeScript config, `.env.example`, and `docker-compose.yml` with services for: + +- `web` +- `api` +- `worker` +- `mongo` +- `minio` + +Keep the first version minimal. Do not add extra infra services that are not required by the design. 
+ +**Step 4: Run test to verify it passes** + +Run: + +```bash +python3 -m unittest tests/test_repo_structure.py -v +``` + +Expected: PASS + +**Step 5: Commit** + +```bash +git add package.json pnpm-workspace.yaml tsconfig.base.json apps docker-compose.yml .env.example tests/test_repo_structure.py +git commit -m ":tada: bootstrap workspace and runtime skeleton" +``` + +### Task 2: Create Shared Domain Contracts And Mongo Setup + +**Files:** +- Create: `packages/contracts/package.json` +- Create: `packages/contracts/src/domain.ts` +- Create: `apps/api/src/common/mongo/mongo.module.ts` +- Create: `apps/api/src/common/mongo/schemas/workspace.schema.ts` +- Create: `apps/api/src/common/mongo/schemas/project.schema.ts` +- Create: `apps/api/src/common/mongo/schemas/asset.schema.ts` +- Create: `apps/api/src/common/mongo/schemas/workflow.schema.ts` +- Test: `apps/api/test/domain-contracts.spec.ts` + +**Step 1: Write the failing test** + +Create `apps/api/test/domain-contracts.spec.ts` asserting: + +- workspace types include `personal` and `team` +- asset types include raw and dataset-style sources +- workflow status values match the design docs + +**Step 2: Run test to verify it fails** + +Run: + +```bash +pnpm --filter api test domain-contracts.spec.ts +``` + +Expected: FAIL because contracts and schemas are missing. + +**Step 3: Write minimal implementation** + +Create shared domain enums and base Mongo schema definitions for: + +- workspaces +- projects +- assets +- workflow definitions + +Add a minimal Mongo module in the API app using environment-based connection config. 
+ +**Step 4: Run test to verify it passes** + +Run: + +```bash +pnpm --filter api test domain-contracts.spec.ts +``` + +Expected: PASS + +**Step 5: Commit** + +```bash +git add packages/contracts apps/api/src/common apps/api/test/domain-contracts.spec.ts +git commit -m ":sparkles: add shared domain contracts and mongo setup" +``` + +### Task 3: Implement Identity, Workspace, And Project APIs + +**Files:** +- Create: `apps/api/src/modules/auth/auth.module.ts` +- Create: `apps/api/src/modules/auth/auth.controller.ts` +- Create: `apps/api/src/modules/workspaces/workspaces.module.ts` +- Create: `apps/api/src/modules/workspaces/workspaces.controller.ts` +- Create: `apps/api/src/modules/projects/projects.module.ts` +- Create: `apps/api/src/modules/projects/projects.controller.ts` +- Create: `apps/api/src/modules/projects/projects.service.ts` +- Test: `apps/api/test/projects.e2e-spec.ts` + +**Step 1: Write the failing test** + +Create `apps/api/test/projects.e2e-spec.ts` covering: + +- create personal workspace bootstrap flow +- create project under a workspace +- reject project creation without a workspace id + +**Step 2: Run test to verify it fails** + +Run: + +```bash +pnpm --filter api test projects.e2e-spec.ts +``` + +Expected: FAIL because the modules and endpoints do not exist yet. + +**Step 3: Write minimal implementation** + +Implement: + +- development-safe auth stub or local auth module +- workspace creation and listing +- project creation and listing +- basic membership checks sufficient for V1 local development + +Do not build a full production auth stack before the API shape is stable. 
+ +**Step 4: Run test to verify it passes** + +Run: + +```bash +pnpm --filter api test projects.e2e-spec.ts +``` + +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/api/src/modules/auth apps/api/src/modules/workspaces apps/api/src/modules/projects apps/api/test/projects.e2e-spec.ts +git commit -m ":sparkles: add workspace and project APIs" +``` + +### Task 4: Implement Asset Ingestion, Storage Abstraction, And Probe Metadata + +**Files:** +- Create: `apps/api/src/modules/storage/storage.module.ts` +- Create: `apps/api/src/modules/storage/storage.service.ts` +- Create: `apps/api/src/modules/assets/assets.module.ts` +- Create: `apps/api/src/modules/assets/assets.controller.ts` +- Create: `apps/api/src/modules/assets/assets.service.ts` +- Create: `apps/api/src/modules/assets/probe/probe.service.ts` +- Create: `apps/api/src/common/mongo/schemas/asset-probe-report.schema.ts` +- Test: `apps/api/test/assets.e2e-spec.ts` + +**Step 1: Write the failing test** + +Create `apps/api/test/assets.e2e-spec.ts` covering: + +- register an uploaded asset record +- create a probe report for a raw asset +- return recommended next actions from probe metadata + +**Step 2: Run test to verify it fails** + +Run: + +```bash +pnpm --filter api test assets.e2e-spec.ts +``` + +Expected: FAIL because asset ingestion and probe services are missing. + +**Step 3: Write minimal implementation** + +Implement: + +- storage abstraction interface +- MinIO/S3-compatible config contract +- asset create/list/detail endpoints +- probe-report persistence +- placeholder probe logic for directory and archive summaries + +Do not build full binary upload optimization yet. First make the metadata contract stable. 
+ +**Step 4: Run test to verify it passes** + +Run: + +```bash +pnpm --filter api test assets.e2e-spec.ts +``` + +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/api/src/modules/storage apps/api/src/modules/assets apps/api/src/common/mongo/schemas/asset-probe-report.schema.ts apps/api/test/assets.e2e-spec.ts +git commit -m ":truck: add asset ingestion and probe metadata flow" +``` + +### Task 5: Implement Workflow Definitions, Versions, Runs, And Tasks + +**Files:** +- Create: `apps/api/src/modules/workflows/workflows.module.ts` +- Create: `apps/api/src/modules/workflows/workflows.controller.ts` +- Create: `apps/api/src/modules/workflows/workflows.service.ts` +- Create: `apps/api/src/modules/runs/runs.module.ts` +- Create: `apps/api/src/modules/runs/runs.controller.ts` +- Create: `apps/api/src/modules/runs/runs.service.ts` +- Create: `apps/api/src/common/mongo/schemas/workflow-definition-version.schema.ts` +- Create: `apps/api/src/common/mongo/schemas/workflow-run.schema.ts` +- Create: `apps/api/src/common/mongo/schemas/run-task.schema.ts` +- Test: `apps/api/test/workflow-runs.e2e-spec.ts` + +**Step 1: Write the failing test** + +Create `apps/api/test/workflow-runs.e2e-spec.ts` covering: + +- create workflow definition +- save workflow version +- create workflow run from saved version +- generate initial run tasks for ready nodes + +**Step 2: Run test to verify it fails** + +Run: + +```bash +pnpm --filter api test workflow-runs.e2e-spec.ts +``` + +Expected: FAIL because workflow versioning and run creation do not exist yet. + +**Step 3: Write minimal implementation** + +Implement: + +- workflow definition head record +- immutable workflow version snapshots +- run creation from a workflow version +- initial DAG compilation for simple source-to-transform chains +- run task persistence + +Keep V1 graph compilation simple. Support sequential edges first, then one-level branching. 
+ +**Step 4: Run test to verify it passes** + +Run: + +```bash +pnpm --filter api test workflow-runs.e2e-spec.ts +``` + +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/api/src/modules/workflows apps/api/src/modules/runs apps/api/src/common/mongo/schemas/workflow-definition-version.schema.ts apps/api/src/common/mongo/schemas/workflow-run.schema.ts apps/api/src/common/mongo/schemas/run-task.schema.ts apps/api/test/workflow-runs.e2e-spec.ts +git commit -m ":sparkles: add workflow versioning and run records" +``` + +### Task 6: Add The Worker, Local Scheduler, And Executor Contracts + +**Files:** +- Create: `apps/worker/src/main.ts` +- Create: `apps/worker/src/runner/task-runner.ts` +- Create: `apps/worker/src/scheduler/local-scheduler.ts` +- Create: `apps/worker/src/executors/python-executor.ts` +- Create: `apps/worker/src/executors/docker-executor.ts` +- Create: `apps/worker/src/executors/http-executor.ts` +- Create: `apps/worker/src/contracts/execution-context.ts` +- Test: `apps/worker/test/task-runner.spec.ts` + +**Step 1: Write the failing test** + +Create `apps/worker/test/task-runner.spec.ts` covering: + +- worker loads pending tasks +- worker marks task running then success +- worker chooses executor based on node runtime config + +**Step 2: Run test to verify it fails** + +Run: + +```bash +pnpm --filter worker test task-runner.spec.ts +``` + +Expected: FAIL because the worker runtime does not exist yet. + +**Step 3: Write minimal implementation** + +Implement: + +- worker bootstrap +- polling or queue-backed local scheduler +- execution context builder +- stub Python, Docker, and HTTP executors +- task status transitions + +Do not implement full Docker isolation logic in one step. First lock the runtime interfaces and transitions. 
+ +**Step 4: Run test to verify it passes** + +Run: + +```bash +pnpm --filter worker test task-runner.spec.ts +``` + +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/worker apps/api/src/modules/runs apps/worker/test/task-runner.spec.ts +git commit -m ":construction_worker: add local worker and executor contracts" +``` + +### Task 7: Build The Web Shell, Workspace Flow, And Asset Workspace + +**Files:** +- Create: `apps/web/src/main.tsx` +- Create: `apps/web/src/app/router.tsx` +- Create: `apps/web/src/features/layout/app-shell.tsx` +- Create: `apps/web/src/features/workspaces/workspace-switcher.tsx` +- Create: `apps/web/src/features/projects/project-selector.tsx` +- Create: `apps/web/src/features/assets/assets-page.tsx` +- Create: `apps/web/src/features/assets/asset-detail-page.tsx` +- Create: `apps/web/src/features/assets/components/asset-list.tsx` +- Create: `apps/web/src/features/assets/components/asset-summary-panel.tsx` +- Test: `apps/web/src/features/assets/assets-page.test.tsx` + +**Step 1: Write the failing test** + +Create `apps/web/src/features/assets/assets-page.test.tsx` covering: + +- app shell renders primary navigation +- assets page renders asset rows from API data +- asset detail page renders probe summary + +**Step 2: Run test to verify it fails** + +Run: + +```bash +pnpm --filter web test assets-page.test.tsx +``` + +Expected: FAIL because the web app shell and pages do not exist yet. + +**Step 3: Write minimal implementation** + +Implement: + +- web app bootstrap +- primary navigation matching the design docs +- workspace/project header controls +- asset list page +- asset detail page with summary and action buttons + +Defer advanced preview renderers. Start with structured metadata and simple detail views. 
+ +**Step 4: Run test to verify it passes** + +Run: + +```bash +pnpm --filter web test assets-page.test.tsx +``` + +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/web apps/web/src/features/assets/assets-page.test.tsx +git commit -m ":sparkles: add web shell and asset workspace" +``` + +### Task 8: Build Canvas Authoring, Run Detail, And First Workflow Actions + +**Files:** +- Create: `apps/web/src/features/workflows/workflows-page.tsx` +- Create: `apps/web/src/features/workflows/workflow-editor-page.tsx` +- Create: `apps/web/src/features/workflows/components/node-library.tsx` +- Create: `apps/web/src/features/workflows/components/workflow-canvas.tsx` +- Create: `apps/web/src/features/workflows/components/node-config-panel.tsx` +- Create: `apps/web/src/features/runs/run-detail-page.tsx` +- Create: `apps/web/src/features/runs/components/run-graph-view.tsx` +- Create: `apps/web/src/features/runs/components/task-log-panel.tsx` +- Test: `apps/web/src/features/workflows/workflow-editor-page.test.tsx` + +**Step 1: Write the failing test** + +Create `apps/web/src/features/workflows/workflow-editor-page.test.tsx` covering: + +- node library renders categories +- node config panel opens when a node is selected +- run detail view shows node status badges from run data + +**Step 2: Run test to verify it fails** + +Run: + +```bash +pnpm --filter web test workflow-editor-page.test.tsx +``` + +Expected: FAIL because the workflow editor and run detail pages do not exist yet. + +**Step 3: Write minimal implementation** + +Implement: + +- workflow list page +- workflow editor page using React Flow +- left node library, center canvas, right config panel +- save workflow version action +- trigger workflow run action +- run detail page with graph and selected-node log panel + +Keep the first editor scoped to V1 node categories and schema-driven config rendering. 
+ +**Step 4: Run test to verify it passes** + +Run: + +```bash +pnpm --filter web test workflow-editor-page.test.tsx +``` + +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/web/src/features/workflows apps/web/src/features/runs apps/web/src/features/workflows/workflow-editor-page.test.tsx +git commit -m ":sparkles: add canvas workflow editor and run detail pages" +``` + +### Task 9: Add Preview Surface, Delivery Nodes, And MVP Integration + +**Files:** +- Create: `apps/api/src/modules/artifacts/artifacts.module.ts` +- Create: `apps/api/src/modules/artifacts/artifacts.controller.ts` +- Create: `apps/api/src/modules/artifacts/artifacts.service.ts` +- Create: `apps/web/src/features/explore/explore-page.tsx` +- Create: `apps/web/src/features/explore/renderers/json-renderer.tsx` +- Create: `apps/web/src/features/explore/renderers/video-renderer.tsx` +- Create: `apps/web/src/features/explore/renderers/directory-renderer.tsx` +- Create: `apps/api/src/modules/plugins/builtin/delivery-nodes.ts` +- Test: `apps/api/test/artifacts.e2e-spec.ts` +- Test: `apps/web/src/features/explore/explore-page.test.tsx` + +**Step 1: Write the failing tests** + +Create: + +- `apps/api/test/artifacts.e2e-spec.ts` for artifact retrieval by producer +- `apps/web/src/features/explore/explore-page.test.tsx` for opening and rendering supported artifact types + +**Step 2: Run tests to verify they fail** + +Run: + +```bash +pnpm --filter api test artifacts.e2e-spec.ts +pnpm --filter web test explore-page.test.tsx +``` + +Expected: FAIL because artifact APIs and explore renderers do not exist yet. + +**Step 3: Write minimal implementation** + +Implement: + +- artifact module and lookup endpoints +- explore page +- JSON, directory, and video renderers +- built-in delivery-normalization node definitions for the V1 business path + +Do not implement the full renderer plugin platform yet. Start with built-ins and stable renderer contracts. 
+ +**Step 4: Run tests to verify they pass** + +Run: + +```bash +pnpm --filter api test artifacts.e2e-spec.ts +pnpm --filter web test explore-page.test.tsx +``` + +Expected: PASS + +**Step 5: Commit** + +```bash +git add apps/api/src/modules/artifacts apps/api/src/modules/plugins/builtin/delivery-nodes.ts apps/api/test/artifacts.e2e-spec.ts apps/web/src/features/explore apps/web/src/features/explore/explore-page.test.tsx +git commit -m ":package: add explore surface and delivery artifacts" +``` + +### Task 10: Harden Guardrails, Docs, And Developer Entry Commands + +**Files:** +- Modify: `CONTRIBUTING.md` +- Modify: `docs/development-workflow.md` +- Modify: `design/03-workflows/workflow-execution-model.md` +- Modify: `design/05-data/mongodb-data-model.md` +- Create: `Makefile` +- Create: `README.md` +- Test: `tests/test_dev_commands.py` + +**Step 1: Write the failing test** + +Create `tests/test_dev_commands.py` asserting: + +- `Makefile` exposes expected local commands +- `README.md` documents bootstrap, hooks, test, and local run commands + +**Step 2: Run test to verify it fails** + +Run: + +```bash +python3 -m unittest tests/test_dev_commands.py -v +``` + +Expected: FAIL because developer entry commands are not documented yet. + +**Step 3: Write minimal implementation** + +Add: + +- `make bootstrap` +- `make test` +- `make dev-api` +- `make dev-web` +- `make dev-worker` +- `make guardrails` + +Document the developer flow in `README.md` and update design docs if implementation decisions changed during Tasks 1-9. 
+ +**Step 4: Run test to verify it passes** + +Run: + +```bash +python3 -m unittest tests/test_dev_commands.py -v +``` + +Expected: PASS + +**Step 5: Commit** + +```bash +git add CONTRIBUTING.md docs/development-workflow.md design/03-workflows/workflow-execution-model.md design/05-data/mongodb-data-model.md Makefile README.md tests/test_dev_commands.py +git commit -m ":memo: add developer entry commands and bootstrap docs" +``` + +## Exit Criteria + +The first implementation pass is complete when: + +- a user can create a workspace and project +- a raw asset can be registered and probed +- a workflow can be created, versioned, and executed locally +- run tasks produce observable status and artifacts +- the web app exposes assets, workflows, runs, and basic explore views +- guardrails for docs, hooks, commit messages, and CI remain green + +## Notes + +- Keep commits small and use the repository gitmoji + English commit policy. +- Update design files in the same task where behavior or architecture changes. +- Do not add training execution before the V1 data workflow loop is stable. 
diff --git a/package.json b/package.json new file mode 100644 index 0000000..41a3dd1 --- /dev/null +++ b/package.json @@ -0,0 +1,9 @@ +{ + "name": "emboflow", + "private": true, + "version": "0.1.0", + "packageManager": "pnpm@9.12.3", + "scripts": { + "test": "python3 -m unittest discover -s tests -p 'test_*.py'" + } +} diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml new file mode 100644 index 0000000..3ff5faa --- /dev/null +++ b/pnpm-workspace.yaml @@ -0,0 +1,3 @@ +packages: + - "apps/*" + - "packages/*" diff --git a/scripts/check_commit_message.py b/scripts/check_commit_message.py new file mode 100755 index 0000000..2ea53de --- /dev/null +++ b/scripts/check_commit_message.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +import argparse +import re +import subprocess +import sys +from pathlib import Path + + +SHORTCODE_PREFIX = re.compile(r"^:[a-z0-9_+-]+:\s+") +EMOJI_PREFIX = re.compile(r"^[\u2600-\u27BF\U0001F300-\U0001FAFF]\s+") + + +def strip_prefix(message: str) -> str: + if SHORTCODE_PREFIX.match(message): + return SHORTCODE_PREFIX.sub("", message, count=1) + if EMOJI_PREFIX.match(message): + return EMOJI_PREFIX.sub("", message, count=1) + return message + + +def validate_message(message: str) -> list[str]: + lines = [line.rstrip("\n") for line in message.splitlines()] + cleaned_lines = [line for line in lines if line and not line.startswith("#")] + if not cleaned_lines: + return ["Commit message must not be empty."] + + subject = cleaned_lines[0] + errors: list[str] = [] + + if not SHORTCODE_PREFIX.match(subject) and not EMOJI_PREFIX.match(subject): + errors.append("Commit subject must start with a gitmoji shortcode or emoji.") + + body = "\n".join(cleaned_lines) + normalized = strip_prefix(subject) + ("\n" + "\n".join(cleaned_lines[1:]) if len(cleaned_lines) > 1 else "") + + try: + normalized.encode("ascii") + except UnicodeEncodeError: + errors.append("Commit message must be written in English ASCII text after the gitmoji prefix.") + + if not 
strip_prefix(subject).strip(): + errors.append("Commit subject must include an English summary after the gitmoji prefix.") + + if re.search(r"[\u4e00-\u9fff]", body): + errors.append("Commit message must not contain Chinese characters.") + + return errors + + +def read_message_file(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def run_git(*args: str) -> list[str]: + result = subprocess.run( + ["git", *args], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or "git command failed") + return [line.strip() for line in result.stdout.splitlines() if line.strip()] + + +def commit_messages_from_range(rev_range: str) -> list[tuple[str, str]]: + if ".." in rev_range: + shas = run_git("rev-list", rev_range) + else: + shas = [rev_range] + + messages: list[tuple[str, str]] = [] + for sha in shas: + message = subprocess.run( + ["git", "log", "--format=%B", "-n", "1", sha], + capture_output=True, + text=True, + check=False, + ) + if message.returncode != 0: + raise RuntimeError(message.stderr.strip() or "git log failed") + messages.append((sha, message.stdout.strip())) + return messages + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Validate commit message format.") + parser.add_argument("--file", help="path to commit message file") + parser.add_argument("--rev-range", help="git revision range or single commit") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + + if bool(args.file) == bool(args.rev_range): + print("Use exactly one of --file or --rev-range.") + return 2 + + failures: list[str] = [] + + if args.file: + message = read_message_file(Path(args.file)) + errors = validate_message(message) + if errors: + failures.extend(errors) + else: + for sha, message in commit_messages_from_range(args.rev_range): + errors = validate_message(message) + for error in errors: + failures.append(f"{sha[:12]}: {error}") + 
+
+    if failures:
+        print("Commit message validation failed:")
+        for failure in failures:
+            print(f" - {failure}")
+        print("\nExpected format example:")
+        print(" :sparkles: add hook templates and CI guardrails")
+        return 1
+
+    print("Commit message validation passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/check_doc_code_sync.py b/scripts/check_doc_code_sync.py
new file mode 100755
index 0000000..7fdcaec
--- /dev/null
+++ b/scripts/check_doc_code_sync.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+
+DOC_PATTERNS = (
+    "design/",
+    "docs/",
+    "adr",
+    "architecture",
+    "prd",
+    "spec",
+    "plan",
+)
+
+CODE_SUFFIXES = {
+    ".py",
+    ".ts",
+    ".tsx",
+    ".js",
+    ".jsx",
+    ".java",
+    ".go",
+    ".rs",
+    ".rb",
+    ".php",
+    ".kt",
+    ".swift",
+    ".scala",
+    ".sh",
+}
+
+CODE_HINTS = ("apps/", "packages/", "scripts/")
+TEST_HINTS = ("test", "spec", "__tests__", "tests/")
+CONFIG_SUFFIXES = {".yml", ".yaml", ".json", ".toml", ".ini", ".env"}
+CONFIG_HINTS = ("docker", "compose", "k8s", "helm", "terraform", ".github/", ".githooks/", ".env")
+
+
+def run_git(repo: Path, *args: str) -> list[str]:
+    result = subprocess.run(
+        ["git", "-C", str(repo), *args],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    if result.returncode != 0:
+        raise RuntimeError(result.stderr.strip() or "git command failed")
+    return [line.strip() for line in result.stdout.splitlines() if line.strip()]
+
+
+def classify(path_text: str) -> str:
+    lower = path_text.lower()
+    path = Path(path_text)
+
+    # Check unambiguous doc locations and markdown first, then test hints,
+    # then the looser DOC_PATTERNS tokens: otherwise "spec"/"plan" tokens
+    # would classify files such as assets.e2e-spec.ts as docs.
+    if path.suffix == ".md" or lower.startswith(("design/", "docs/")):
+        return "docs"
+    if any(token in lower for token in TEST_HINTS):
+        return "tests"
+    if any(token in lower for token in DOC_PATTERNS):
+        return "docs"
+    if any(token in lower for token in CODE_HINTS):
+        return "code"
+    if path.suffix in CODE_SUFFIXES:
+        return "code"
+    if path.suffix in CONFIG_SUFFIXES or any(token in lower for token in CONFIG_HINTS):
+        return "config"
+
return "other" + + +def print_group(title: str, items: list[str]) -> None: + print(f"\n{title}:") + if not items: + print(" - none") + return + for item in items: + print(f" - {item}") + + +def assess_changes( + docs: list[str], + code: list[str], + tests: list[str], + config: list[str], + other: list[str], + strict: bool, +) -> dict: + warnings: list[str] = [] + blockers: list[str] = [] + + if code and not docs: + message = "Code changed but no design/doc files changed." + warnings.append(message) + if strict: + blockers.append(message) + if config and not docs: + message = "Config or deployment files changed without any doc updates." + warnings.append(message) + if strict: + blockers.append(message) + if docs and not code and not config and not tests: + warnings.append( + "Docs changed without code changes. This may be intentional, but verify they still match the repository." + ) + if code and not tests: + warnings.append( + "Code changed without any test-file changes. Verify whether tests should change." + ) + if other: + warnings.append( + "Unclassified files changed. Confirm they do not affect documented behavior or runtime assumptions." + ) + + return { + "warnings": warnings, + "blockers": blockers, + "blocking": bool(blockers), + } + + +def collect_paths(repo: Path, args: argparse.Namespace) -> list[str]: + if args.staged: + return run_git(repo, "diff", "--cached", "--name-only", "--diff-filter=ACMR") + if args.base_ref: + return run_git(repo, "diff", "--name-only", "--diff-filter=ACMR", f"{args.base_ref}...HEAD") + if args.rev_range: + if ".." 
in args.rev_range:
+            return run_git(repo, "diff", "--name-only", "--diff-filter=ACMR", args.rev_range)
+        return run_git(repo, "diff-tree", "--no-commit-id", "--name-only", "-r", args.rev_range)
+
+    changed = run_git(repo, "status", "--short")
+    # git status --short prefixes each entry with a two-character XY status
+    # column. run_git() strips leading whitespace, so a fixed line[3:] slice
+    # truncates filenames for worktree-only states such as " M path".
+    # Splitting on the first whitespace run keeps the full path instead.
+    return sorted({line.split(maxsplit=1)[1] for line in changed if " " in line})
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Check whether doc changes track code changes.")
+    parser.add_argument("repo", nargs="?", default=".", help="git repository path")
+    parser.add_argument("--strict", action="store_true", help="fail on blocking drift")
+    parser.add_argument("--staged", action="store_true", help="inspect staged files only")
+    parser.add_argument("--base-ref", help="compare changes from base ref to HEAD")
+    parser.add_argument("--rev-range", help="inspect a git revision range or a single commit")
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    repo = Path(args.repo).expanduser().resolve()
+
+    if not (repo / ".git").exists():
+        print(f"Not a git repository: {repo}")
+        return 2
+
+    paths = sorted(set(collect_paths(repo, args)))
+
+    docs = [p for p in paths if classify(p) == "docs"]
+    code = [p for p in paths if classify(p) == "code"]
+    tests = [p for p in paths if classify(p) == "tests"]
+    config = [p for p in paths if classify(p) == "config"]
+    other = [p for p in paths if classify(p) == "other"]
+    assessment = assess_changes(docs, code, tests, config, other, args.strict)
+
+    print(f"Repository: {repo}")
+    print(f"Changed files: {len(paths)}")
+    print_group("Design and doc files", docs)
+    print_group("Code files", code)
+    print_group("Test files", tests)
+    print_group("Config and infra files", config)
+    print_group("Other files", other)
+
+    print("\nAssessment:")
+    if not assessment["warnings"]:
+        print(" - No obvious doc/code drift detected from changed-file classification.")
+    else:
+        for warning in assessment["warnings"]:
+            print(f" - {warning}")
+
print("\nNext actions:") + if code and not docs: + print(" - Review design/ or docs/ and update affected architecture, workflow, or API notes.") + if docs: + print(" - Confirm each changed doc still matches the actual implementation.") + if code: + print(" - Confirm changed code paths match documented workflow, schema, and runtime assumptions.") + if other: + print(" - Review unclassified paths and decide whether docs or tests should be updated.") + + if assessment["blocking"]: + print("\nResult: blocking drift detected.") + return 1 + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/install_hooks.sh b/scripts/install_hooks.sh new file mode 100644 index 0000000..7b322a9 --- /dev/null +++ b/scripts/install_hooks.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +git -C "$repo_root" config core.hooksPath .githooks +chmod +x "$repo_root"/.githooks/* +chmod +x "$repo_root"/scripts/check_doc_code_sync.py +chmod +x "$repo_root"/scripts/check_commit_message.py + +echo "Installed local git hooks from .githooks" +echo "Active hooks path: $(git -C "$repo_root" config core.hooksPath)" diff --git a/tests/test_commit_message.py b/tests/test_commit_message.py new file mode 100644 index 0000000..41444f4 --- /dev/null +++ b/tests/test_commit_message.py @@ -0,0 +1,40 @@ +import importlib.util +from pathlib import Path +import unittest + + +def load_module(module_name: str, path: Path): + spec = importlib.util.spec_from_file_location(module_name, path) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +REPO_ROOT = Path(__file__).resolve().parents[1] +MODULE = load_module( + "check_commit_message", + REPO_ROOT / "scripts" / "check_commit_message.py", +) + + +class CommitMessageValidationTests(unittest.TestCase): + def test_accepts_gitmoji_shortcode_with_english_message(self): + 
errors = MODULE.validate_message(":sparkles: add local hook templates") + self.assertEqual(errors, []) + + def test_accepts_unicode_gitmoji_with_english_message(self): + errors = MODULE.validate_message("✨ add ci validation for hooks") + self.assertEqual(errors, []) + + def test_rejects_message_without_gitmoji_prefix(self): + errors = MODULE.validate_message("add local hook templates") + self.assertTrue(any("gitmoji" in error.lower() for error in errors)) + + def test_rejects_non_english_message(self): + errors = MODULE.validate_message(":sparkles: 添加本地 hook") + self.assertTrue(any("english" in error.lower() for error in errors)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_doc_code_sync.py b/tests/test_doc_code_sync.py new file mode 100644 index 0000000..3af7a39 --- /dev/null +++ b/tests/test_doc_code_sync.py @@ -0,0 +1,55 @@ +import importlib.util +from pathlib import Path +import unittest + + +def load_module(module_name: str, path: Path): + spec = importlib.util.spec_from_file_location(module_name, path) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +REPO_ROOT = Path(__file__).resolve().parents[1] +MODULE = load_module( + "check_doc_code_sync", + REPO_ROOT / "scripts" / "check_doc_code_sync.py", +) + + +class DocCodeSyncAssessmentTests(unittest.TestCase): + def test_classifies_python_scripts_as_code(self): + self.assertEqual(MODULE.classify("scripts/check_doc_code_sync.py"), "code") + + def test_classifies_app_paths_as_code(self): + self.assertEqual(MODULE.classify("apps/web/package.json"), "code") + + def test_classifies_env_example_as_config(self): + self.assertEqual(MODULE.classify(".env.example"), "config") + + def test_strict_mode_blocks_code_without_doc_updates(self): + assessment = MODULE.assess_changes( + docs=[], + code=["src/app.ts"], + tests=[], + config=[], + other=[], + strict=True, + ) + self.assertTrue(assessment["blocking"]) + + 
def test_doc_and_code_changes_together_do_not_block(self): + assessment = MODULE.assess_changes( + docs=["design/02-architecture/system-architecture.md"], + code=["src/app.ts"], + tests=[], + config=[], + other=[], + strict=True, + ) + self.assertFalse(assessment["blocking"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_repo_structure.py b/tests/test_repo_structure.py new file mode 100644 index 0000000..69fa382 --- /dev/null +++ b/tests/test_repo_structure.py @@ -0,0 +1,35 @@ +from pathlib import Path +import unittest + + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +class RepoStructureTests(unittest.TestCase): + def test_root_workspace_files_exist(self): + required_files = [ + "package.json", + "pnpm-workspace.yaml", + "tsconfig.base.json", + "docker-compose.yml", + ".env.example", + ] + + for relative_path in required_files: + with self.subTest(path=relative_path): + self.assertTrue((REPO_ROOT / relative_path).is_file()) + + def test_app_package_manifests_exist(self): + required_files = [ + "apps/web/package.json", + "apps/api/package.json", + "apps/worker/package.json", + ] + + for relative_path in required_files: + with self.subTest(path=relative_path): + self.assertTrue((REPO_ROOT / relative_path).is_file()) + + +if __name__ == "__main__": + unittest.main() diff --git a/tsconfig.base.json b/tsconfig.base.json new file mode 100644 index 0000000..228a29e --- /dev/null +++ b/tsconfig.base.json @@ -0,0 +1,12 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "Bundler", + "strict": true, + "esModuleInterop": true, + "resolveJsonModule": true, + "skipLibCheck": true, + "baseUrl": "." + } +}