From 44d61ad4bf3ad31b86b6e91521f8f438bdbcb6d2 Mon Sep 17 00:00:00 2001
From: Songbird
Date: Fri, 7 Nov 2025 17:29:37 +0100
Subject: [PATCH] Store Cognee datasets in the projects bucket

---
 CHANGELOG.md                     |  2 +-
 README.md                        |  2 +-
 cli/src/fuzzforge_cli/config.py  | 20 ++++++++++++++------
 docker/docker-compose.cognee.yml |  5 +----
 docs/docs/ai/a2a-services.md     |  2 +-
 docs/docs/ai/architecture.md     |  2 +-
 docs/docs/ai/ingestion.md        |  2 +-
 docs/docs/how-to/docker-setup.md |  2 +-
 volumes/env/.env.template        |  4 ++--
 9 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9e6c455..4b3ddc8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### ✨ Enhancements
 - Added Ladybug-backed Cognee integration with optional MinIO/S3 storage. Projects can now set `COGNEE_STORAGE_BACKEND=s3` (plus the `COGNEE_S3_*` settings) to keep knowledge graphs in the shared MinIO bucket seeded by `docker-compose`, enabling multi-tenant ingestion across workers and containers.
 - Introduced a dedicated Cognee service (`docker/docker-compose.cognee.yml`) and HTTP client so `fuzzforge ingest` streams data to the shared backend (`COGNEE_SERVICE_URL`) instead of importing Cognee locally. Each project now auto-provisions its own Cognee account/tenant and authenticates via the REST API, keeping datasets isolated even though the service is shared.
-- Added an event-driven ingestion pipeline: MinIO publishes `PUT` events from `s3://cognee/projects//...` to RabbitMQ, and the new `ingestion-dispatcher` container downloads the file, logs into Cognee as that project’s tenant, and invokes `/api/v1/add` + `/api/v1/cognify`. Uploading files (rsync, CI, etc.) now keeps datasets fresh without touching the CLI.
+- Added an event-driven ingestion pipeline: MinIO publishes `PUT` events from `s3://projects//...` to RabbitMQ, and the new `ingestion-dispatcher` container downloads the file, logs into Cognee as that project’s tenant, and invokes `/api/v1/add` + `/api/v1/cognify`. Uploading files (rsync, CI, etc.) now keeps datasets fresh without touching the CLI.
 
 ### 📝 Documentation
 - Added comprehensive worker startup documentation across all guides
diff --git a/README.md b/README.md
index 43ca9ff..5a6919d 100644
--- a/README.md
+++ b/README.md
@@ -197,7 +197,7 @@ ff workflow run security_assessment . # Start workflow - CLI uploads files au
 Uploading files into MinIO automatically streams them into Cognee:
 
 ```
-s3://cognee/projects//
+s3://projects//
   files/...     # → _codebase dataset
   findings/...  # → _findings dataset
   docs/...      # → _docs dataset
diff --git a/cli/src/fuzzforge_cli/config.py b/cli/src/fuzzforge_cli/config.py
index 680490d..adb0065 100644
--- a/cli/src/fuzzforge_cli/config.py
+++ b/cli/src/fuzzforge_cli/config.py
@@ -138,7 +138,7 @@ class CogneeConfig(BaseModel):
     service_password: Optional[str] = None
     storage_backend: Literal["local", "s3"] = "s3"
     s3_bucket: Optional[str] = None
-    s3_prefix: Optional[str] = "projects"
+    s3_prefix: Optional[str] = None
     s3_endpoint_url: Optional[str] = None
     s3_region: Optional[str] = None
     s3_access_key: Optional[str] = None
@@ -217,8 +217,12 @@ class FuzzForgeConfig(BaseModel):
             cognee.service_url = "http://localhost:18000"
             changed = True
 
-        if not cognee.s3_prefix:
-            cognee.s3_prefix = "projects"
+        if not cognee.s3_bucket:
+            cognee.s3_bucket = "projects"
+            changed = True
+
+        if cognee.s3_prefix is None:
+            cognee.s3_prefix = ""
             changed = True
 
         default_email = f"project_{self.project.id}@fuzzforge.dev"
@@ -234,9 +238,13 @@
             changed = True
 
         if cognee.storage_backend.lower() == "s3":
-            bucket = cognee.s3_bucket or "cognee"
-            prefix = (cognee.s3_prefix or "projects").strip("/")
-            base_uri = f"s3://{bucket}/{prefix}/{self.project.id}"
+            bucket = cognee.s3_bucket or "projects"
+            prefix = (cognee.s3_prefix or "").strip("/")
+            path_parts = [f"s3://{bucket}"]
+            if prefix:
+                path_parts.append(prefix)
+            path_parts.append(self.project.id)
+            base_uri = "/".join(path_parts)
             data_dir = f"{base_uri}/files"
             system_dir = f"{base_uri}/graph"
         else:
diff --git a/docker/docker-compose.cognee.yml b/docker/docker-compose.cognee.yml
index 7feaed2..e74e5e5 100644
--- a/docker/docker-compose.cognee.yml
+++ b/docker/docker-compose.cognee.yml
@@ -12,10 +12,7 @@ services:
       GRAPH_DATABASE_PROVIDER: ladybug
       ENABLE_BACKEND_ACCESS_CONTROL: "true"
       STORAGE_BACKEND: s3
-      STORAGE_BUCKET_NAME: ${COGNEE_S3_BUCKET:-cognee}
-      DATA_ROOT_DIRECTORY: s3://${COGNEE_S3_BUCKET:-cognee}/${COGNEE_S3_PREFIX:-projects}
-      SYSTEM_ROOT_DIRECTORY: s3://${COGNEE_S3_BUCKET:-cognee}/${COGNEE_S3_PREFIX:-projects}
-      CACHE_ROOT_DIRECTORY: s3://${COGNEE_S3_BUCKET:-cognee}/${COGNEE_S3_PREFIX:-projects}/cache
+      STORAGE_BUCKET_NAME: ${COGNEE_S3_BUCKET:-projects}
       DB_PROVIDER: sqlite
       DB_PATH: /data/relational
       DB_NAME: cognee.db
diff --git a/docs/docs/ai/a2a-services.md b/docs/docs/ai/a2a-services.md
index ea43d90..dbcc6e7 100644
--- a/docs/docs/ai/a2a-services.md
+++ b/docs/docs/ai/a2a-services.md
@@ -12,7 +12,7 @@ Run the command from a project directory that already contains `.fuzzforge/`. Th
 **Default directories / services**
 - Logs: `.fuzzforge/logs/cognee.log`
-- Cognee datasets: hosted by the shared Cognee service (`COGNEE_SERVICE_URL`) inside the configured MinIO/S3 bucket. Local mode falls back to `.fuzzforge/cognee/project_/{data,system}`. Uploads dropped into `s3://cognee/projects//...` are ingested automatically via RabbitMQ + the dispatcher.
+- Cognee datasets: hosted by the shared Cognee service (`COGNEE_SERVICE_URL`) inside the configured MinIO/S3 bucket. Local mode falls back to `.fuzzforge/cognee/project_/{data,system}`. Uploads dropped into `s3://projects//...` are ingested automatically via RabbitMQ + the dispatcher.
 - Artifact cache: `.fuzzforge/artifacts`
 
 ## HTTP Endpoints
diff --git a/docs/docs/ai/architecture.md b/docs/docs/ai/architecture.md
index 28d5643..24b4324 100644
--- a/docs/docs/ai/architecture.md
+++ b/docs/docs/ai/architecture.md
@@ -92,7 +92,7 @@ The CLI surface mirrors these helpers as natural-language prompts (`You> run fuz
 ## Knowledge & Ingestion
 
-- The `fuzzforge ingest` and `fuzzforge rag ingest` commands call into `ai/src/fuzzforge_ai/ingest_utils.py`, which filters file types, ignores caches, and streams files to the Cognee service where they are stored under `s3:////project_/`. When files land directly in `s3://cognee/projects///...`, the dispatcher performs the same workflow automatically via RabbitMQ events.
+- The `fuzzforge ingest` and `fuzzforge rag ingest` commands call into `ai/src/fuzzforge_ai/ingest_utils.py`, which filters file types, ignores caches, and streams files to the Cognee service where they are stored under `s3://projects//`. When files land directly in `s3://projects///...`, the dispatcher performs the same workflow automatically via RabbitMQ events.
 - Runtime queries hit `query_project_knowledge_api` on the executor, which defers to `cognee_service` for dataset lookup and semantic search. When Cognee credentials are absent the tools return a friendly "not configured" response.
 
 ## Artifact Pipeline
diff --git a/docs/docs/ai/ingestion.md b/docs/docs/ai/ingestion.md
index ed5c74f..364c23f 100644
--- a/docs/docs/ai/ingestion.md
+++ b/docs/docs/ai/ingestion.md
@@ -102,7 +102,7 @@ Add comments or project-specific overrides as needed; the agent reads these vari
 ## Event-Driven Ingestion
 
-Uploading files directly into MinIO triggers Cognee automatically. The dispatcher watches `s3://cognee/projects//...` and translates the top-level folder into a dataset:
+Uploading files directly into MinIO triggers Cognee automatically. The dispatcher watches `s3://projects//...` and translates the top-level folder into a dataset:
 
 | Prefix | Dataset name |
 |-----------|---------------------------------------|
diff --git a/docs/docs/how-to/docker-setup.md b/docs/docs/how-to/docker-setup.md
index d4df710..38572b5 100644
--- a/docs/docs/how-to/docker-setup.md
+++ b/docs/docs/how-to/docker-setup.md
@@ -243,7 +243,7 @@ This spins up the Cognee API on `http://localhost:18000`, publishes it to the ho
 ### RabbitMQ + Dispatcher
 
-`docker-compose.yml` also launches RabbitMQ (`http://localhost:15672`, ingest/ingest) and the `ingestion-dispatcher` container. MinIO publishes `PUT` events from `s3://cognee/projects//...` to the `cognee-ingest` exchange, and the dispatcher downloads the object and calls Cognee’s REST API. That means any rsync/upload into the projects bucket automatically becomes a dataset.
+`docker-compose.yml` also launches RabbitMQ (`http://localhost:15672`, ingest/ingest) and the `ingestion-dispatcher` container. MinIO publishes `PUT` events from `s3://projects//...` to the `cognee-ingest` exchange, and the dispatcher downloads the object and calls Cognee’s REST API. That means any rsync/upload into the projects bucket automatically becomes a dataset.
 
 ---
diff --git a/volumes/env/.env.template b/volumes/env/.env.template
index 036f5d6..9cdd162 100644
--- a/volumes/env/.env.template
+++ b/volumes/env/.env.template
@@ -69,8 +69,8 @@ COGNEE_MCP_URL=
 COGNEE_SERVICE_URL=http://localhost:18000
 COGNEE_API_KEY=
 COGNEE_STORAGE_BACKEND=s3
-COGNEE_S3_BUCKET=cognee
-COGNEE_S3_PREFIX=projects
+COGNEE_S3_BUCKET=projects
+COGNEE_S3_PREFIX=
 COGNEE_S3_ENDPOINT=http://localhost:9000
 COGNEE_S3_REGION=us-east-1
 COGNEE_S3_ACCESS_KEY=fuzzforge
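
For review context, a minimal standalone sketch of the URI layout the patched `cli/src/fuzzforge_cli/config.py` logic now produces; `build_base_uri` and the example project id are illustrative stand-ins, not code from the repository.

```python
# Sketch of the path construction changed above: the prefix segment is skipped
# when COGNEE_S3_PREFIX is empty, so datasets live directly under the
# "projects" bucket. Function name and project id below are hypothetical.
def build_base_uri(project_id: str, bucket: str = "projects", prefix: str = "") -> str:
    parts = [f"s3://{bucket}"]
    prefix = prefix.strip("/")
    if prefix:
        parts.append(prefix)
    parts.append(project_id)
    return "/".join(parts)

# New default layout (empty prefix):
assert build_base_uri("acme-webapp") == "s3://projects/acme-webapp"
# The old layout stays reachable by configuring a bucket and prefix explicitly:
assert build_base_uri("acme-webapp", bucket="cognee", prefix="projects") == "s3://cognee/projects/acme-webapp"
# Data and graph roots hang off the base URI, mirroring data_dir/system_dir above:
print(build_base_uri("acme-webapp") + "/files", build_base_uri("acme-webapp") + "/graph")
```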
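
The event-driven path described in the CHANGELOG and `docker-setup.md` (MinIO `PUT` event → dispatcher → `/api/v1/add` + `/api/v1/cognify`) could look roughly like the sketch below. The folder-to-dataset mapping follows the README snippet above, but the payload fields, auth scheme, and helper names are assumptions for illustration, not the shipped `ingestion-dispatcher`.

```python
# Illustrative only: turn an object key from the "projects" bucket into a
# Cognee dataset call. Dataset-name convention, request payloads and the
# bearer-token auth are assumptions, not the real dispatcher implementation.
import requests

COGNEE_URL = "http://localhost:18000"  # COGNEE_SERVICE_URL

FOLDER_TO_DATASET = {  # top-level folder under the project id -> dataset suffix
    "files": "codebase",
    "findings": "findings",
    "docs": "docs",
}

def dataset_for_key(key: str) -> tuple[str, str]:
    """Map e.g. 'acme-webapp/files/src/main.py' to ('acme-webapp', 'acme-webapp_codebase')."""
    project_id, folder, *_ = key.split("/")
    return project_id, f"{project_id}_{FOLDER_TO_DATASET[folder]}"

def ingest(key: str, local_path: str, token: str) -> None:
    """Upload one downloaded object to Cognee and trigger graph processing."""
    _, dataset = dataset_for_key(key)
    headers = {"Authorization": f"Bearer {token}"}  # assumed auth scheme
    with open(local_path, "rb") as fh:
        resp = requests.post(
            f"{COGNEE_URL}/api/v1/add",
            headers=headers,
            files={"data": fh},
            data={"datasetName": dataset},  # field name is an assumption
        )
    resp.raise_for_status()
    requests.post(
        f"{COGNEE_URL}/api/v1/cognify",
        headers=headers,
        json={"datasets": [dataset]},  # field name is an assumption
    ).raise_for_status()
```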
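
The compose files are expected to seed the bucket notification, but for completeness, here is a hedged example of registering the `PUT` notification on the `projects` bucket with boto3. The MinIO AMQP target id (`PRIMARY`) and the secret key are placeholders; the RabbitMQ target itself has to be configured on the MinIO server side.

```python
# Illustrative: ask MinIO to forward object-created events on the "projects"
# bucket to its AMQP (RabbitMQ) notification target. ARN target id and secret
# key are assumptions for this sketch.
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9000",   # COGNEE_S3_ENDPOINT
    aws_access_key_id="fuzzforge",          # COGNEE_S3_ACCESS_KEY
    aws_secret_access_key="<secret from .env>",
    region_name="us-east-1",
)

s3.put_bucket_notification_configuration(
    Bucket="projects",
    NotificationConfiguration={
        "QueueConfigurations": [
            {
                # MinIO-style ARN: arn:minio:sqs:<region>:<target-id>:amqp
                "QueueArn": "arn:minio:sqs:us-east-1:PRIMARY:amqp",
                "Events": ["s3:ObjectCreated:Put"],
            }
        ]
    },
)
```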