Add event-driven Cognee ingestion pipeline

This commit is contained in:
Songbird
2025-11-07 17:21:18 +01:00
parent 83244ee537
commit 3ef6bf2437
17 changed files with 795 additions and 82 deletions
+159
View File
@@ -0,0 +1,159 @@
"""HTTP client for the Cognee REST API."""
from __future__ import annotations
from pathlib import Path
from typing import Any, Iterable, Sequence
import httpx
class CogneeApiError(RuntimeError):
    """Error type used for failed Cognee API calls.

    Attributes:
        status_code: Status code supplied by the raiser, or ``None`` when the
            failure was raised without one.
    """

    def __init__(self, message: str, *, status_code: int | None = None):
        # Keep the status on the instance; the message itself is stored by
        # the RuntimeError base class.
        self.status_code = status_code
        super().__init__(message)
class CogneeApiClient:
    """Async client for interacting with the Cognee service.

    Wraps an ``httpx.AsyncClient`` bound to ``base_url``. Requests are
    authorised either with a pre-issued API key (sent as a bearer token) or,
    lazily, with an email/password pair that is registered and logged in the
    first time a privileged call is made.

    The class is an async context manager; leaving the ``async with`` block
    closes the underlying HTTP client.
    """

    def __init__(
        self,
        base_url: str,
        api_key: str | None = None,
        *,
        email: str | None = None,
        password: str | None = None,
        timeout: float = 180.0,
    ):
        """Create the client.

        Args:
            base_url: Root URL of the service; trailing slashes are removed.
            api_key: Optional bearer token installed as a default header.
            email: Optional account email used by ``ensure_authenticated``.
            password: Optional account password used by
                ``ensure_authenticated``.
            timeout: Timeout in seconds passed to ``httpx.Timeout``.
        """
        headers: dict[str, str] = {}
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
        self._client = httpx.AsyncClient(
            base_url=base_url.rstrip("/"),
            timeout=httpx.Timeout(timeout),
            follow_redirects=True,
            headers=headers,
        )
        self._email = email
        self._password = password
        self._token: str | None = None

    async def __aenter__(self) -> "CogneeApiClient":
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        await self.close()

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def ensure_authenticated(self) -> None:
        """Ensure we have a bearer token before making privileged calls.

        Does nothing when an ``Authorization`` header or token is already
        present, or when no email/password pair was configured. Otherwise the
        account is registered, then logged in, and the resulting token is
        installed as the default ``Authorization`` header.

        Raises:
            CogneeApiError: If registration fails with a status other than
                400/409, or if login fails.
        """
        if self._client.headers.get("Authorization") or self._token:
            return
        if not (self._email and self._password):
            # Service might be running with authentication disabled.
            return
        try:
            await self.register_user(self._email, self._password)
        except CogneeApiError as exc:
            # 400/409 are tolerated so login can still be attempted --
            # presumably they mean the account already exists (confirm
            # against the service's register endpoint).
            if exc.status_code not in (400, 409):
                raise
        token = await self.login(self._email, self._password)
        self._token = token
        self._client.headers["Authorization"] = f"Bearer {token}"

    async def register_user(self, email: str, password: str) -> Any:
        """POST a new account to ``/api/v1/auth/register``.

        Returns:
            The decoded JSON response, or ``{}`` for an empty body.

        Raises:
            CogneeApiError: On an error status or a non-JSON body.
        """
        payload = {
            "email": email,
            "password": password,
            "is_active": True,
            "is_verified": True,
        }
        response = await self._client.post("/api/v1/auth/register", json=payload)
        return self._handle_response(response)

    async def login(self, email: str, password: str) -> str:
        """Log in via ``/api/v1/auth/login`` and return the access token.

        The credentials are sent as form fields (``username``/``password``).

        Raises:
            CogneeApiError: On an error status, or when the response does not
                carry a non-empty ``access_token``.
        """
        form = {"username": email, "password": password}
        response = await self._client.post("/api/v1/auth/login", data=form)
        payload = self._handle_response(response)
        # A JSON list/scalar body has no ``.get``; treat it as "no token"
        # instead of leaking an AttributeError to the caller.
        token = payload.get("access_token") if isinstance(payload, dict) else None
        if not token:
            raise CogneeApiError("Cognee auth response did not include an access_token")
        return token

    async def add_files(self, file_paths: Iterable[Path], dataset_name: str) -> Any:
        """Upload the given files to ``dataset_name`` via ``/api/v1/add``.

        Each file is read fully into memory and sent as an
        ``application/octet-stream`` part named ``data``.

        Returns:
            The decoded JSON response, or ``{}`` when ``file_paths`` is empty
            (no request is made in that case).

        Raises:
            CogneeApiError: On an error status or a non-JSON body.
            OSError: If a file cannot be read.
        """
        # Materialise first: the argument may be a one-shot iterator, and an
        # empty upload must not be sent (httpx would encode it as a plain
        # form without any ``data`` part).
        paths = list(file_paths)
        if not paths:
            return {}
        await self.ensure_authenticated()
        parts = [
            ("data", (path.name, path.read_bytes(), "application/octet-stream"))
            for path in paths
        ]
        return await self._post_upload(parts, dataset_name)

    async def add_texts(self, texts: Sequence[str], dataset_name: str) -> Any:
        """Upload text snippets to ``dataset_name`` via ``/api/v1/add``.

        Each snippet is UTF-8 encoded and sent as a ``text/plain`` part named
        ``data`` with the filename ``snippet_<index>.txt``.

        Returns:
            The decoded JSON response, or ``{}`` when ``texts`` is empty (no
            request is made in that case).

        Raises:
            CogneeApiError: On an error status or a non-JSON body.
        """
        snippets = list(texts)
        if not snippets:
            return {}
        await self.ensure_authenticated()
        parts = [
            ("data", (f"snippet_{idx}.txt", text.encode("utf-8"), "text/plain"))
            for idx, text in enumerate(snippets)
        ]
        return await self._post_upload(parts, dataset_name)

    async def _post_upload(
        self,
        parts: list[tuple[str, tuple[str, bytes, str]]],
        dataset_name: str,
    ) -> Any:
        """Send prepared multipart ``parts`` to ``/api/v1/add``."""
        response = await self._client.post(
            "/api/v1/add",
            data={"datasetName": dataset_name},
            files=parts,
        )
        return self._handle_response(response)

    async def cognify(self, datasets: Sequence[str]) -> Any:
        """Request ``/api/v1/cognify`` for ``datasets`` with
        ``run_in_background`` set to ``False``.

        Raises:
            CogneeApiError: On an error status or a non-JSON body.
        """
        await self.ensure_authenticated()
        payload = {"datasets": list(datasets), "run_in_background": False}
        response = await self._client.post("/api/v1/cognify", json=payload)
        return self._handle_response(response)

    async def search(
        self,
        *,
        query: str,
        search_type: str,
        datasets: Sequence[str] | None = None,
        top_k: int | None = None,
        only_context: bool = False,
    ) -> Any:
        """Run a query against ``/api/v1/search``.

        Args:
            query: Query text.
            search_type: Search type string forwarded unchanged.
            datasets: Optional dataset names; omitted from the request when
                empty or ``None``.
            top_k: Optional result limit; omitted from the request when
                ``None``.
            only_context: Forwarded unchanged as ``only_context``.

        Returns:
            The decoded JSON response, or ``{}`` for an empty body.

        Raises:
            CogneeApiError: On an error status or a non-JSON body.
        """
        await self.ensure_authenticated()
        payload: dict[str, object] = {
            "query": query,
            "search_type": search_type,
            "only_context": only_context,
        }
        if datasets:
            payload["datasets"] = list(datasets)
        if top_k is not None:
            payload["top_k"] = top_k
        response = await self._client.post("/api/v1/search", json=payload)
        return self._handle_response(response)

    def _handle_response(self, response: "httpx.Response") -> Any:
        """Translate an HTTP response into decoded JSON or a client error.

        Returns:
            The decoded JSON body, or ``{}`` when the body is empty.

        Raises:
            CogneeApiError: If the status is an error, or if a non-empty body
                cannot be decoded as JSON.
        """
        try:
            response.raise_for_status()
        except httpx.HTTPStatusError as exc:  # pragma: no cover - surfaced to caller
            message = exc.response.text
            raise CogneeApiError(
                f"Cognee API request failed ({exc.response.status_code}): {message}",
                status_code=exc.response.status_code,
            ) from exc
        if not response.content:
            return {}
        try:
            return response.json()
        except ValueError as exc:
            # Covers JSON decode and text-decoding failures, e.g. an HTML
            # page returned by a proxy; report it as an API error so callers
            # need only one exception type.
            raise CogneeApiError(
                "Cognee API returned a non-JSON response body",
                status_code=response.status_code,
            ) from exc
+76 -48
View File
@@ -23,6 +23,7 @@ from rich.console import Console
from rich.prompt import Confirm
from ..config import ProjectConfigManager
from ..cognee_api import CogneeApiClient, CogneeApiError
from ..ingest_utils import collect_ingest_files
console = Console()
@@ -92,23 +93,18 @@ def ingest_callback(
config.setup_cognee_environment()
if os.getenv("FUZZFORGE_DEBUG", "0") == "1":
storage_backend = os.getenv("COGNEE_STORAGE_BACKEND", "local")
console.print(
"[dim]Cognee directories:\n"
f" DATA: {os.getenv('COGNEE_DATA_ROOT', 'unset')}\n"
f" SYSTEM: {os.getenv('COGNEE_SYSTEM_ROOT', 'unset')}\n"
f" USER: {os.getenv('COGNEE_USER_ID', 'unset')}\n",
f" USER: {os.getenv('COGNEE_USER_ID', 'unset')}\n"
f" STORAGE: {storage_backend}\n",
)
project_context = config.get_project_context()
target_path = path or Path.cwd()
dataset_name = dataset or f"{project_context['project_name']}_codebase"
try:
import cognee # noqa: F401 # Just to validate installation
except ImportError as exc:
console.print("[red]Cognee is not installed.[/red]")
console.print("Install with: pip install 'cognee[all]' litellm")
raise typer.Exit(1) from exc
dataset_name = dataset or f"{project_context['project_id']}_codebase"
console.print(f"[bold]🔍 Ingesting {target_path} into Cognee knowledge graph[/bold]")
console.print(
@@ -155,10 +151,21 @@ async def _run_ingestion(
force: bool,
) -> None:
"""Perform the actual ingestion work."""
from fuzzforge_ai.cognee_service import CogneeService
cognee_cfg = config.get_cognee_config()
service_url = (
cognee_cfg.get("service_url")
or os.getenv("COGNEE_SERVICE_URL")
or "http://localhost:18000"
)
service_email = os.getenv("COGNEE_SERVICE_EMAIL") or cognee_cfg.get("service_email")
service_password = os.getenv("COGNEE_SERVICE_PASSWORD") or cognee_cfg.get("service_password")
cognee_service = CogneeService(config)
await cognee_service.initialize()
if not service_email or not service_password:
console.print(
"[red]Missing Cognee service credentials.[/red] Run `ff init` again or set "
"COGNEE_SERVICE_EMAIL / COGNEE_SERVICE_PASSWORD in .fuzzforge/.env."
)
return
# Always skip internal bookkeeping directories
exclude_patterns = list(exclude or [])
@@ -192,11 +199,9 @@ async def _run_ingestion(
console.print(f"Found [green]{len(files_to_ingest)}[/green] files to ingest")
if force:
console.print("Cleaning existing data for this project...")
try:
await cognee_service.clear_data(confirm=True)
except Exception as exc:
console.print(f"[yellow]Warning:[/yellow] Could not clean existing data: {exc}")
console.print(
"[yellow]Warning:[/yellow] Force re-ingest is not yet supported for the remote Cognee service."
)
console.print("Adding files to Cognee...")
valid_file_paths = []
@@ -213,39 +218,62 @@ async def _run_ingestion(
console.print("[yellow]No readable files found to ingest[/yellow]")
return
results = await cognee_service.ingest_files(valid_file_paths, dataset)
async with CogneeApiClient(
service_url,
email=service_email,
password=service_password,
) as client:
try:
await client.ensure_authenticated()
except CogneeApiError as exc:
console.print(f"[red]Cognee authentication failed:[/red] {exc}")
return
except Exception as exc:
console.print(f"[red]Cognee authentication error:[/red] {exc}")
return
try:
await client.add_files(valid_file_paths, dataset)
await client.cognify([dataset])
except CogneeApiError as exc:
console.print(f"[red]Cognee API error:[/red] {exc}")
return
except Exception as exc:
console.print(f"[red]Unexpected Cognee error:[/red] {exc}")
return
console.print(
f"[green]✅ Successfully ingested {results['success']} files into knowledge graph[/green]"
)
if results["failed"]:
console.print(
f"[yellow]⚠️ Skipped {results['failed']} files due to errors[/yellow]"
f"[green]✅ Successfully ingested {len(valid_file_paths)} files into knowledge graph[/green]"
)
try:
insights = await cognee_service.search_insights(
query=f"What insights can you provide about the {dataset} dataset?",
dataset=dataset,
)
if insights:
console.print(f"\n[bold]📊 Generated {len(insights)} insights:[/bold]")
for index, insight in enumerate(insights[:3], 1):
console.print(f" {index}. {insight}")
if len(insights) > 3:
console.print(f" ... and {len(insights) - 3} more")
chunks = await cognee_service.search_chunks(
query=f"functions classes methods in {dataset}",
dataset=dataset,
)
if chunks:
console.print(
f"\n[bold]🔍 Sample searchable content ({len(chunks)} chunks found):[/bold]"
try:
insights = await client.search(
query=f"What insights can you provide about the {dataset} dataset?",
search_type="INSIGHTS",
datasets=[dataset],
)
for index, chunk in enumerate(chunks[:2], 1):
preview = chunk[:100] + "..." if len(chunk) > 100 else chunk
console.print(f" {index}. {preview}")
except Exception:
# Best-effort stats — ignore failures here
pass
insight_list = insights if isinstance(insights, list) else insights.get("results", [])
if insight_list:
console.print(f"\n[bold]📊 Generated {len(insight_list)} insights:[/bold]")
for index, insight in enumerate(insight_list[:3], 1):
console.print(f" {index}. {insight}")
if len(insight_list) > 3:
console.print(f" ... and {len(insight_list) - 3} more")
chunks = await client.search(
query=f"functions classes methods in {dataset}",
search_type="CHUNKS",
datasets=[dataset],
top_k=5,
)
chunk_list = chunks if isinstance(chunks, list) else chunks.get("results", [])
if chunk_list:
console.print(
f"\n[bold]🔍 Sample searchable content ({len(chunk_list)} chunks found):[/bold]"
)
for index, chunk in enumerate(chunk_list[:2], 1):
text = str(chunk)
preview = text[:100] + "..." if len(text) > 100 else text
console.print(f" {index}. {preview}")
except Exception:
pass
+11
View File
@@ -246,6 +246,17 @@ def _ensure_env_file(fuzzforge_dir: Path, force: bool) -> None:
"LLM_COGNEE_EMBEDDING_MODEL=litellm_proxy/text-embedding-3-large",
"LLM_COGNEE_EMBEDDING_ENDPOINT=http://localhost:10999",
"COGNEE_MCP_URL=",
"COGNEE_SERVICE_URL=http://localhost:18000",
"COGNEE_STORAGE_BACKEND=s3",
"COGNEE_SERVICE_EMAIL=",
"COGNEE_SERVICE_PASSWORD=",
"COGNEE_S3_BUCKET=cognee",
"COGNEE_S3_PREFIX=",
"COGNEE_S3_ENDPOINT=http://localhost:9000",
"COGNEE_S3_REGION=us-east-1",
"COGNEE_S3_ACCESS_KEY=",
"COGNEE_S3_SECRET_KEY=",
"COGNEE_S3_ALLOW_HTTP=1",
"",
"# Session persistence options: inmemory | sqlite",
"SESSION_PERSISTENCE=sqlite",
+114 -21
View File
@@ -21,7 +21,7 @@ from __future__ import annotations
import hashlib
import os
from pathlib import Path
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, Literal
try: # Optional dependency; fall back if not installed
from dotenv import load_dotenv
@@ -131,7 +131,19 @@ class CogneeConfig(BaseModel):
"""Cognee integration metadata."""
enabled: bool = True
graph_database_provider: str = "kuzu"
graph_database_provider: str = "ladybug"
service_url: str = "http://localhost:18000"
api_key: Optional[str] = None
service_email: Optional[str] = None
service_password: Optional[str] = None
storage_backend: Literal["local", "s3"] = "s3"
s3_bucket: Optional[str] = None
s3_prefix: Optional[str] = "projects"
s3_endpoint_url: Optional[str] = None
s3_region: Optional[str] = None
s3_access_key: Optional[str] = None
s3_secret_key: Optional[str] = None
s3_allow_http: bool = False
data_directory: Optional[str] = None
system_directory: Optional[str] = None
backend_access_control: bool = True
@@ -201,25 +213,55 @@ class FuzzForgeConfig(BaseModel):
cognee.tenant_id = self.project.tenant_id
changed = True
base_dir = project_dir / ".fuzzforge" / "cognee" / f"project_{self.project.id}"
data_dir = base_dir / "data"
system_dir = base_dir / "system"
for path in (
base_dir,
data_dir,
system_dir,
system_dir / "kuzu_db",
system_dir / "lancedb",
):
if not path.exists():
path.mkdir(parents=True, exist_ok=True)
if cognee.data_directory != str(data_dir):
cognee.data_directory = str(data_dir)
if not cognee.service_url:
cognee.service_url = "http://localhost:18000"
changed = True
if cognee.system_directory != str(system_dir):
cognee.system_directory = str(system_dir)
if not cognee.s3_prefix:
cognee.s3_prefix = "projects"
changed = True
default_email = f"project_{self.project.id}@fuzzforge.dev"
if not cognee.service_email or cognee.service_email.endswith(
("@cognee.local", "@cognee.localhost")
):
cognee.service_email = default_email
changed = True
derived_password = hashlib.sha256(self.project.id.encode()).hexdigest()[:20]
if not cognee.service_password or len(cognee.service_password) < 12:
cognee.service_password = derived_password
changed = True
if cognee.storage_backend.lower() == "s3":
bucket = cognee.s3_bucket or "cognee"
prefix = (cognee.s3_prefix or "projects").strip("/")
base_uri = f"s3://{bucket}/{prefix}/{self.project.id}"
data_dir = f"{base_uri}/files"
system_dir = f"{base_uri}/graph"
else:
base_dir = project_dir / ".fuzzforge" / "cognee" / f"project_{self.project.id}"
data_path = base_dir / "data"
system_path = base_dir / "system"
for path in (
base_dir,
data_path,
system_path,
system_path / "ladybug",
system_path / "lancedb",
):
if not path.exists():
path.mkdir(parents=True, exist_ok=True)
data_dir = str(data_path)
system_dir = str(system_path)
if cognee.data_directory != data_dir:
cognee.data_directory = data_dir
changed = True
if cognee.system_directory != system_dir:
cognee.system_directory = system_dir
changed = True
return changed
@@ -368,16 +410,67 @@ class ProjectConfigManager:
backend_access = "true" if cognee.get("backend_access_control", True) else "false"
os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = backend_access
os.environ["GRAPH_DATABASE_PROVIDER"] = cognee.get("graph_database_provider", "kuzu")
graph_provider = cognee.get("graph_database_provider", "ladybug")
os.environ["GRAPH_DATABASE_PROVIDER"] = graph_provider
service_url = cognee.get("service_url") or os.getenv("COGNEE_SERVICE_URL") or "http://localhost:18000"
os.environ["COGNEE_SERVICE_URL"] = service_url
api_key = os.getenv("COGNEE_API_KEY") or cognee.get("api_key")
if api_key:
os.environ["COGNEE_API_KEY"] = api_key
service_email = os.getenv("COGNEE_SERVICE_EMAIL") or cognee.get("service_email")
if service_email:
os.environ["COGNEE_SERVICE_EMAIL"] = service_email
service_password = os.getenv("COGNEE_SERVICE_PASSWORD") or cognee.get("service_password")
if service_password:
os.environ["COGNEE_SERVICE_PASSWORD"] = service_password
data_dir = cognee.get("data_directory")
system_dir = cognee.get("system_directory")
tenant_id = cognee.get("tenant_id", "fuzzforge_tenant")
storage_backend = cognee.get("storage_backend", "local").lower()
os.environ["COGNEE_STORAGE_BACKEND"] = storage_backend
if storage_backend == "s3":
os.environ["STORAGE_BACKEND"] = "s3"
bucket = os.getenv("COGNEE_S3_BUCKET") or cognee.get("s3_bucket") or "cognee"
os.environ["STORAGE_BUCKET_NAME"] = bucket
os.environ["COGNEE_S3_BUCKET"] = bucket
prefix_override = os.getenv("COGNEE_S3_PREFIX") or cognee.get("s3_prefix")
if prefix_override:
os.environ["COGNEE_S3_PREFIX"] = prefix_override
endpoint = os.getenv("COGNEE_S3_ENDPOINT") or cognee.get("s3_endpoint_url") or "http://localhost:9000"
os.environ["AWS_ENDPOINT_URL"] = endpoint
os.environ["COGNEE_S3_ENDPOINT"] = endpoint
region = os.getenv("COGNEE_S3_REGION") or cognee.get("s3_region") or "us-east-1"
os.environ["AWS_REGION"] = region
os.environ["COGNEE_S3_REGION"] = region
access_key = os.getenv("COGNEE_S3_ACCESS_KEY") or cognee.get("s3_access_key")
secret_key = os.getenv("COGNEE_S3_SECRET_KEY") or cognee.get("s3_secret_key")
if access_key:
os.environ.setdefault("AWS_ACCESS_KEY_ID", access_key)
os.environ["COGNEE_S3_ACCESS_KEY"] = access_key
if secret_key:
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", secret_key)
os.environ["COGNEE_S3_SECRET_KEY"] = secret_key
allow_http_env = os.getenv("COGNEE_S3_ALLOW_HTTP")
allow_http_flag = allow_http_env if allow_http_env is not None else ("true" if cognee.get("s3_allow_http") else "false")
if allow_http_flag.lower() in {"1", "true", "yes"}:
os.environ["AWS_ALLOW_HTTP"] = "true"
os.environ["COGNEE_S3_ALLOW_HTTP"] = "1"
if data_dir:
os.environ["COGNEE_DATA_ROOT"] = data_dir
os.environ.setdefault("DATA_ROOT_DIRECTORY", data_dir)
if system_dir:
os.environ["COGNEE_SYSTEM_ROOT"] = system_dir
os.environ.setdefault("SYSTEM_ROOT_DIRECTORY", system_dir)
os.environ["COGNEE_USER_ID"] = tenant_id
os.environ["COGNEE_TENANT_ID"] = tenant_id