Skip to content

Observability Hub

Observability Hub provides complete visibility into your multi-agent systems. Track traces across agents, monitor metrics, control costs, and set up alerts.

  • Distributed tracing - Follow requests across agents
  • Built-in metrics - Latency, throughput, error rates
  • Cost tracking - LLM spend by agent, plan, and model
  • Log aggregation - Searchable, correlated logs
  • Alerting - Threshold, anomaly, and absence alerts
  • OTLP export - Send data to external systems
async with client.observability.span(
name="process_document",
attributes={
"document_id": "doc-123",
"document_type": "pdf"
}
) as span:
# Your processing logic
result = await process(document)
span.set_attribute("pages_processed", result.page_count)
span.set_attribute("status", "success")
async with client.observability.span(name="parent_operation") as parent:
# First child
async with client.observability.span(name="step_1") as step1:
await do_step_1()
# Second child
async with client.observability.span(name="step_2") as step2:
await do_step_2()
# Find traces by time range
traces = await client.observability.query_traces(
start_time=datetime.now() - timedelta(hours=1),
end_time=datetime.now(),
limit=100
)
# Filter by attributes
traces = await client.observability.query_traces(
filters={
"agent_id": "document-processor",
"status": "error"
}
)
# Get a specific trace
trace = await client.observability.get_trace(trace_id="tr_xxx")

Track LLM costs per span:

async with client.observability.span(name="llm_call") as span:
response = await call_llm(prompt)
# Record cost
span.set_cost(
model="gpt-4",
input_tokens=1500,
output_tokens=500,
cost_usd=0.045
)
# Get cost summary
costs = await client.observability.get_costs(
start_time=datetime.now() - timedelta(days=7),
end_time=datetime.now(),
group_by=["agent_id", "model"]
)
for item in costs:
print(f"{item.agent_id} - {item.model}: ${item.total_cost:.2f}")
await client.observability.log(
level="info",
message="Document processed successfully",
attributes={
"document_id": "doc-123",
"processing_time_ms": 1500
}
)

Logs are automatically correlated with traces:

async with client.observability.span(name="process") as span:
# This log is automatically linked to the span
await client.observability.log(
level="info",
message="Starting processing"
)
logs = await client.observability.query_logs(
start_time=datetime.now() - timedelta(hours=1),
level="error",
search="connection failed"
)

Acenta tracks standard metrics automatically:

| Metric | Description |
| --- | --- |
| `request_latency_ms` | Request processing time |
| `request_count` | Total request count |
| `error_count` | Total error count |
| `message_count` | Messages sent/received |
| `llm_cost_usd` | Total LLM spend |
metrics = await client.observability.query_metrics(
metric="request_latency_ms",
start_time=datetime.now() - timedelta(hours=1),
aggregation="p95",
group_by=["agent_id"]
)
alert = await client.observability.create_alert(
name="High Error Rate",
condition={
"type": "threshold",
"metric": "error_count",
"operator": ">",
"threshold": 100,
"window": "5m"
},
channels=[
{"type": "email", "to": "ops@example.com"},
{"type": "slack", "webhook": "https://hooks.slack.com/..."}
]
)
alert = await client.observability.create_alert(
name="Latency Anomaly",
condition={
"type": "anomaly",
"metric": "request_latency_ms",
"sensitivity": 2.0, # Standard deviations
"baseline_window": "24h"
},
channels=[...]
)
alert = await client.observability.create_alert(
name="Missing Heartbeats",
condition={
"type": "absence",
"metric": "heartbeat_count",
"missing_for": "5m"
},
channels=[...]
)

Export data to external observability systems:

# Configure OTLP export
await client.observability.configure_export(
endpoint="https://otel-collector.example.com:4318",
headers={"Authorization": "Bearer xxx"},
export_traces=True,
export_logs=True
)

Get pre-built dashboard data:

dashboard = await client.observability.get_dashboard(
time_range="1h"
)
print(f"Total requests: {dashboard.total_requests}")
print(f"Error rate: {dashboard.error_rate:.2%}")
print(f"P95 latency: {dashboard.p95_latency_ms}ms")
print(f"Total cost: ${dashboard.total_cost:.2f}")

Configure trace sampling:

# Default: 10% sampling (errors always kept)
# Configure per-namespace
await client.observability.configure_sampling(
rate=0.25, # 25% sampling
always_keep=["error", "slow"] # Always keep errors and slow requests
)