Testing Applications with HoneyHive
Problem: You need to test your LLM application with HoneyHive tracing enabled, write unit tests for traced functions, and verify that traces are captured correctly without relying on mocks.
Solution: Use pytest with real HoneyHive tracers in test mode, validate trace outputs programmatically, and follow testing best practices for LLM applications.
Testing Philosophy
Key Principles:
Test with Real Tracers: Don’t mock HoneyHive - test with actual tracing
Validate Trace Structure: Ensure spans contain expected attributes
Separate Test Projects: Use dedicated test projects in HoneyHive
Fixture-Based Setup: Reusable tracer fixtures for consistency
Why Test with Real Tracing?
✅ Catches integration issues early
✅ Validates span enrichment logic
✅ Ensures production-like behavior
❌ Mocking hides real-world failures
Setup for Testing
Test Environment Configuration
# .env.test file
HH_API_KEY=hh_test_your_test_api_key
HH_PROJECT=test-project
HH_SOURCE=pytest
# Use separate API key and project for testing
# DO NOT use production credentials in tests
Pytest Configuration
# conftest.py - Shared test fixtures
import pytest
import os
from honeyhive import HoneyHiveTracer
from dotenv import load_dotenv

# Load test environment
load_dotenv('.env.test')

@pytest.fixture(scope="session")
def test_tracer():
    """Provide a HoneyHive tracer for testing."""
    tracer = HoneyHiveTracer.init(
        api_key=os.getenv("HH_API_KEY"),
        project=os.getenv("HH_PROJECT", "test-project"),
        source="pytest"
    )
    yield tracer
    # Cleanup after all tests
    # HoneyHive automatically flushes on process exit
@pytest.fixture
def clean_tracer(request):
    """Provide a fresh tracer for each test."""
    # The built-in `request` fixture exposes the current test's name
    tracer = HoneyHiveTracer.init(
        api_key=os.getenv("HH_API_KEY"),
        project=f"test-{request.node.name}",
        source="pytest"
    )
    yield tracer
    # Test-specific cleanup if needed
Unit Testing Traced Functions
Basic Function Testing
# test_traced_functions.py
from honeyhive import trace, enrich_span
from honeyhive.models import EventType
import pytest

# Function under test
@trace(event_type=EventType.tool)
def process_data(data: dict) -> dict:
    """Process data with tracing."""
    enrich_span({
        "input.size": len(data),
        "process.type": "transformation"
    })

    result = {"processed": True, **data}

    enrich_span({"output.size": len(result)})
    return result

# Test the function
def test_process_data(test_tracer):
    """Test data processing with real tracing."""
    # Arrange
    input_data = {"key": "value", "count": 10}

    # Act
    result = process_data(input_data)

    # Assert
    assert result["processed"] is True
    assert result["key"] == "value"
    assert result["count"] == 10
    # Trace is captured automatically in test project
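An edge case worth pinning down is the empty input, which also exercises the size attributes at their boundary. This test uses only the process_data function defined above:

def test_process_data_empty_input(test_tracer):
    """Empty input should still be processed and traced."""
    result = process_data({})
    assert result == {"processed": True}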
Testing with Span Validation
from opentelemetry import trace as otel_trace
from opentelemetry.sdk.trace import ReadableSpan
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

@pytest.fixture
def span_capture(test_tracer):
    """Capture spans for validation in tests."""
    exporter = InMemorySpanExporter()
    processor = SimpleSpanProcessor(exporter)
    test_tracer.provider.add_span_processor(processor)
    yield exporter
    exporter.clear()
def test_span_enrichment(test_tracer, span_capture):
    """Validate that span enrichment works correctly."""
    # Act
    result = process_data({"key": "value"})

    # Assert
    spans = span_capture.get_finished_spans()
    assert len(spans) > 0

    span = spans[0]
    attributes = dict(span.attributes)

    # Validate expected attributes
    assert attributes.get("input.size") == 1
    assert attributes.get("process.type") == "transformation"
    assert attributes.get("output.size") == 2
Testing Error Handling
@trace(event_type=EventType.tool)
def risky_operation(value: int) -> int:
    """Operation that may fail."""
    enrich_span({"input.value": value})

    if value < 0:
        enrich_span({"error.type": "ValueError"})
        raise ValueError("Value must be non-negative")

    result = value * 2
    enrich_span({"output.value": result})
    return result

def test_risky_operation_success(test_tracer):
    """Test successful execution."""
    result = risky_operation(5)
    assert result == 10

def test_risky_operation_failure(test_tracer, span_capture):
    """Test error handling with trace validation."""
    with pytest.raises(ValueError, match="Value must be non-negative"):
        risky_operation(-1)

    # Validate error was captured in span
    spans = span_capture.get_finished_spans()
    assert len(spans) > 0

    span = spans[0]
    attributes = dict(span.attributes)
    assert attributes.get("error.type") == "ValueError"
Integration Testing
Testing LLM Workflows
# test_llm_workflow.py
from honeyhive import HoneyHiveTracer, trace
from honeyhive.models import EventType
import openai
import pytest

@trace(event_type=EventType.chain)
def llm_workflow(query: str) -> str:
    """Complete LLM workflow."""
    from honeyhive import enrich_span
    enrich_span({"workflow.query": query, "workflow.type": "rag"})

    # Step 1: Retrieve context
    context = retrieve_context(query)

    # Step 2: Generate response
    response = generate_response(query, context)

    enrich_span({"workflow.success": True})
    return response

@trace(event_type=EventType.tool)
def retrieve_context(query: str) -> list:
    """Retrieve relevant context."""
    from honeyhive import enrich_span
    enrich_span({"retrieval.query": query})

    # Mock retrieval for testing
    context = ["doc1", "doc2"]

    enrich_span({"retrieval.found": len(context)})
    return context

@trace(event_type=EventType.model)
def generate_response(query: str, context: list) -> str:
    """Generate LLM response."""
    from honeyhive import enrich_span
    enrich_span({
        "llm.provider": "openai",
        "llm.model": "gpt-4",
        "llm.context_size": len(context)
    })

    # For testing, use a mock or test-safe LLM call
    response = f"Response to: {query} (with {len(context)} docs)"

    enrich_span({"llm.response_length": len(response)})
    return response

def test_llm_workflow_integration(test_tracer):
    """Test complete LLM workflow with tracing."""
    query = "What is machine learning?"

    result = llm_workflow(query)

    assert "Response to:" in result
    assert "machine learning" in result
    # Trace automatically captured with 3 spans (chain + tool + model)
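To go beyond return-value checks, you can combine the workflow with the span_capture fixture defined earlier and assert on the shape of the trace. This sketch assumes spans are named after the traced functions and that the in-memory exporter sees all three of them:

def test_llm_workflow_span_structure(test_tracer, span_capture):
    """Validate the workflow emits a chain span with tool and model children."""
    llm_workflow("What is machine learning?")

    spans = span_capture.get_finished_spans()
    names = [span.name for span in spans]

    # Assumption: span names follow the traced function names
    assert any("retrieve_context" in n for n in names)
    assert any("generate_response" in n for n in names)
    assert any("llm_workflow" in n for n in names)

    # Child spans should point at the workflow span as their parent
    workflow_span = next(s for s in spans if "llm_workflow" in s.name)
    child_spans = [s for s in spans if s.parent is not None
                   and s.parent.span_id == workflow_span.context.span_id]
    assert len(child_spans) == 2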
Testing Multi-Provider Scenarios
@trace(event_type=EventType.chain)
def multi_provider_call(prompt: str) -> str:
    """Try multiple LLM providers with fallback."""
    from honeyhive import enrich_span

    providers = ["openai", "anthropic"]
    enrich_span({"providers.available": len(providers)})

    for i, provider in enumerate(providers):
        try:
            result = call_provider(provider, prompt)
            enrich_span({
                "providers.used": provider,
                "providers.attempts": i + 1
            })
            return result
        except Exception as e:
            enrich_span({f"providers.{provider}_failed": str(e)})
            if i == len(providers) - 1:
                raise

    return ""

@trace(event_type=EventType.model)
def call_provider(provider: str, prompt: str) -> str:
    """Call specific LLM provider."""
    from honeyhive import enrich_span
    enrich_span({"provider.name": provider, "provider.prompt_length": len(prompt)})

    # Mock for testing
    if provider == "openai":
        return "OpenAI response"
    elif provider == "anthropic":
        return "Anthropic response"
    else:
        raise ValueError(f"Unknown provider: {provider}")

def test_multi_provider_fallback(test_tracer):
    """Test provider fallback logic."""
    result = multi_provider_call("Test prompt")
    assert result in ["OpenAI response", "Anthropic response"]
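It is also worth covering the failure branch directly, since the fallback test above never reaches it. This test uses only the mock call_provider defined above:

def test_call_provider_unknown(test_tracer):
    """An unsupported provider should raise, while the span still records its inputs."""
    with pytest.raises(ValueError, match="Unknown provider"):
        call_provider("unknown", "Test prompt")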
Evaluation Testing
Testing with Evaluation Metrics
# test_evaluation.py
from honeyhive import HoneyHiveTracer
import pytest

# Reuse the traced workflow functions from the integration example above
from test_llm_workflow import generate_response, llm_workflow

def test_llm_output_quality(test_tracer):
    """Test LLM output meets quality thresholds."""
    query = "Explain Python decorators"

    response = generate_response(query, [])

    # Quality checks
    assert len(response) > 50, "Response too short"
    assert "decorator" in response.lower(), "Key term missing"
    assert not any(word in response.lower() for word in ["sorry", "cannot", "unable"]), \
        "Negative response detected"
    # Trace captured automatically for review in HoneyHive dashboard

def test_latency_requirements(test_tracer):
    """Test that operations meet latency requirements."""
    import time

    start = time.time()
    result = llm_workflow("Simple query")
    duration = time.time() - start

    assert duration < 5.0, f"Operation took {duration:.2f}s, expected < 5s"
    assert result is not None
For comprehensive evaluation testing, see Evaluation & Analysis Guides.
Best Practices
1. Use Separate Test Projects
# ✅ Good: Dedicated test project
@pytest.fixture
def test_tracer():
    return HoneyHiveTracer.init(
        api_key=os.getenv("HH_TEST_API_KEY"),
        project="test-project",  # Separate from production
        source="pytest"
    )

# ❌ Bad: Using production project
# project="production-app"  # DON'T do this
2. Clean Fixture Management
# conftest.py
@pytest.fixture(scope="session")
def session_tracer():
    """One tracer for entire test session."""
    tracer = HoneyHiveTracer.init(
        api_key=os.getenv("HH_TEST_API_KEY"),
        project="test-project",
        source="pytest-session"
    )
    yield tracer
@pytest.fixture
def function_tracer(request):
    """Fresh tracer for each test function."""
    # The built-in `request` fixture exposes the current test's name
    tracer = HoneyHiveTracer.init(
        api_key=os.getenv("HH_TEST_API_KEY"),
        project=f"test-{request.node.name}",
        source="pytest-function"
    )
    yield tracer
3. Environment-Based Configuration
# tests/conftest.py
import os
import pytest
from dotenv import load_dotenv

def pytest_configure(config):
    """Load test environment before tests run."""
    load_dotenv('.env.test')

    # Verify test configuration
    if not os.getenv("HH_API_KEY"):
        pytest.exit("HH_API_KEY not set in test environment")

    if os.getenv("HH_PROJECT") == "production":
        pytest.exit("Cannot use production project in tests")
4. Parametrized Testing
@pytest.mark.parametrize("input_value,expected_output", [
    (5, 10),
    (0, 0),
    (100, 200),
])
def test_risky_operation_parametrized(test_tracer, input_value, expected_output):
    """Test multiple scenarios with tracing."""
    result = risky_operation(input_value)
    assert result == expected_output
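The same pattern works for expected failures, which keeps the error-handling contract of risky_operation pinned down alongside the happy path:

@pytest.mark.parametrize("bad_value", [-1, -100])
def test_risky_operation_parametrized_failures(test_tracer, bad_value):
    """Negative inputs should always raise, regardless of magnitude."""
    with pytest.raises(ValueError, match="Value must be non-negative"):
        risky_operation(bad_value)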
Common Testing Patterns
Pattern 1: Test Helper with Tracing
# test_helpers.py
from contextlib import contextmanager
from honeyhive import enrich_span
import time

@contextmanager
def assert_trace_timing(max_duration_ms: float):
    """Context manager to validate operation timing."""
    start = time.time()
    yield
    duration_ms = (time.time() - start) * 1000

    enrich_span({"test.duration_ms": duration_ms})
    assert duration_ms < max_duration_ms, \
        f"Operation took {duration_ms:.2f}ms, expected < {max_duration_ms}ms"

# Usage
def test_with_timing(test_tracer):
    with assert_trace_timing(max_duration_ms=500):
        result = process_data({"key": "value"})
Pattern 2: Trace Assertion Helper
def assert_span_has_attributes(span, expected_attrs: dict):
    """Assert span contains expected attributes."""
    actual_attrs = dict(span.attributes)

    for key, expected_value in expected_attrs.items():
        actual_value = actual_attrs.get(key)
        assert actual_value == expected_value, \
            f"Attribute {key}: expected {expected_value}, got {actual_value}"

# Usage
def test_span_attributes(test_tracer, span_capture):
    process_data({"key": "value"})

    spans = span_capture.get_finished_spans()
    assert_span_has_attributes(spans[0], {
        "input.size": 1,
        "process.type": "transformation"
    })
Running Tests
Basic Test Execution:
# Run all tests (.env.test is loaded by conftest.py)
pytest tests/

# Run specific test file
pytest tests/test_traced_functions.py -v

# Run with coverage (requires the pytest-cov plugin)
pytest tests/ --cov=src --cov-report=html
Test Selection:
# Run only integration tests
pytest tests/ -m integration
# Run only unit tests
pytest tests/ -m unit
# Skip slow tests
pytest tests/ -m "not slow"
Pytest Markers:
import pytest

@pytest.mark.unit
def test_unit_function(test_tracer):
    """Unit test with tracing."""
    pass

@pytest.mark.integration
def test_integration_workflow(test_tracer):
    """Integration test with tracing."""
    pass

@pytest.mark.slow
def test_heavy_processing(test_tracer):
    """Slow test that may be skipped."""
    pass
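The unit, integration, and slow markers used above are custom, so register them to avoid PytestUnknownMarkWarning. The registration can live in the pytest_configure hook already defined in conftest.py (or equivalently under markers in pytest.ini); the marker descriptions here are just examples:

# conftest.py - extend the existing pytest_configure hook
def pytest_configure(config):
    load_dotenv('.env.test')
    # Register the custom markers used in this guide
    config.addinivalue_line("markers", "unit: fast, isolated unit tests")
    config.addinivalue_line("markers", "integration: tests that exercise full traced workflows")
    config.addinivalue_line("markers", "slow: long-running tests, deselected with -m 'not slow'")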
Next Steps
Evaluation & Analysis Guides - Comprehensive evaluation testing strategies
Production Deployment Guide - Production testing and monitoring
SDK Development - Testing guidance for SDK contributors
Key Takeaway: Test with real HoneyHive tracing enabled to catch integration issues early. Use pytest fixtures for consistent tracer setup, validate trace attributes programmatically, and maintain separate test projects to avoid polluting production data. ✨