Multi-Step Experiments
How do I evaluate a pipeline with multiple steps (e.g., RAG)?
Use component-level tracing and metrics within your evaluation function: wrap each pipeline step in its own span and attach per-step metrics to that span.
How do I evaluate each component separately?
Using a Context Manager (Explicit Tracer)
from typing import Any, Dict

from honeyhive import HoneyHiveTracer
from honeyhive.experiments import evaluate


def rag_pipeline(datapoint: Dict[str, Any], tracer: HoneyHiveTracer) -> Dict[str, Any]:
    """Multi-step RAG pipeline with explicit tracer parameter.

    Args:
        datapoint: Contains 'inputs' and 'ground_truth'
        tracer: Auto-injected by evaluate()

    Returns:
        Dictionary with pipeline outputs
    """
    inputs = datapoint.get("inputs", {})
    query = inputs["question"]

    # Step 1: Retrieval
    with tracer.trace("retrieval"):
        docs = retrieve_documents(query)
        # Add component metric
        tracer.enrich_span(metrics={"retrieval_count": len(docs)})

    # Step 2: Reranking
    with tracer.trace("reranking"):
        ranked_docs = rerank(docs, query)
        # Add component metric
        tracer.enrich_span(metrics={"rerank_score": ranked_docs[0].score})

    # Step 3: Generation
    with tracer.trace("generation"):
        answer = generate_answer(query, ranked_docs)
        # Add component metric
        tracer.enrich_span(metrics={"answer_length": len(answer)})

    return {"answer": answer, "sources": ranked_docs}


# Evaluate the entire pipeline
result = evaluate(
    function=rag_pipeline,
    dataset=dataset,
    api_key="your-api-key",
    project="your-project",
)
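The evaluate() call above assumes a dataset is already defined. As a rough sketch (check the HoneyHive docs for the dataset formats evaluate() accepts in your SDK version), an in-memory dataset would carry the inputs and ground_truth fields the pipeline reads; the questions and answers here are placeholders:

# Hypothetical in-memory dataset; each datapoint mirrors the fields
# rag_pipeline reads: inputs["question"] plus an optional ground_truth.
dataset = [
    {
        "inputs": {"question": "What is the capital of France?"},
        "ground_truth": {"answer": "Paris"},
    },
    {
        "inputs": {"question": "Who wrote The Old Man and the Sea?"},
        "ground_truth": {"answer": "Ernest Hemingway"},
    },
]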
Using the @trace Decorator
from typing import Any, Dict

from honeyhive import HoneyHiveTracer, trace
from honeyhive.experiments import evaluate

# Initialize tracer for decorators
tracer = HoneyHiveTracer.init(
    api_key="your-api-key",
    project="your-project",
)
@trace(tracer=tracer, event_name="retrieval", event_type="tool")
def retrieve_documents(query: str) -> list:
    """Retrieval component with automatic tracing."""
    docs = vector_db.search(query, top_k=10)
    # @trace captures inputs and outputs; custom metrics are added explicitly
    tracer.enrich_span(metrics={"retrieval_count": len(docs)})
    return docs
@trace(tracer=tracer, event_name="reranking", event_type="tool")
def rerank(docs: list, query: str) -> list:
    """Reranking component with automatic tracing."""
    ranked = reranker.rerank(query, docs)
    tracer.enrich_span(metrics={"rerank_score": ranked[0].score})
    return ranked


@trace(tracer=tracer, event_name="generation", event_type="tool")
def generate_answer(query: str, docs: list) -> str:
    """Generation component with automatic tracing."""
    context = "\n".join(d.content for d in docs)
    answer = llm.generate(f"Context: {context}\n\nQuestion: {query}")
    tracer.enrich_span(metrics={"answer_length": len(answer)})
    return answer
def rag_pipeline(datapoint: Dict[str, Any]) -> Dict[str, Any]:
    """Multi-step RAG pipeline using decorated helper functions.

    Args:
        datapoint: Contains 'inputs' and 'ground_truth'

    Returns:
        Dictionary with pipeline outputs
    """
    inputs = datapoint.get("inputs", {})
    query = inputs["question"]

    # Each function call is automatically traced
    docs = retrieve_documents(query)
    ranked_docs = rerank(docs, query)
    answer = generate_answer(query, ranked_docs)

    return {"answer": answer, "sources": ranked_docs}


# Evaluate the entire pipeline
result = evaluate(
    function=rag_pipeline,
    dataset=dataset,
    api_key="your-api-key",
    project="your-project",
)
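The decorated helpers reference vector_db, reranker, and llm clients that are not defined here and are not part of the HoneyHive SDK. If you want to dry-run the example, a minimal sketch of hypothetical stand-ins, defined before the helpers are called, could look like this; swap in your real vector store, reranker, and LLM client:

from dataclasses import dataclass


@dataclass
class Doc:
    content: str
    score: float = 0.0


class _StubVectorDB:
    def search(self, query: str, top_k: int = 10) -> list:
        # Return placeholder documents for the query.
        return [Doc(content=f"doc {i} for {query!r}") for i in range(top_k)]


class _StubReranker:
    def rerank(self, query: str, docs: list) -> list:
        # Assign descending placeholder scores.
        return [Doc(content=d.content, score=1.0 - i * 0.1) for i, d in enumerate(docs)]


class _StubLLM:
    def generate(self, prompt: str) -> str:
        # Echo a canned answer.
        return "placeholder answer"


vector_db = _StubVectorDB()
reranker = _StubReranker()
llm = _StubLLM()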
Component-Level Metrics
Each component can have its own metrics that are tracked separately in HoneyHive:
Retrieval: precision, recall, relevance scores
Reranking: rerank confidence, position changes
Generation: length, quality, fact accuracy
Each metric is attached to its component's span and appears separately in the HoneyHive dashboard.
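For example, retrieval precision and recall can be computed inside the retrieval span against the datapoint's ground truth and attached with enrich_span(). This is a sketch under assumed schema: the ground_truth "doc_ids" field and the doc_id attribute on retrieved documents are hypothetical; adapt them to your own document schema.

with tracer.trace("retrieval"):
    docs = retrieve_documents(query)
    # Hypothetical schema: ground_truth lists the ids of documents that
    # should have been retrieved, and each retrieved doc exposes a doc_id.
    relevant_ids = set(datapoint.get("ground_truth", {}).get("doc_ids", []))
    retrieved_ids = {d.doc_id for d in docs}
    hits = len(relevant_ids & retrieved_ids)
    tracer.enrich_span(metrics={
        "precision": hits / len(retrieved_ids) if retrieved_ids else 0.0,
        "recall": hits / len(relevant_ids) if relevant_ids else 0.0,
    })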
See Also
Running Experiments - Run multi-step experiments
Custom Span Management - Create custom spans
Enable Span Enrichment - Enrich traces with metrics