Source code for honeyhive.api.datasets

"""Datasets API module for HoneyHive."""

from typing import List, Literal, Optional

from ..models import CreateDatasetRequest, Dataset, DatasetUpdate
from .base import BaseAPI


class DatasetsAPI(BaseAPI):
    """API for dataset operations.

    Every operation is available in a synchronous and an ``*_async`` flavour.
    The ``*_from_dict`` variants accept a plain dictionary instead of a
    request model and are kept as legacy entry points.
    """

    # ------------------------------------------------------------------
    # Private helpers shared by the sync and async entry points
    # ------------------------------------------------------------------

    @staticmethod
    def _dataset_from_create_response(data: dict, source: object) -> Dataset:
        """Build a Dataset from the JSON body of a ``POST /datasets`` call.

        Two response shapes are handled:

        * New format, ``{"inserted": true, "result": {"insertedId": "..."}}``:
          the response does not carry the dataset itself, so the object is
          rebuilt from the ``project``, ``name``, ``description`` and
          ``metadata`` values of ``source`` and the inserted ID is attached
          as the dynamic attribute ``_id``.
        * Legacy format: the body is the dataset object and is passed to
          ``Dataset`` directly.

        Args:
            data: Decoded JSON response body.
            source: What was submitted - either a request model (fields are
                read as attributes) or a plain dict (fields are read with
                ``dict.get`` so missing keys become ``None``).

        Returns:
            The created Dataset.
        """
        # Only treat the payload as an insertion result when "result" really
        # is a mapping; a null or scalar "result" falls through to the legacy
        # path instead of raising TypeError on the membership test.
        result = data.get("result") if isinstance(data, dict) else None
        if not (isinstance(result, dict) and "insertedId" in result):
            # Legacy format: direct dataset object
            return Dataset(**data)

        inserted_id = result["insertedId"]

        # The submitted values are read only on this path, exactly like the
        # inline code this helper replaces.
        echoed_fields = ("project", "name", "description", "metadata")
        if isinstance(source, dict):
            values = {key: source.get(key) for key in echoed_fields}
        else:
            values = {key: getattr(source, key) for key in echoed_fields}

        dataset = Dataset(**values)
        # Attach ID as a dynamic attribute for retrieval
        setattr(dataset, "_id", inserted_id)
        return dataset

    @staticmethod
    def _dataset_from_lookup_response(data: dict, dataset_id: str) -> Dataset:
        """Return the first dataset from a ``GET /datasets`` lookup response.

        Args:
            data: Decoded JSON response body; the backend returns
                ``{"testcases": [dataset]}``.
            dataset_id: ID that was requested, used in the error message.

        Returns:
            Dataset built from the first entry under ``"testcases"``.

        Raises:
            ValueError: If ``"testcases"`` is missing or empty.
        """
        datasets = data.get("testcases", [])
        if not datasets:
            raise ValueError(f"Dataset not found: {dataset_id}")
        return Dataset(**datasets[0])

    @staticmethod
    def _build_list_params(
        project: Optional[str],
        dataset_type: Optional[Literal["evaluation", "fine-tuning"]],
        dataset_id: Optional[str],
        name: Optional[str],
        include_datapoints: bool,
        limit: int,
    ) -> dict:
        """Assemble the query parameters for listing datasets.

        ``limit`` is always sent (as a string); every other filter is only
        included when it is truthy. ``dataset_type`` is sent under the key
        ``"type"``.
        """
        params = {"limit": str(limit)}
        if project:
            params["project"] = project
        if dataset_type:
            params["type"] = dataset_type
        if dataset_id:
            params["dataset_id"] = dataset_id
        if name:
            params["name"] = name
        if include_datapoints:
            params["include_datapoints"] = str(include_datapoints).lower()
        return params

    # ------------------------------------------------------------------
    # Create
    # ------------------------------------------------------------------

    def create_dataset(self, request: CreateDatasetRequest) -> Dataset:
        """Create a new dataset using CreateDatasetRequest model.

        Args:
            request: Dataset definition; serialized with
                ``model_dump(mode="json", exclude_none=True)``.

        Returns:
            The created Dataset. When the API answers with an insertion
            result, the inserted ID is available as the ``_id`` attribute.
        """
        response = self.client.request(
            "POST",
            "/datasets",
            json=request.model_dump(mode="json", exclude_none=True),
        )
        return self._dataset_from_create_response(response.json(), request)

    def create_dataset_from_dict(self, dataset_data: dict) -> Dataset:
        """Create a new dataset from dictionary (legacy method).

        Args:
            dataset_data: Raw request body, sent as-is.

        Returns:
            The created Dataset. When the API answers with an insertion
            result, the inserted ID is available as the ``_id`` attribute.
        """
        response = self.client.request("POST", "/datasets", json=dataset_data)
        return self._dataset_from_create_response(response.json(), dataset_data)

    async def create_dataset_async(self, request: CreateDatasetRequest) -> Dataset:
        """Create a new dataset asynchronously using CreateDatasetRequest model.

        Args:
            request: Dataset definition; serialized with
                ``model_dump(mode="json", exclude_none=True)``.

        Returns:
            The created Dataset. When the API answers with an insertion
            result, the inserted ID is available as the ``_id`` attribute.
        """
        response = await self.client.request_async(
            "POST",
            "/datasets",
            json=request.model_dump(mode="json", exclude_none=True),
        )
        return self._dataset_from_create_response(response.json(), request)

    async def create_dataset_from_dict_async(self, dataset_data: dict) -> Dataset:
        """Create a new dataset asynchronously from dictionary (legacy method).

        Args:
            dataset_data: Raw request body, sent as-is.

        Returns:
            The created Dataset. When the API answers with an insertion
            result, the inserted ID is available as the ``_id`` attribute.
        """
        response = await self.client.request_async(
            "POST", "/datasets", json=dataset_data
        )
        return self._dataset_from_create_response(response.json(), dataset_data)

    # ------------------------------------------------------------------
    # Read
    # ------------------------------------------------------------------

    def get_dataset(self, dataset_id: str) -> Dataset:
        """Get a dataset by ID.

        Args:
            dataset_id: ID of the dataset to fetch.

        Returns:
            The matching Dataset.

        Raises:
            ValueError: If the response contains no dataset.
        """
        response = self.client.request(
            "GET", "/datasets", params={"dataset_id": dataset_id}
        )
        return self._dataset_from_lookup_response(response.json(), dataset_id)

    async def get_dataset_async(self, dataset_id: str) -> Dataset:
        """Get a dataset by ID asynchronously.

        Args:
            dataset_id: ID of the dataset to fetch.

        Returns:
            The matching Dataset.

        Raises:
            ValueError: If the response contains no dataset.
        """
        response = await self.client.request_async(
            "GET", "/datasets", params={"dataset_id": dataset_id}
        )
        return self._dataset_from_lookup_response(response.json(), dataset_id)

    def list_datasets(
        self,
        project: Optional[str] = None,
        *,
        dataset_type: Optional[Literal["evaluation", "fine-tuning"]] = None,
        dataset_id: Optional[str] = None,
        name: Optional[str] = None,
        include_datapoints: bool = False,
        limit: int = 100,
    ) -> List[Dataset]:
        """List datasets with optional filtering.

        Args:
            project: Project name to filter by
            dataset_type: Type of dataset - "evaluation" or "fine-tuning"
            dataset_id: Specific dataset ID to filter by
            name: Dataset name to filter by (exact match)
            include_datapoints: Include datapoints in response (may impact
                performance)
            limit: Maximum number of datasets to return (default: 100)

        Returns:
            List of Dataset objects matching the filters

        Examples:
            Find dataset by name::

                datasets = client.datasets.list_datasets(
                    project="My Project",
                    name="Training Data Q4"
                )

            Get specific dataset with datapoints::

                dataset = client.datasets.list_datasets(
                    dataset_id="663876ec4611c47f4970f0c3",
                    include_datapoints=True
                )[0]

            Filter by type and name::

                eval_datasets = client.datasets.list_datasets(
                    dataset_type="evaluation",
                    name="Regression Tests"
                )
        """
        params = self._build_list_params(
            project, dataset_type, dataset_id, name, include_datapoints, limit
        )
        response = self.client.request("GET", "/datasets", params=params)
        data = response.json()
        return self._process_data_dynamically(
            data.get("testcases", []), Dataset, "testcases"
        )

    async def list_datasets_async(
        self,
        project: Optional[str] = None,
        *,
        dataset_type: Optional[Literal["evaluation", "fine-tuning"]] = None,
        dataset_id: Optional[str] = None,
        name: Optional[str] = None,
        include_datapoints: bool = False,
        limit: int = 100,
    ) -> List[Dataset]:
        """List datasets asynchronously with optional filtering.

        Args:
            project: Project name to filter by
            dataset_type: Type of dataset - "evaluation" or "fine-tuning"
            dataset_id: Specific dataset ID to filter by
            name: Dataset name to filter by (exact match)
            include_datapoints: Include datapoints in response (may impact
                performance)
            limit: Maximum number of datasets to return (default: 100)

        Returns:
            List of Dataset objects matching the filters

        Examples:
            Find dataset by name::

                datasets = await client.datasets.list_datasets_async(
                    project="My Project",
                    name="Training Data Q4"
                )

            Get specific dataset with datapoints::

                dataset = (
                    await client.datasets.list_datasets_async(
                        dataset_id="663876ec4611c47f4970f0c3",
                        include_datapoints=True
                    )
                )[0]

            Filter by type and name::

                eval_datasets = await client.datasets.list_datasets_async(
                    dataset_type="evaluation",
                    name="Regression Tests"
                )
        """
        params = self._build_list_params(
            project, dataset_type, dataset_id, name, include_datapoints, limit
        )
        response = await self.client.request_async("GET", "/datasets", params=params)
        data = response.json()
        return self._process_data_dynamically(
            data.get("testcases", []), Dataset, "testcases"
        )

    # ------------------------------------------------------------------
    # Update
    # ------------------------------------------------------------------

    def update_dataset(self, dataset_id: str, request: DatasetUpdate) -> Dataset:
        """Update a dataset using DatasetUpdate model.

        Args:
            dataset_id: ID of the dataset to update; placed in the URL path.
            request: Fields to change; serialized with
                ``model_dump(mode="json", exclude_none=True)``.

        Returns:
            Dataset built from the response body.
        """
        response = self.client.request(
            "PUT",
            f"/datasets/{dataset_id}",
            json=request.model_dump(mode="json", exclude_none=True),
        )
        return Dataset(**response.json())

    def update_dataset_from_dict(self, dataset_id: str, dataset_data: dict) -> Dataset:
        """Update a dataset from dictionary (legacy method).

        Args:
            dataset_id: ID of the dataset to update; placed in the URL path.
            dataset_data: Raw request body, sent as-is.

        Returns:
            Dataset built from the response body.
        """
        response = self.client.request(
            "PUT", f"/datasets/{dataset_id}", json=dataset_data
        )
        return Dataset(**response.json())

    async def update_dataset_async(
        self, dataset_id: str, request: DatasetUpdate
    ) -> Dataset:
        """Update a dataset asynchronously using DatasetUpdate model.

        Args:
            dataset_id: ID of the dataset to update; placed in the URL path.
            request: Fields to change; serialized with
                ``model_dump(mode="json", exclude_none=True)``.

        Returns:
            Dataset built from the response body.
        """
        response = await self.client.request_async(
            "PUT",
            f"/datasets/{dataset_id}",
            json=request.model_dump(mode="json", exclude_none=True),
        )
        return Dataset(**response.json())

    async def update_dataset_from_dict_async(
        self, dataset_id: str, dataset_data: dict
    ) -> Dataset:
        """Update a dataset asynchronously from dictionary (legacy method).

        Args:
            dataset_id: ID of the dataset to update; placed in the URL path.
            dataset_data: Raw request body, sent as-is.

        Returns:
            Dataset built from the response body.
        """
        response = await self.client.request_async(
            "PUT", f"/datasets/{dataset_id}", json=dataset_data
        )
        return Dataset(**response.json())

    # ------------------------------------------------------------------
    # Delete
    # ------------------------------------------------------------------

    def delete_dataset(self, dataset_id: str) -> bool:
        """Delete a dataset by ID.

        The request is issued inside ``self.error_handler.handle_operation``
        with a context describing this operation.

        Args:
            dataset_id: ID of the dataset to delete; sent as a query parameter.

        Returns:
            True if the response status code is 200, False otherwise.
        """
        context = self._create_error_context(
            operation="delete_dataset",
            method="DELETE",
            path="/datasets",
            additional_context={"dataset_id": dataset_id},
        )
        # NOTE(review): if handle_operation ever suppresses an exception, this
        # method falls out of the ``with`` and returns None - confirm against
        # the error handler's contract.
        with self.error_handler.handle_operation(context):
            response = self.client.request(
                "DELETE", "/datasets", params={"dataset_id": dataset_id}
            )
            return response.status_code == 200

    async def delete_dataset_async(self, dataset_id: str) -> bool:
        """Delete a dataset by ID asynchronously.

        The request is issued inside ``self.error_handler.handle_operation``
        with a context describing this operation.

        Args:
            dataset_id: ID of the dataset to delete; sent as a query parameter.

        Returns:
            True if the response status code is 200, False otherwise.
        """
        context = self._create_error_context(
            operation="delete_dataset_async",
            method="DELETE",
            path="/datasets",
            additional_context={"dataset_id": dataset_id},
        )
        # NOTE(review): if handle_operation ever suppresses an exception, this
        # method falls out of the ``with`` and returns None - confirm against
        # the error handler's contract.
        with self.error_handler.handle_operation(context):
            response = await self.client.request_async(
                "DELETE", "/datasets", params={"dataset_id": dataset_id}
            )
            return response.status_code == 200