from datetime import datetime
from typing import Dict, Any
class DocumentChunk:
    """A piece of document text paired with a metadata dictionary.

    The constructor stores ``content`` and ``metadata`` as given (the dict is
    not copied) and immediately validates that the required metadata keys
    are present.
    """

    def __init__(
        self,
        content: str,
        metadata: Dict[str, Any]
    ):
        """Store the chunk text and metadata, then validate the metadata.

        Args:
            content: The chunk text.
            metadata: Metadata for the chunk; must contain the keys
                'source', 'chunk_id' and 'created_at'.

        Raises:
            ValueError: If any required metadata key is missing.
        """
        self.content = content
        self.metadata = metadata
        self.validate_metadata()

    def validate_metadata(self):
        """Ensure required metadata is present.

        Only key presence is checked; values are not inspected.

        Raises:
            ValueError: If 'source', 'chunk_id' or 'created_at' is absent
                from ``self.metadata``. The message lists the missing keys.
        """
        required = ['source', 'chunk_id', 'created_at']
        missing = [f for f in required if f not in self.metadata]
        if missing:
            raise ValueError(f"Missing metadata: {missing}")

    @classmethod
    def from_document(
        cls,
        content: str,
        source: str,
        chunk_index: int,
        total_chunks: int,
        **extra_metadata
    ) -> 'DocumentChunk':
        """Factory method with standard metadata.

        Args:
            content: The chunk text.
            source: Identifier of the originating document,
                e.g. "docs/api-guide.md".
            chunk_index: Index of this chunk within the document; also used
                to build 'chunk_id' as "<source>_<chunk_index>".
            total_chunks: Number of chunks the document was split into.
            **extra_metadata: Optional additional fields. 'document_type',
                'author', 'last_modified' and 'section' default to None and
                'language' defaults to 'en' when not supplied. Any other
                keyword is stored in the metadata under its own name, but
                never replaces a standard or computed field.

        Returns:
            A new chunk whose metadata contains the fields above plus
            'created_at' (``datetime.now().isoformat()``), 'word_count'
            (whitespace-separated tokens) and 'char_count'.
        """
        metadata: Dict[str, Any] = {
            # Required metadata
            'source': source,  # e.g., "docs/api-guide.md"
            'chunk_id': f"{source}_{chunk_index}",
            'created_at': datetime.now().isoformat(),
            # Chunk context
            'chunk_index': chunk_index,
            'total_chunks': total_chunks,
            # Domain-specific (examples)
            'document_type': extra_metadata.get('document_type'),
            'author': extra_metadata.get('author'),
            'last_modified': extra_metadata.get('last_modified'),
            'section': extra_metadata.get('section'),
            'language': extra_metadata.get('language', 'en'),
            # Quality signals
            'word_count': len(content.split()),
            'char_count': len(content)
        }
        # Keep caller-supplied fields that are not listed above instead of
        # silently dropping them. setdefault leaves every key already in the
        # dict untouched, so extras cannot override standard/computed values.
        for key, value in extra_metadata.items():
            metadata.setdefault(key, value)
        return cls(content, metadata)
# Example: build the chunk at index 5 (of 42) from an API reference document.
chunk = DocumentChunk.from_document(
    "The /users endpoint returns a list of all users...",
    "docs/api-reference.md",
    chunk_index=5,
    total_chunks=42,
    # Optional descriptive fields; they are stored in chunk.metadata.
    section="Endpoints > User Management",
    document_type="api_documentation",
    last_modified="2025-01-15",
)
# chunk.metadata now carries document_type and last_modified, which a
# retrieval step can filter on,
# e.g., "Only search API docs modified after 2025-01-01"