Split text into chunks based on semantic similarity with advanced features
The SemanticChunker splits text into chunks based on semantic similarity, ensuring that related content stays together in the same chunk. This chunker now includes advanced features like Savitzky-Golay filtering for smoother boundary detection and skip-window merging for connecting related content that may not be consecutive. This chunker is inspired by the work of Greg Kamradt.
```python
from chonkie import SemanticChunker

text = """Artificial intelligence is transforming industries worldwide.
Machine learning algorithms can now process vast amounts of data efficiently.
Deep learning models have achieved remarkable accuracy in complex tasks.

Climate change poses significant challenges to our planet.
Rising temperatures affect ecosystems and biodiversity globally.
Sustainable practices are essential for environmental preservation.

Quantum computing represents a paradigm shift in computation.
These systems leverage quantum mechanical phenomena for processing.
Potential applications include cryptography and drug discovery."""

# Create semantic chunker
chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-32M",
    threshold=0.75,  # Higher threshold = more similar content grouped
    chunk_size=1024
)

chunks = chunker.chunk(text)

# Analyze semantic groupings
for i, chunk in enumerate(chunks):
    print(f"\n--- Semantic Group {i+1} ---")
    print(f"Content: {chunk.text[:100]}...")
    print(f"Token count: {chunk.token_count}")
    print(f"Theme: {chunk.text.split('.')[0]}")  # First sentence as theme indicator
```
Skip-Window Merging
```python
from chonkie import SemanticChunker

# Text with alternating topics
text = """Neural networks process information through interconnected nodes.
The stock market experienced significant volatility this quarter.
Deep learning models require substantial training data for optimization.
Economic indicators point to potential recession risks ahead.
GPU acceleration has revolutionized machine learning computations.
Federal reserve policies impact global financial markets.
Transformer architectures dominate modern NLP applications.
Cryptocurrency markets show correlation with traditional assets."""

# Enable skip-window to merge non-consecutive similar content
chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-32M",
    threshold=0.65,
    chunk_size=512,
    skip_window=2  # Look ahead 2 groups for similar content
)

chunks = chunker.chunk(text)

# AI-related content will be grouped together;
# financial content will be grouped separately
for i, chunk in enumerate(chunks):
    print(f"\nGroup {i+1}: {len(chunk.text.split('.'))} sentences")
    print(f"Preview: {chunk.text[:80]}...")
```
Fine-tuned Similarity Control
```python
from chonkie import SemanticChunker

text = """Your comprehensive document with various topics..."""

# Experiment with different thresholds
thresholds = [0.5, 0.7, 0.9]

for threshold in thresholds:
    chunker = SemanticChunker(
        embedding_model="minishlab/potion-base-32M",
        threshold=threshold,
        chunk_size=512,
        similarity_window=3  # Consider 3 sentences for similarity
    )

    chunks = chunker.chunk(text)
    print(f"\nThreshold {threshold}: {len(chunks)} chunks created")

    # Lower threshold = larger, more diverse chunks
    # Higher threshold = smaller, more focused chunks
    avg_size = sum(c.token_count for c in chunks) / len(chunks)
    print(f"Average chunk size: {avg_size:.1f} tokens")
```
Batch Document Processing
```python
from chonkie import SemanticChunker

# Initialize chunker once
chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-32M",
    threshold=0.7,
    chunk_size=1024,
    min_sentences_per_chunk=2  # Ensure meaningful chunks
)

# Multiple documents with different topics
documents = [
    """Document about artificial intelligence and machine learning...""",
    """Document about climate change and environmental science...""",
    """Document about quantum computing and physics..."""
]

# Process all documents
batch_results = chunker.chunk_batch(documents)

# Analyze results
for doc_idx, chunks in enumerate(batch_results):
    print(f"\nDocument {doc_idx + 1}:")
    print(f"  Total chunks: {len(chunks)}")
    print(f"  Total tokens: {sum(c.token_count for c in chunks)}")

    # Show semantic boundaries
    for i, chunk in enumerate(chunks):
        first_sentence = chunk.text.split('.')[0]
        print(f"  Chunk {i+1}: {first_sentence[:50]}...")
```
Custom Embeddings Integration
```python
from chonkie import SemanticChunker
from chonkie.embeddings import AutoEmbeddings

# Use AutoEmbeddings for automatic model selection
embeddings = AutoEmbeddings.get_embeddings(
    model="sentence-transformers/all-MiniLM-L6-v2"
)

chunker = SemanticChunker(
    embedding_model=embeddings,
    threshold=0.8,
    chunk_size=512
)

# Or use a specific embedding provider
from chonkie.embeddings import OpenAIEmbeddings

openai_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002"
)

chunker = SemanticChunker(
    embedding_model=openai_embeddings,
    threshold=0.75,
    chunk_size=1024
)

text = "Your text to chunk with custom embeddings..."
chunks = chunker.chunk(text)
```
Advanced Filtering Options
```python
from chonkie import SemanticChunker

# Configure Savitzky-Golay filter for smoother boundaries
chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-32M",
    threshold=0.7,
    chunk_size=512,
    filter_window=7,        # Larger window for smoother filtering
    filter_polyorder=4,     # Higher-order polynomial
    filter_tolerance=0.15   # Stricter boundary detection
)

text = """Complex document with subtle topic transitions..."""
chunks = chunker.chunk(text)

# The filtering helps identify more natural semantic boundaries,
# especially in documents with gradual topic shifts
for chunk in chunks:
    print(f"Smooth boundary chunk: {chunk.text[:60]}...")
```
Sentence Configuration
```python
from chonkie import SemanticChunker

# Customize sentence detection
chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-32M",
    threshold=0.7,
    chunk_size=1024,
    min_sentences_per_chunk=3,         # At least 3 sentences per chunk
    min_characters_per_sentence=30,    # Filter out short fragments
    delim=[". ", "! ", "? ", "\n\n"],  # Custom sentence delimiters
    include_delim="prev"               # Include delimiter with previous sentence
)

# Text with various sentence structures
text = """Short sentence. This is a much longer sentence with more detail.
Question here? Exclamation point! New paragraph starts here.

Another paragraph with different content..."""

chunks = chunker.chunk(text)

for chunk in chunks:
    sentences = chunk.text.split('. ')
    print(f"Chunk with {len(sentences)} sentences")
```
RAG Pipeline Integration
```python
from chonkie import SemanticChunker
from chonkie.refinery import OverlapRefinery, EmbeddingsRefinery

# Create semantic chunker
chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-32M",
    threshold=0.7,
    chunk_size=512
)

# Add refineries for RAG optimization
overlap_refinery = OverlapRefinery(overlap_size=50)
embeddings_refinery = EmbeddingsRefinery(
    embedding_model="minishlab/potion-base-32M"
)

# Process document
text = """Your document for RAG system..."""
chunks = chunker.chunk(text)

# Apply refinements
chunks = overlap_refinery.refine(chunks)
chunks = embeddings_refinery.refine(chunks)  # Add embeddings

# Ready for vector database
for chunk in chunks:
    print(f"Chunk ready for indexing: {chunk.text[:50]}...")
    if hasattr(chunk, 'embeddings'):
        print(f"  Embedding shape: {chunk.embeddings.shape}")
```
The SemanticChunker uses Savitzky-Golay filtering for smoother boundary detection in similarity curves. This reduces noise in the semantic similarity signal and provides more stable chunk boundaries.
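To make the role of the filter parameters concrete, here is a minimal, hypothetical sketch (not Chonkie's internal implementation) that smooths a made-up similarity curve with SciPy's `savgol_filter` and treats dips below a threshold as candidate boundaries. The similarity values, window length, and threshold are illustrative assumptions; they loosely mirror the `filter_window` and `filter_polyorder` parameters shown above.

```python
# Sketch only: illustrates why smoothing a noisy similarity curve
# yields more stable boundaries. Not Chonkie's actual code path.
import numpy as np
from scipy.signal import savgol_filter

# Hypothetical cosine similarities between consecutive sentence embeddings
similarities = np.array([0.82, 0.79, 0.81, 0.35, 0.77, 0.80, 0.30, 0.76])

# Smooth the curve with a Savitzky-Golay filter (odd window, low-order polynomial)
smoothed = savgol_filter(similarities, window_length=5, polyorder=2)

# Treat points where the smoothed similarity drops below a threshold
# as candidate chunk boundaries
threshold = 0.6
boundaries = [i + 1 for i, s in enumerate(smoothed) if s < threshold]
print(boundaries)  # Sentence indices where a new chunk would start
```

Without smoothing, a single noisy dip in the raw similarity signal can split what is really one topic; the filter averages out such spikes so only sustained drops register as boundaries.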