Initialization

from chonkie import TokenChunker

# Basic initialization with default parameters
chunker = TokenChunker(
    tokenizer="character",  # Default tokenizer (or use "gpt2", etc.)
    chunk_size=2048,        # Maximum tokens per chunk
    chunk_overlap=128       # Overlap between chunks
)

# Using a custom tokenizer
from tokenizers import Tokenizer
custom_tokenizer = Tokenizer.from_pretrained("your-tokenizer")
chunker = TokenChunker(
    tokenizer=custom_tokenizer,
    chunk_size=2048,
    chunk_overlap=128
)
Basic Usage

from chonkie import TokenChunker

# Create a chunker with specific parameters
chunker = TokenChunker(
    tokenizer="gpt2",
    chunk_size=1024,
    chunk_overlap=128
)

text = """Natural language processing has revolutionized how we interact with computers.
Machine learning models can now understand context, generate text, and even translate
between languages with remarkable accuracy. This transformation has enabled applications
ranging from virtual assistants to automated content generation."""

# Chunk the text
chunks = chunker.chunk(text)

# Process each chunk
for i, chunk in enumerate(chunks):
    print(f"\n--- Chunk {i+1} ---")
    print(f"Text: {chunk.text}")
    print(f"Token count: {chunk.token_count}")
    print(f"Start index: {chunk.start_index}")
    print(f"End index: {chunk.end_index}")
Batch Processing
Batch processing is only supported in Python
from chonkie import TokenChunker

# Initialize chunker for batch processing
chunker = TokenChunker(
    tokenizer="gpt2",
    chunk_size=512,
    chunk_overlap=50
)

# Multiple documents to process
documents = [
    "First document about machine learning fundamentals...",
    "Second document discussing neural networks...",
    "Third document on natural language processing..."
]

# Process all documents at once
batch_chunks = chunker.chunk_batch(documents)

# Iterate through results
for doc_idx, doc_chunks in enumerate(batch_chunks):
    print(f"\nDocument {doc_idx + 1}: {len(doc_chunks)} chunks")
    for chunk in doc_chunks:
        print(f"  - Chunk: {chunk.text[:50]}... ({chunk.token_count} tokens)")
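chunk_batch returns one list of chunks per input document, in the same order as the inputs. If a downstream store needs a single flat list, it is easy to flatten while keeping the document association; the (doc_idx, chunk) pairing below is an illustrative pattern, not a Chonkie API:

from chonkie import TokenChunker

chunker = TokenChunker(tokenizer="gpt2", chunk_size=512, chunk_overlap=50)
documents = [
    "First document about machine learning fundamentals...",
    "Second document discussing neural networks...",
]
batch_chunks = chunker.chunk_batch(documents)

# Flatten while remembering which document each chunk came from
flat = [
    (doc_idx, chunk)
    for doc_idx, doc_chunks in enumerate(batch_chunks)
    for chunk in doc_chunks
]
print(f"Total chunks across all documents: {len(flat)}")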
Using Custom Tokenizers
Custom tokenizers are only supported in Python. See the Installation section for JavaScript tokenizer support.
from chonkie import TokenChunker
import tiktoken

# Using TikToken with a specific model encoding
tokenizer = tiktoken.get_encoding("cl100k_base")  # GPT-4 encoding
chunker = TokenChunker(
    tokenizer=tokenizer,
    chunk_size=2048,
    chunk_overlap=200
)

# Or using Hugging Face tokenizers
from transformers import AutoTokenizer
hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
chunker = TokenChunker(
    tokenizer=hf_tokenizer,
    chunk_size=512,
    chunk_overlap=50
)

text = "Your text to chunk with custom tokenizer..."
chunks = chunker.chunk(text)
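Since the chunker counts tokens with whatever tokenizer you pass in, the reported token_count should line up with a direct encode of each chunk's text. A quick sanity check, as a sketch (counts can differ slightly at chunk boundaries, since BPE decode/re-encode is not always a perfect round trip):

import tiktoken
from chonkie import TokenChunker

tokenizer = tiktoken.get_encoding("cl100k_base")
chunker = TokenChunker(tokenizer=tokenizer, chunk_size=128, chunk_overlap=16)

for chunk in chunker.chunk("Some long text used to verify token counts... " * 50):
    # Compare the chunker's count against the tokenizer's own encoding
    direct = len(tokenizer.encode(chunk.text))
    print(f"reported={chunk.token_count}, re-encoded={direct}")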
Callable Interface
The callable interface is only supported in Python
from chonkie import TokenChunker

# Initialize once
chunker = TokenChunker(
    tokenizer="gpt2",
    chunk_size=1024,
    chunk_overlap=100
)

# Use as a callable for single text
single_text = "This is a document that needs chunking..."
chunks = chunker(single_text)
print(f"Single text produced {len(chunks)} chunks")

# Use as a callable for multiple texts
multiple_texts = [
    "First document text...",
    "Second document text...",
    "Third document text..."
]
batch_results = chunker(multiple_texts)
print(f"Processed {len(batch_results)} documents")
Overlap Configuration
from chonkie import TokenChunker

# Fixed token overlap
chunker_fixed = TokenChunker(
    tokenizer="gpt2",
    chunk_size=1000,
    chunk_overlap=100  # Exactly 100 tokens overlap
)

# Percentage-based overlap
chunker_percent = TokenChunker(
    tokenizer="gpt2",
    chunk_size=1000,
    chunk_overlap=0.1  # 10% overlap (100 tokens for 1000-token chunks)
)

text = "Long document text that will be chunked with overlap..."

# Compare the results
fixed_chunks = chunker_fixed.chunk(text)
percent_chunks = chunker_percent.chunk(text)
print(f"Fixed overlap: {len(fixed_chunks)} chunks")
print(f"Percentage overlap: {len(percent_chunks)} chunks")
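One way to see the overlap in action is to compare the start and end indices of consecutive chunks: because each chunk records its position in the original text, every chunk after the first should begin before the previous one ends. A minimal sketch:

from chonkie import TokenChunker

chunker = TokenChunker(tokenizer="gpt2", chunk_size=100, chunk_overlap=20)
chunks = chunker.chunk("A long stretch of repeated text for demonstration. " * 100)

# Consecutive chunks share the configured overlap region
for prev, curr in zip(chunks, chunks[1:]):
    shared = prev.end_index - curr.start_index
    print(f"consecutive chunks share {shared} characters of the source")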
Processing Large Documents
from chonkie import TokenChunker

# Configure for large documents
chunker = TokenChunker(
    tokenizer="gpt2",
    chunk_size=4096,   # Larger chunks for efficiency
    chunk_overlap=512  # Maintain context between chunks
)

# Read a large document
with open("large_document.txt", "r") as f:
    large_text = f.read()

# Process efficiently
chunks = chunker.chunk(large_text)
print("Document statistics:")
print(f"  Original length: {len(large_text)} characters")
print(f"  Number of chunks: {len(chunks)}")
print(f"  Average chunk size: {sum(c.token_count for c in chunks) / len(chunks):.1f} tokens")

# Save chunks for further processing
for i, chunk in enumerate(chunks):
    with open(f"chunk_{i:03d}.txt", "w") as f:
        f.write(chunk.text)
Return Type

TokenChunker returns chunks as Chunk objects:

from dataclasses import dataclass

@dataclass
class Chunk:
    text: str          # The chunk text
    start_index: int   # Starting position in original text
    end_index: int     # Ending position in original text
    token_count: int   # Number of tokens in chunk
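Because start_index and end_index are offsets into the original text, slicing the source reproduces a chunk's text exactly, which is handy for highlighting chunks in the original document. A minimal sketch, assuming character offsets as the field comments above indicate:

from chonkie import TokenChunker

chunker = TokenChunker(tokenizer="character", chunk_size=50, chunk_overlap=5)
text = "A short example document used to demonstrate how chunk offsets map back to the source."
chunks = chunker.chunk(text)

for chunk in chunks:
    # Slicing the source with the chunk's offsets recovers its text
    assert text[chunk.start_index:chunk.end_index] == chunk.text
print("All chunk offsets map cleanly back to the source text")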