From c837c0d9cb16f9f718bf40e96d2537d893e29746 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Sat, 11 Apr 2026 17:27:18 +0800 Subject: [PATCH] fix(chunking): preserve sentence order in NlpSentenceChunking (#1909) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove broken re-import of load_nltk_punkt (already imported at module level). Replace list(set(sens)) with plain return — set() destroyed document order and silently dropped duplicate sentences. --- crawl4ai/chunking_strategy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py index f46cb667c..a0bfe1bf4 100644 --- a/crawl4ai/chunking_strategy.py +++ b/crawl4ai/chunking_strategy.py @@ -71,7 +71,6 @@ def __init__(self, **kwargs): """ Initialize the NlpSentenceChunking object. """ - from crawl4ai.le.legacy.model_loader import load_nltk_punkt load_nltk_punkt() def chunk(self, text: str) -> list: @@ -86,7 +85,7 @@ def chunk(self, text: str) -> list: sentences = sent_tokenize(text) sens = [sent.strip() for sent in sentences] - return list(set(sens)) + return sens # Topic-based segmentation using TextTiling