From c837c0d9cb16f9f718bf40e96d2537d893e29746 Mon Sep 17 00:00:00 2001
From: ntohidi <nasrin@kidocode.com>
Date: Sat, 11 Apr 2026 17:27:18 +0800
Subject: [PATCH] fix(chunking): preserve sentence order in NlpSentenceChunking
 (#1909)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove broken re-import of load_nltk_punkt (already imported at module level).
Replace list(set(sens)) with `return sens` — set() destroyed document order
and silently dropped duplicate sentences.
---
 crawl4ai/chunking_strategy.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py
index f46cb667c..a0bfe1bf4 100644
--- a/crawl4ai/chunking_strategy.py
+++ b/crawl4ai/chunking_strategy.py
@@ -71,7 +71,6 @@ def __init__(self, **kwargs):
         """
         Initialize the NlpSentenceChunking object.
         """
-        from crawl4ai.le.legacy.model_loader import load_nltk_punkt
         load_nltk_punkt()
 
     def chunk(self, text: str) -> list:
@@ -86,7 +85,7 @@ def chunk(self, text: str) -> list:
         sentences = sent_tokenize(text)
         sens = [sent.strip() for sent in sentences]
 
-        return list(set(sens))
+        return sens
 
 
 # Topic-based segmentation using TextTiling