From dedd95302d06281fc865868b87779b605fea22be Mon Sep 17 00:00:00 2001 From: KUSHAL P Date: Fri, 20 Mar 2026 20:46:25 +0530 Subject: [PATCH] Skip non-meaningful unicode lines to reduce false positives in copyright detection Some unicode or binary-like text was incorrectly being processed during tokenization, leading to false copyright detections. This change adds a lightweight filter in get_tokens() to skip lines that do not contain any alphabetic characters. This prevents non-readable or unicode-heavy content from being parsed, while keeping existing valid detections unaffected. Fixes #4381 Signed-off-by: KUSHAL P --- src/cluecode/copyrights.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 6d17467acf..5388f33142 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -406,6 +406,10 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split): if TRACE_TOK: logger_debug(' get_tokens: bare line: ' + repr(line)) + # 🔥 NEW FIX: Skip non-meaningful / unicode-heavy lines + if not any(c.isalpha() for c in line): + continue + # keep or skip empty lines if not line.strip(): stripped = last_line.lower().strip(string.punctuation)