From dedd95302d06281fc865868b87779b605fea22be Mon Sep 17 00:00:00 2001
From: KUSHAL P <kushalmys55@gmail.com>
Date: Fri, 20 Mar 2026 20:46:25 +0530
Subject: [PATCH] Skip non-meaningful unicode lines to reduce false positives
 in copyright detection

Skip non-meaningful unicode lines to reduce false positives in copyright detection

Some Unicode or binary-like text was incorrectly processed during tokenization,
leading to false-positive copyright detections.

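This change adds a lightweight filter in get_tokens() that skips any line that
does not contain at least one alphabetic character, as determined by
str.isalpha(). Because str.isalpha() is Unicode-aware, lines containing letters
from any script are still processed; only lines made up entirely of digits,
symbols, punctuation, or whitespace are skipped. The change is intended to keep
existing valid detections unaffected.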

Fixes #4381

Signed-off-by: KUSHAL P <kushalmys55@gmail.com>
---
 src/cluecode/copyrights.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py
index 6d17467acf..5388f33142 100644
--- a/src/cluecode/copyrights.py
+++ b/src/cluecode/copyrights.py
@@ -406,6 +406,10 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
         if TRACE_TOK:
             logger_debug('  get_tokens: bare line: ' + repr(line))
 
+        # Skip lines with no alphabetic character. NOTE(review): blank lines are skipped here too, before the empty-line handling below.
+        if not any(c.isalpha() for c in line):
+            continue
+
         # keep or skip empty lines
         if not line.strip():
             stripped = last_line.lower().strip(string.punctuation)