Skip to content

Commit b0957d7

Browse files
gh-154: Fix SLEN handling of multi-byte chars.
1 parent 53e0b38 commit b0957d7

1 file changed

Lines changed: 55 additions & 1 deletion

File tree

src/builtins.c

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,60 @@ static unsigned char* enc_utf8_to_utf16(const char* s, size_t* out_sz, int littl
360360
return out;
361361
}
362362

363+
/* Count the characters in a NUL-terminated UTF-8 string.
 *
 * Every well-formed sequence (1 to 4 bytes) that encodes a Unicode scalar
 * value counts as one character.  A byte that does not start such a
 * sequence counts as one character on its own, and scanning resumes at the
 * very next byte.  That covers stray continuation bytes, truncated
 * sequences, overlong encodings, encoded surrogate halves
 * (U+D800..U+DFFF) and values above U+10FFFF.  For example the truncated
 * input "\xE2\x82" counts as 2 and the encoded surrogate "\xED\xA0\x80"
 * counts as 3.
 *
 * s may be NULL, in which case 0 is returned.
 */
static size_t utf8_codepoint_count(const char* s) {
    if (!s) return 0;

    const unsigned char* p = (const unsigned char*)s;
    size_t count = 0;

    while (*p) {
        unsigned char lead = *p;
        size_t want = 0;      /* length announced by the lead byte; 0 = not a lead byte */
        uint32_t cp = 0;      /* code point accumulated so far */
        uint32_t min_cp = 0;  /* smallest value that legitimately needs `want` bytes */

        if (lead < 0x80) {
            want = 1; cp = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            want = 2; cp = (uint32_t)(lead & 0x1F); min_cp = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            want = 3; cp = (uint32_t)(lead & 0x0F); min_cp = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            want = 4; cp = (uint32_t)(lead & 0x07); min_cp = 0x10000;
        }

        /* Bytes to consume for this character; 0 means "invalid at p". */
        size_t step = want;

        /* The terminating NUL is not a continuation byte, so this loop stops
         * at the terminator and never reads past the end of the string. */
        for (size_t i = 1; i < want; i++) {
            if ((p[i] & 0xC0) != 0x80) {
                step = 0;
                break;
            }
            cp = (cp << 6) | (uint32_t)(p[i] & 0x3F);
        }

        /* Reject overlong forms, values beyond U+10FFFF and surrogate halves. */
        if (step != 0 &&
            (cp < min_cp || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF))) {
            step = 0;
        }

        /* An invalid byte is consumed alone and still counts as one character. */
        p += (step != 0) ? step : 1;
        count++;
    }

    return count;
}
416+
363417
static unsigned char* enc_utf8_to_cp1252(const char* s, size_t* out_sz, int is_windows) {
364418
size_t slen = s ? strlen(s) : 0;
365419
size_t outcap = slen + 16;
@@ -5016,7 +5070,7 @@ static Value builtin_bytes(Interpreter* interp, Value* args, int argc, Expr** ar
50165070
/* SLEN builtin: length of the string argument args[0].
 *
 * The length is measured with utf8_codepoint_count(), i.e. in UTF-8
 * characters rather than bytes, so a multi-byte character contributes 1.
 * The result is returned as an integer Value via value_int().
 *
 * NOTE(review): EXPECT_STR presumably reports an error at line/col and
 * leaves the function when args[0] is not a string -- confirm against the
 * macro definition.  arg_nodes and env are unused here.
 */
static Value builtin_slen(Interpreter* interp, Value* args, int argc, Expr** arg_nodes, Env* env, int line, int col) {
    (void)arg_nodes; (void)env;
    EXPECT_STR(args[0], "SLEN", interp, line, col);
    /* Count characters, not bytes: strlen() over-reported multi-byte text. */
    return value_int((int64_t)utf8_codepoint_count(args[0].as.s));
}
50215075

50225076
static Value builtin_upper(Interpreter* interp, Value* args, int argc, Expr** arg_nodes, Env* env, int line, int col) {

0 commit comments

Comments
 (0)