@@ -360,6 +360,60 @@ static unsigned char* enc_utf8_to_utf16(const char* s, size_t* out_sz, int littl
360360 return out ;
361361}
362362
/* Count the Unicode code points encoded in a NUL-terminated UTF-8 string.
 *
 * s: UTF-8 text, or NULL (a NULL pointer yields 0).
 *
 * Returns the number of counted units. A well-formed sequence (1-4 bytes,
 * shortest form, no surrogate, at most U+10FFFF) counts as one. A byte that
 * does not start such a sequence also counts as one by itself and scanning
 * resumes at the very next byte, so a malformed multi-byte run contributes
 * one per undecodable byte rather than one for the whole run.
 */
static size_t utf8_codepoint_count(const char *s) {
    const unsigned char *cur = (const unsigned char *)s;
    size_t total = 0;

    if (cur == NULL) {
        return 0;
    }

    while (*cur != 0) {
        const unsigned char lead = *cur;
        size_t width = 0;     /* sequence length announced by the lead byte */
        uint32_t value = 0;   /* payload bits gathered so far */
        uint32_t lowest = 0;  /* smallest value that is not overlong */
        size_t step = 1;      /* malformed input advances a single byte */

        if (lead < 0x80) {
            width = 1;
        } else if ((lead & 0xE0) == 0xC0) {
            width = 2;
            value = (uint32_t)(lead & 0x1F);
            lowest = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            width = 3;
            value = (uint32_t)(lead & 0x0F);
            lowest = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            width = 4;
            value = (uint32_t)(lead & 0x07);
            lowest = 0x10000;
        }

        if (width > 1) {
            size_t k = 1;

            /* The NUL terminator is not a continuation byte, so this loop
             * stops at the end of the string and never reads past it. */
            while (k < width && (cur[k] & 0xC0) == 0x80) {
                value = (value << 6) | (uint32_t)(cur[k] & 0x3F);
                k++;
            }

            /* Accept only complete, shortest-form, in-range, non-surrogate
             * sequences; everything else keeps the one-byte step. */
            if (k == width &&
                value >= lowest &&
                value <= 0x10FFFF &&
                (value < 0xD800 || value > 0xDFFF)) {
                step = width;
            }
        }

        cur += step;
        total++;
    }

    return total;
}
416+
363417static unsigned char * enc_utf8_to_cp1252 (const char * s , size_t * out_sz , int is_windows ) {
364418 size_t slen = s ? strlen (s ) : 0 ;
365419 size_t outcap = slen + 16 ;
@@ -5016,7 +5070,7 @@ static Value builtin_bytes(Interpreter* interp, Value* args, int argc, Expr** ar
/* SLEN builtin: returns the length of the string held in args[0] as an
 * integer Value (via value_int).
 *
 * EXPECT_STR is invoked on args[0] first; presumably it rejects a non-string
 * argument and reports at line/col -- confirm against the macro definition.
 * arg_nodes and env are unused here.
 *
 * This hunk shows both sides of the change: the '-' return measures the
 * length in bytes with strlen, the '+' return measures it with
 * utf8_codepoint_count instead.
 */
50165070static Value builtin_slen (Interpreter * interp , Value * args , int argc , Expr * * arg_nodes , Env * env , int line , int col ) {
50175071 (void )arg_nodes ; (void )env ;
50185072 EXPECT_STR (args [0 ], "SLEN" , interp , line , col );
5019- return value_int ((int64_t )strlen (args [0 ].as .s ));
5073+ return value_int ((int64_t )utf8_codepoint_count (args [0 ].as .s ));
50205074}
50215075
50225076static Value builtin_upper (Interpreter * interp , Value * args , int argc , Expr * * arg_nodes , Env * env , int line , int col ) {
0 commit comments