Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions lib/codegen_deno.ml
Original file line number Diff line number Diff line change
Expand Up @@ -721,8 +721,8 @@ and gen_literal (lit : literal) : string =
if String.length s > 0 && s.[String.length s - 1] = '.' then s ^ "0" else s
| LitBool (true, _) -> "true"
| LitBool (false, _) -> "false"
| LitString (s, _) -> "\"" ^ String.escaped s ^ "\""
| LitChar (c, _) -> "\"" ^ Char.escaped c ^ "\""
| LitString (s, _) -> Js_codegen.js_string_lit s
| LitChar (c, _) -> Js_codegen.js_string_lit (String.make 1 c)
| LitUnit _ -> "Unit"

and gen_pattern ctx (pat : pattern) : string =
Expand Down
60 changes: 58 additions & 2 deletions lib/js_codegen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,62 @@ let mangle (name : string) : string =
if List.mem name js_reserved then name ^ "_"
else name

(** [js_string_lit s] lowers the UTF-8 byte string [s] to a JavaScript
    double-quoted string literal that is safe under strict-mode ESM.

    OCaml's [String.escaped] emits non-ASCII bytes as [\NNN] *decimal*
    sequences; JavaScript parses [\NNN] as *octal* escapes, which strict
    mode rejects ([SyntaxError: Octal escape sequences are not allowed
    in strict mode]) and which would decode to the wrong characters even
    outside strict mode. This helper instead decodes UTF-8 to code points
    and emits [\uXXXX] (BMP) or [\u{XXXXX}] (non-BMP) escapes, which every
    parser mode accepts. Closes #460.

    Escaping rules:
    - backslash, double quote, LF, CR and TAB use their short escapes;
    - other printable ASCII (0x20-0x7E) is copied verbatim;
    - remaining ASCII control bytes and DEL become [\xNN];
    - a structurally valid 2-, 3- or 4-byte sequence (correct lead byte,
      every trailing byte of the form 10xxxxxx, value at most 0x10FFFF)
      becomes a single Unicode escape;
    - any other byte (stray continuation byte, truncated sequence, lead
      byte followed by a non-continuation byte, lead byte 0xF8-0xFF, or a
      4-byte value above 0x10FFFF) is emitted alone as [\u00NN] and
      decoding resumes at the very next byte.

    The last rule guarantees that malformed input never swallows the
    bytes that follow it and never yields [\u{...}] above 0x10FFFF, which
    JavaScript rejects at parse time. Overlong encodings and encoded
    surrogates are not rejected; both still lower to escapes JavaScript
    accepts.

    @param s the raw bytes of the literal, expected to be UTF-8.
    @return the literal text, including the surrounding double quotes. *)
let js_string_lit (s : string) : string =
  let n = String.length s in
  let buf = Buffer.create (n + 8) in
  (* Payload bits of the continuation byte at offset [k]; [None] when [k]
     is past the end of [s] or the byte is not of the form 10xxxxxx. *)
  let continuation k =
    if k < n then
      let b = Char.code s.[k] in
      if b land 0xC0 = 0x80 then Some (b land 0x3F) else None
    else None
  in
  (* Decode the multi-byte sequence whose lead byte [b0] sits at offset
     [i]. Returns the code point and the number of bytes consumed, or
     [None] when the sequence is malformed. *)
  let decode i b0 =
    if b0 >= 0xC0 && b0 < 0xE0 then
      (match continuation (i + 1) with
       | Some c1 -> Some (((b0 land 0x1F) lsl 6) lor c1, 2)
       | None -> None)
    else if b0 >= 0xE0 && b0 < 0xF0 then
      (match continuation (i + 1), continuation (i + 2) with
       | Some c1, Some c2 ->
         Some (((b0 land 0x0F) lsl 12) lor (c1 lsl 6) lor c2, 3)
       | _ -> None)
    else if b0 >= 0xF0 && b0 < 0xF8 then
      (match continuation (i + 1), continuation (i + 2), continuation (i + 3) with
       | Some c1, Some c2, Some c3 ->
         let cp =
           ((b0 land 0x07) lsl 18) lor (c1 lsl 12) lor (c2 lsl 6) lor c3
         in
         (* JavaScript refuses \u{...} escapes above U+10FFFF. *)
         if cp <= 0x10FFFF then Some (cp, 4) else None
       | _ -> None)
    else None
  in
  let add_code_point cp =
    if cp <= 0xFFFF then Printf.bprintf buf "\\u%04X" cp
    else Printf.bprintf buf "\\u{%X}" cp
  in
  (* Emit the escape for the byte at offset [i] and continue after the
     bytes it consumed. *)
  let rec emit i =
    if i < n then begin
      let consumed =
        match s.[i] with
        | '\\' -> Buffer.add_string buf "\\\\"; 1
        | '"' -> Buffer.add_string buf "\\\""; 1
        | '\n' -> Buffer.add_string buf "\\n"; 1
        | '\r' -> Buffer.add_string buf "\\r"; 1
        | '\t' -> Buffer.add_string buf "\\t"; 1
        | ' ' .. '~' as c -> Buffer.add_char buf c; 1
        | '\x00' .. '\x1F' | '\x7F' as c ->
          Printf.bprintf buf "\\x%02X" (Char.code c); 1
        | c ->
          (* Only bytes >= 0x80 reach this arm. *)
          let b0 = Char.code c in
          (match decode i b0 with
           | Some (cp, len) -> add_code_point cp; len
           | None -> add_code_point b0; 1)
      in
      emit (i + consumed)
    end
  in
  Buffer.add_char buf '"';
  emit 0;
  Buffer.add_char buf '"';
  Buffer.contents buf

(* ============================================================================
Expression Code Generation
============================================================================ *)
Expand Down Expand Up @@ -230,8 +286,8 @@ and gen_literal (lit : literal) : string =
if String.length s > 0 && s.[String.length s - 1] = '.' then s ^ "0" else s
| LitBool (true, _) -> "true"
| LitBool (false, _) -> "false"
| LitString (s, _) -> "\"" ^ String.escaped s ^ "\""
| LitChar (c, _) -> "\"" ^ Char.escaped c ^ "\""
| LitString (s, _) -> js_string_lit s
| LitChar (c, _) -> js_string_lit (String.make 1 c)
| LitUnit _ -> "Unit"

and gen_pattern ctx (pat : pattern) : string =
Expand Down
17 changes: 17 additions & 0 deletions tests/codegen-deno/non_ascii.affine
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// SPDX-License-Identifier: MPL-2.0
// issue #460 — non-ASCII string literals must round-trip under
// strict-mode ESM. Pre-fix, the JS codegen used OCaml `String.escaped`
// which emitted `\NNN` decimal sequences; the JS parser reads `\NNN`
// as OCTAL escapes, which strict-mode ESM rejects with
// `SyntaxError: Octal escape sequences are not allowed in strict mode`.
// Post-fix, non-ASCII bytes lower to `\uXXXX` / `\u{XXXXX}` Unicode
// escapes which all JS parser modes accept.

// Non-ASCII symbols inside the BMP; the harness expects each to come
// back unchanged after passing through the JS string-literal lowering.
pub fn emoji_cross() -> String { return "❌"; }
pub fn emoji_check() -> String { return "✓"; }
// Several consecutive non-ASCII characters in one literal.
pub fn cjk_hello() -> String { return "你好"; }
// Accented Latin letters interleaved with plain ASCII letters and a space.
pub fn latin_accent() -> String { return "café résumé"; }
// Code point outside the BMP (the harness compares against \u{1F62D}),
// i.e. the case that needs the braced `\u{XXXXX}` escape form.
pub fn non_bmp_sob() -> String { return "😭"; }
// ASCII punctuation, accented Latin, CJK and a symbol in a single literal.
pub fn mixed() -> String { return "[OK] café 你好 ❌"; }
// Control case: a pure-ASCII literal, which the harness expects unchanged.
pub fn ascii_only() -> String { return "plain ASCII"; }
// Escaped double quotes and an escaped backslash must keep their meaning.
pub fn quotes_and_backslash() -> String { return "\"escaped\" and \\back"; }
28 changes: 28 additions & 0 deletions tests/codegen-deno/non_ascii.harness.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// SPDX-License-Identifier: MPL-2.0
// issue #460 — round-trip non-ASCII string literals through the
// Deno-ESM backend under strict-mode ESM. The `import` itself is the
// strictest test: if the emitted `.deno.js` contains octal escapes,
// the module fails to parse and the import throws SyntaxError before
// any assertion can run.
import assert from "node:assert/strict";
import {
emoji_cross,
emoji_check,
cjk_hello,
latin_accent,
non_bmp_sob,
mixed,
ascii_only,
quotes_and_backslash,
} from "./non_ascii.deno.js";

// Round-trip table: [function exported by the generated module,
// string it must return, message reported when the comparison fails].
// Rows are checked strictly in order, and each function is only called
// immediately before its own comparison.
const roundTripCases = [
  [emoji_cross, "❌", "BMP emoji ❌ round-trips"],
  [emoji_check, "✓", "BMP check mark ✓ round-trips"],
  [cjk_hello, "你好", "CJK 'nihao' round-trips"],
  [latin_accent, "café résumé", "Latin accented round-trips"],
  [non_bmp_sob, "\u{1F62D}", "non-BMP code point round-trips"],
  [mixed, "[OK] café 你好 ❌", "mixed ASCII+non-ASCII round-trips"],
  [ascii_only, "plain ASCII", "ASCII-only unchanged"],
  [quotes_and_backslash, "\"escaped\" and \\back", "quote+backslash escapes preserved"],
];

for (const [produce, expected, message] of roundTripCases) {
  assert.equal(produce(), expected, message);
}

// Only reached when every comparison above passed.
console.log("non_ascii.harness.mjs OK");
Loading