gh-185: Update ident name rules.

python-processing-unit · python-processing-unit · commit e16fc8e7d565 · 2026-05-12T05:44:32.000-04:00
Closes #185
diff --git a/docs/SPECIFICATION.html b/docs/SPECIFICATION.html
@@ -54,7 +54,7 @@
 
 Keywords and built-in names MUST be matched case-sensitively and MUST be written in their canonical uppercase forms. If a reserved word is written in any other case, it MUST be tokenized as an identifier instead.
 
-The character `-` MUST be interpreted only as the leading sign of a numeric literal. Any unsupported use of `-` MUST raise a syntax error.
+The character `-` MUST introduce a negative numeric literal when it is followed, optionally after horizontal whitespace, by a `0`-prefixed numeric base marker. The exact spelling `-INF` MUST remain reserved for negative infinity, and the exact spelling `-NaN` MUST remain invalid. In all other cases, `-` MAY participate in identifiers subject to [2.3](#23-identifiers).
 
 The character `~` MUST be reserved for coerced function parameters and MUST NOT appear inside identifiers.
 
@@ -64,9 +64,9 @@
 
 Identifiers MUST be non-empty and case-sensitive. Variables and user-defined functions share a single flat namespace, so one name MUST NOT denote both a variable and a function. A user-defined function name MUST NOT conflict with any built-in operator or function name.
 
-Identifiers MUST NOT contain non-ASCII characters or any of the following characters: `{`, `}`, `[`, `]`, `(`, `)`, `=`, `,`, `!`, `~`, or `@`. The first character of an identifier MUST NOT be `0`.
+Identifiers MUST NOT contain non-ASCII characters or any of the following characters: `{`, `}`, `[`, `]`, `(`, `)`, `=`, `,`, `!`, `~`, or `@`. The first character of an identifier MUST NOT be `0`, and the first two characters of an identifier MUST NOT be `-0`.
 
-The first identifier character MAY be a letter `A-Z` or `a-z`, a decimal digit `1-9`, or one of `/`, `$`, `%`, `&`, `_`, `+`, `|`, or `?`. Subsequent identifier characters MAY additionally include the digit `0`. This permissive ASCII-only character set preserves an unambiguous distinction between identifiers and numeric literals, which MUST begin with a `0`-prefixed base marker.
+The first identifier character MAY be a letter `A-Z` or `a-z`, a decimal digit `1-9`, or one of `/`, `$`, `%`, `&`, `_`, `+`, `|`, `?`, `*`, or `-`. If the first identifier character is `-`, the second character MAY be any otherwise valid identifier character except `0`. Subsequent identifier characters MAY be any permitted first character and MAY additionally include the digit `0`. The exact spelling `-INF` is reserved by [4.3.2](#432-special-values). This permissive ASCII-only character set preserves an unambiguous distinction between identifiers and numeric literals, which MUST begin with a `0`-prefixed base marker, optionally preceded by a `-` sign.
 
 ---
 
diff --git a/src/lexer.c b/src/lexer.c
@@ -51,6 +51,9 @@ static void consume_line_continuation(Lexer* lexer);
 static int hex_digit(char c);
 static bool is_base_prefix_char(char c);
 static bool is_number_body_char(char c);
+static bool is_identifier_start_char(char c);
+static bool is_identifier_body_char(char c);
+static bool matches_reserved_signed_special(Lexer* lexer, size_t index, const char* text);
 
 static PTokenType check_keyword(const char* text, size_t length) {
 #define KEYWORD(str, type) \
@@ -202,6 +205,30 @@ static bool is_number_body_char(char c) {
     return c == '+' || c == '_';
 }
 
+static bool is_identifier_start_char(char c) { /* May `c` begin an identifier? The set omits '0' and '-'; NUL never matches. */
+    return c != '\0' && strchr("abcdefghijklmnopqrstuvwxyz123456789/ABCDEFGHIJKLMNOPQRSTUVWXYZ$%&_+|?*", c) != NULL; /* strchr() would otherwise match the set's terminator */
+}
+
+static bool is_identifier_body_char(char c) { /* May `c` continue an identifier after its first character? NUL never matches. */
+    return c != '\0' && strchr("abcdefghijklmnopqrstuvwxyz1234567890/ABCDEFGHIJKLMNOPQRSTUVWXYZ$%&_+|?*-", c) != NULL; /* strchr() would otherwise match the set's terminator */
+}
+
+static bool matches_reserved_signed_special(Lexer* lexer, size_t index, const char* text) {
+    /* True when `text` sits at source[index] and is not continued by an identifier body character. */
+    const size_t span = strlen(text);
+    const size_t stop = index + span;
+
+    /* The candidate must fit inside the source and match byte-for-byte. */
+    if (stop > lexer->source_len || memcmp(lexer->source + index, text, span) != 0) {
+        return false;
+    }
+    /* A match that ends exactly at end of input cannot be extended. */
+    if (stop >= lexer->source_len) {
+        return true;
+    }
+    return !is_identifier_body_char(lexer->source[stop]);
+}
+
 Token lexer_next_token(Lexer* lexer) {
     while (!is_at_end(lexer)) {
         char c = peek(lexer);
@@ -270,50 +297,35 @@ Token lexer_next_token(Lexer* lexer) {
         }
 
         if (c == '-') {
-            int start_line = lexer->line;
-            int start_col = lexer->column;
-            advance(lexer);
-
-            // Determine the previous non-whitespace character to avoid
-            // interpreting a mid-token '-' (eg. `1-10`) as a negative number
-            // start. If the previous significant character is a digit,
-            // identifier character, or a closing bracket, treat '-' as a
-            // plain DASH token.
-            int prev_index = (int)lexer->current - 2; // position before the '-'
-            while (prev_index >= 0) {
-                char pc = lexer->source[prev_index];
-                if (pc == ' ' || pc == '\t' || pc == '\r' || pc == '\n') {
-                    prev_index--;
-                    continue;
-                }
-                // found a non-whitespace previous char
-                if (pc == '0' || pc == '1' || isalnum((unsigned char)pc) || pc == ']' || pc == ')' || pc == '}' ) {
-                    Token t = {TOKEN_DASH, safe_strdup("-"), start_line, start_col};
-                    return t;
-                }
-                break;
-            }
-
-            size_t lookahead = lexer->current;
+            size_t lookahead = lexer->current + 1;
             while (lookahead < lexer->source_len &&
                   (lexer->source[lookahead] == ' ' || lexer->source[lookahead] == '\t' || lexer->source[lookahead] == '\r')) {
                 lookahead++;
             }
-                if (lookahead + 1 < lexer->source_len &&
-                    (lexer->source[lookahead] == '0' && is_base_prefix_char(lexer->source[lookahead + 1]))) {
-                   while(lexer->current < lookahead) advance(lexer);
-                   return number_token(lexer, true);
+
+            if (lookahead + 1 < lexer->source_len &&
+                lexer->source[lookahead] == '0' &&
+                is_base_prefix_char(lexer->source[lookahead + 1])) {
+                advance(lexer);
+                while (lexer->current < lookahead) advance(lexer);
+                return number_token(lexer, true);
+            }
+
+            if ((peek_next(lexer) == '0') ||
+                matches_reserved_signed_special(lexer, lookahead, "INF") ||
+                matches_reserved_signed_special(lexer, lookahead, "NaN")) {
+                advance(lexer);
+                return make_token(lexer, TOKEN_DASH, "-", 1);
             }
 
-            Token t = {TOKEN_DASH, safe_strdup("-"), start_line, start_col};
-            return t;
+            return identifier_token(lexer);
         }
 
         if (c == '0' && is_base_prefix_char(peek_next(lexer))) {
             return number_token(lexer, false);
         }
 
-        if (strchr("abcdefghijklmnopqrstuvwxyz123456789/ABCDEFGHIJKLMNOPQRSTUVWXYZ$%&_+|?", c)) {
+        if (is_identifier_start_char(c)) {
             return identifier_token(lexer);
         }
 
@@ -484,7 +496,7 @@ static Token identifier_token(Lexer* lexer) {
     
     while (!is_at_end(lexer)) {
         char c = peek(lexer);
-        if (strchr("abcdefghijklmnopqrstuvwxyz1234567890/ABCDEFGHIJKLMNOPQRSTUVWXYZ$%&_+|?", c)) {
+        if (is_identifier_body_char(c)) {
             advance(lexer);
             if (len_val + 1 >= capacity) { capacity *= 2; value = safe_realloc(value, capacity); }
             value[len_val++] = c;
diff --git a/src/token.h b/src/token.h
@@ -31,7 +31,7 @@ typedef enum {
     TOKEN_TILDE,    // ~
     TOKEN_HASH,     // #
     TOKEN_DOT,      // .
-    TOKEN_DASH,     // - (reserved; negative numeric literals are part of NUMBER/FLOAT; standalone '-' is a syntax error)
+    TOKEN_DASH,     // - (reserved for signed special-value syntax and invalid leading -0... forms; other '-' spellings may be identifiers)
 
     // Keywords
     TOKEN_TRY,
diff --git a/tests/cases/failing/dash-zero-starts-ident.pre b/tests/cases/failing/dash-zero-starts-ident.pre
@@ -0,0 +1 @@
+BOOL -0name = TRUE
diff --git a/tests/cases/passing/ident-includes-chars.pre b/tests/cases/passing/ident-includes-chars.pre
@@ -67,3 +67,4 @@ BOOL _ = TRUE
 BOOL + = TRUE
 BOOL | = TRUE
 BOOL ? = TRUE
+BOOL * = TRUE
diff --git a/tests/cases/passing/ident-includes-dash.pre b/tests/cases/passing/ident-includes-dash.pre
@@ -0,0 +1,9 @@
+BOOL - = TRUE
+BOOL -lead = TRUE
+BOOL mid-dash = TRUE
+BOOL a-0tail = TRUE
+
+ASSERT(-)
+ASSERT(-lead)
+ASSERT(mid-dash)
+ASSERT(a-0tail)