Skip to content

Commit e16fc8e

Browse files
gh-185: Update ident name rules.
Closes #185
1 parent 4e0bc12 commit e16fc8e

6 files changed

Lines changed: 60 additions & 37 deletions

File tree

docs/SPECIFICATION.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454

5555
Keywords and built-in names MUST be matched case-sensitively and MUST be written in their canonical uppercase forms. If a reserved word is written in any other case, it MUST be tokenized as an identifier instead.
5656

57-
The character `-` MUST be interpreted only as the leading sign of a numeric literal. Any unsupported use of `-` MUST raise a syntax error.
57+
The character `-` MUST introduce a negative numeric literal when it is followed, optionally after horizontal whitespace, by a `0`-prefixed numeric base marker. The exact spelling `-INF` MUST remain reserved for negative infinity, and the exact spelling `-NaN` MUST remain invalid. In all other cases, `-` MAY participate in identifiers subject to [2.3](#23-identifiers).
5858

5959
The character `~` MUST be reserved for coerced function parameters and MUST NOT appear inside identifiers.
6060

@@ -64,9 +64,9 @@
6464

6565
Identifiers MUST be non-empty and case-sensitive. Variables and user-defined functions share a single flat namespace, so one name MUST NOT denote both a variable and a function. A user-defined function name MUST NOT conflict with any built-in operator or function name.
6666

67-
Identifiers MUST NOT contain non-ASCII characters or any of the following characters: `{`, `}`, `[`, `]`, `(`, `)`, `=`, `,`, `!`, `~`, or `@`. The first character of an identifier MUST NOT be `0`.
67+
Identifiers MUST NOT contain non-ASCII characters or any of the following characters: `{`, `}`, `[`, `]`, `(`, `)`, `=`, `,`, `!`, `~`, or `@`. The first character of an identifier MUST NOT be `0`, and the first two characters of an identifier MUST NOT be `-0`.
6868

69-
The first identifier character MAY be a letter `A-Z` or `a-z`, a decimal digit `1-9`, or one of `/`, `$`, `%`, `&`, `_`, `+`, `|`, or `?`. Subsequent identifier characters MAY additionally include the digit `0`. This permissive ASCII-only character set preserves an unambiguous distinction between identifiers and numeric literals, which MUST begin with a `0`-prefixed base marker.
69+
The first identifier character MAY be a letter `A-Z` or `a-z`, a decimal digit `1-9`, or one of `/`, `$`, `%`, `&`, `_`, `+`, `|`, `?`, `*`, or `-`. If the first identifier character is `-`, the second character MAY be any otherwise valid identifier character except `0`. Subsequent identifier characters MAY be any character permitted in the first position and MAY additionally include the digit `0`. The exact spelling `-INF` is reserved by [4.3.2](#432-special-values). This permissive ASCII-only character set preserves an unambiguous distinction between identifiers and numeric literals, which MUST begin with a `0`-prefixed base marker.
7070

7171
---
7272

src/lexer.c

Lines changed: 45 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ static void consume_line_continuation(Lexer* lexer);
5151
static int hex_digit(char c);
5252
static bool is_base_prefix_char(char c);
5353
static bool is_number_body_char(char c);
54+
static bool is_identifier_start_char(char c);
55+
static bool is_identifier_body_char(char c);
56+
static bool matches_reserved_signed_special(Lexer* lexer, size_t index, const char* text);
5457

5558
static PTokenType check_keyword(const char* text, size_t length) {
5659
#define KEYWORD(str, type) \
@@ -202,6 +205,30 @@ static bool is_number_body_char(char c) {
202205
return c == '+' || c == '_';
203206
}
204207

208+
/**
 * Report whether `c` belongs to the identifier-start character set.
 *
 * The set contains the ASCII letters, the digits 1-9, and the punctuation
 * characters / $ % & _ + | ? *.  The digit 0 and the dash are not in this
 * set.
 *
 * @param c  Character to classify.
 * @return   true when `c` is in the set, false otherwise (including NUL).
 */
static bool is_identifier_start_char(char c) {
    /* strchr() treats the terminating NUL as part of the searched string,
     * so a NUL byte would otherwise be reported as a valid start character. */
    if (c == '\0') {
        return false;
    }
    return strchr("abcdefghijklmnopqrstuvwxyz123456789/ABCDEFGHIJKLMNOPQRSTUVWXYZ$%&_+|?*", c) != NULL;
}
211+
212+
/**
 * Report whether `c` belongs to the identifier-body character set.
 *
 * The set contains the ASCII letters, the digits 0-9, and the punctuation
 * characters / $ % & _ + | ? * -.
 *
 * @param c  Character to classify.
 * @return   true when `c` is in the set, false otherwise (including NUL).
 */
static bool is_identifier_body_char(char c) {
    /* strchr() treats the terminating NUL as part of the searched string,
     * so a NUL byte would otherwise be reported as a valid body character. */
    if (c == '\0') {
        return false;
    }
    return strchr("abcdefghijklmnopqrstuvwxyz1234567890/ABCDEFGHIJKLMNOPQRSTUVWXYZ$%&_+|?*-", c) != NULL;
}
215+
216+
/**
 * Check whether the source text at `index` is exactly `text`, ending either
 * at the end of the source or before a character that is not an
 * identifier-body character.
 *
 * @param lexer  Lexer whose `source` / `source_len` are inspected.
 * @param index  Offset into the source at which the comparison starts.
 * @param text   NUL-terminated spelling to compare against.
 * @return       true when the spelling matches and is not followed by an
 *               identifier-body character; false otherwise.
 */
static bool matches_reserved_signed_special(Lexer* lexer, size_t index, const char* text) {
    const size_t text_len = strlen(text);
    const size_t end = index + text_len;

    /* Not enough input left, or the bytes differ: no match. */
    if (end > lexer->source_len || memcmp(lexer->source + index, text, text_len) != 0) {
        return false;
    }

    /* A match that runs to the end of the source has no trailing character
     * to inspect; otherwise the next character must not extend the word. */
    return end == lexer->source_len || !is_identifier_body_char(lexer->source[end]);
}
231+
205232
Token lexer_next_token(Lexer* lexer) {
206233
while (!is_at_end(lexer)) {
207234
char c = peek(lexer);
@@ -270,50 +297,35 @@ Token lexer_next_token(Lexer* lexer) {
270297
}
271298

272299
if (c == '-') {
273-
int start_line = lexer->line;
274-
int start_col = lexer->column;
275-
advance(lexer);
276-
277-
// Determine the previous non-whitespace character to avoid
278-
// interpreting a mid-token '-' (eg. `1-10`) as a negative number
279-
// start. If the previous significant character is a digit,
280-
// identifier character, or a closing bracket, treat '-' as a
281-
// plain DASH token.
282-
int prev_index = (int)lexer->current - 2; // position before the '-'
283-
while (prev_index >= 0) {
284-
char pc = lexer->source[prev_index];
285-
if (pc == ' ' || pc == '\t' || pc == '\r' || pc == '\n') {
286-
prev_index--;
287-
continue;
288-
}
289-
// found a non-whitespace previous char
290-
if (pc == '0' || pc == '1' || isalnum((unsigned char)pc) || pc == ']' || pc == ')' || pc == '}' ) {
291-
Token t = {TOKEN_DASH, safe_strdup("-"), start_line, start_col};
292-
return t;
293-
}
294-
break;
295-
}
296-
297-
size_t lookahead = lexer->current;
300+
size_t lookahead = lexer->current + 1;
298301
while (lookahead < lexer->source_len &&
299302
(lexer->source[lookahead] == ' ' || lexer->source[lookahead] == '\t' || lexer->source[lookahead] == '\r')) {
300303
lookahead++;
301304
}
302-
if (lookahead + 1 < lexer->source_len &&
303-
(lexer->source[lookahead] == '0' && is_base_prefix_char(lexer->source[lookahead + 1]))) {
304-
while(lexer->current < lookahead) advance(lexer);
305-
return number_token(lexer, true);
305+
306+
if (lookahead + 1 < lexer->source_len &&
307+
lexer->source[lookahead] == '0' &&
308+
is_base_prefix_char(lexer->source[lookahead + 1])) {
309+
advance(lexer);
310+
while (lexer->current < lookahead) advance(lexer);
311+
return number_token(lexer, true);
312+
}
313+
314+
if ((peek_next(lexer) == '0') ||
315+
matches_reserved_signed_special(lexer, lookahead, "INF") ||
316+
matches_reserved_signed_special(lexer, lookahead, "NaN")) {
317+
advance(lexer);
318+
return make_token(lexer, TOKEN_DASH, "-", 1);
306319
}
307320

308-
Token t = {TOKEN_DASH, safe_strdup("-"), start_line, start_col};
309-
return t;
321+
return identifier_token(lexer);
310322
}
311323

312324
if (c == '0' && is_base_prefix_char(peek_next(lexer))) {
313325
return number_token(lexer, false);
314326
}
315327

316-
if (strchr("abcdefghijklmnopqrstuvwxyz123456789/ABCDEFGHIJKLMNOPQRSTUVWXYZ$%&_+|?", c)) {
328+
if (is_identifier_start_char(c)) {
317329
return identifier_token(lexer);
318330
}
319331

@@ -484,7 +496,7 @@ static Token identifier_token(Lexer* lexer) {
484496

485497
while (!is_at_end(lexer)) {
486498
char c = peek(lexer);
487-
if (strchr("abcdefghijklmnopqrstuvwxyz1234567890/ABCDEFGHIJKLMNOPQRSTUVWXYZ$%&_+|?", c)) {
499+
if (is_identifier_body_char(c)) {
488500
advance(lexer);
489501
if (len_val + 1 >= capacity) { capacity *= 2; value = safe_realloc(value, capacity); }
490502
value[len_val++] = c;

src/token.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ typedef enum {
3131
TOKEN_TILDE, // ~
3232
TOKEN_HASH, // #
3333
TOKEN_DOT, // .
34-
TOKEN_DASH, // - (reserved; negative numeric literals are part of NUMBER/FLOAT; standalone '-' is a syntax error)
34+
TOKEN_DASH, // - (reserved for signed special-value syntax and invalid leading -0... forms; other '-' spellings may be identifiers)
3535

3636
// Keywords
3737
TOKEN_TRY,
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
BOOL -0name = TRUE

tests/cases/passing/ident-includes-chars.pre

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,4 @@ BOOL _ = TRUE
6767
BOOL + = TRUE
6868
BOOL | = TRUE
6969
BOOL ? = TRUE
70+
BOOL * = TRUE
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
BOOL - = TRUE
2+
BOOL -lead = TRUE
3+
BOOL mid-dash = TRUE
4+
BOOL a-0tail = TRUE
5+
6+
ASSERT(-)
7+
ASSERT(-lead)
8+
ASSERT(mid-dash)
9+
ASSERT(a-0tail)

0 commit comments

Comments
 (0)