xref: /freebsd/contrib/lua/src/llex.c (revision 6be3386466ab79a84b48429ae66244f21526d3df)
1 /*
2 ** $Id: llex.c $
3 ** Lexical Analyzer
4 ** See Copyright Notice in lua.h
5 */
6 
7 #define llex_c
8 #define LUA_CORE
9 
10 #include "lprefix.h"
11 
12 
13 #include <locale.h>
14 #include <string.h>
15 
16 #include "lua.h"
17 
18 #include "lctype.h"
19 #include "ldebug.h"
20 #include "ldo.h"
21 #include "lgc.h"
22 #include "llex.h"
23 #include "lobject.h"
24 #include "lparser.h"
25 #include "lstate.h"
26 #include "lstring.h"
27 #include "ltable.h"
28 #include "lzio.h"
29 
30 
31 
/*
** Advance the lexer by one character: the value produced by 'zgetc'
** on the lexer's input stream becomes the new current character (and
** is also the value of the whole expression). The parameter is
** parenthesized so that any pointer expression (e.g. '&state') can be
** passed safely.
*/
#define next(ls)	((ls)->current = zgetc((ls)->z))



/* true when the current character is a line break ('\n' or '\r') */
#define currIsNewline(ls) \
	((ls)->current == '\n' || (ls)->current == '\r')
37 
38 
/*
** Printable form of every multi-character token, indexed by
** 'token - FIRST_RESERVED'.
*/
/* ORDER RESERVED */
static const char *const luaX_tokens [] = {
    /* reserved words */
    "and", "break", "do", "else", "elseif", "end",
    "false", "for", "function", "goto", "if", "in",
    "local", "nil", "not", "or", "repeat", "return",
    "then", "true", "until", "while",
    /* multi-character operators and end of stream */
    "//", "..", "...",
    "==", ">=", "<=", "~=",
    "<<", ">>", "::",
    "<eof>",
    /* tokens that carry a value */
    "<number>", "<integer>", "<name>", "<string>"
};
49 
50 
51 #define save_and_next(ls) (save(ls, ls->current), next(ls))
52 
53 
54 static l_noret lexerror (LexState *ls, const char *msg, int token);
55 
56 
57 static void save (LexState *ls, int c) {
58   Mbuffer *b = ls->buff;
59   if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
60     size_t newsize;
61     if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
62       lexerror(ls, "lexical element too long", 0);
63     newsize = luaZ_sizebuffer(b) * 2;
64     luaZ_resizebuffer(ls->L, b, newsize);
65   }
66   b->buffer[luaZ_bufflen(b)++] = cast_char(c);
67 }
68 
69 
70 void luaX_init (lua_State *L) {
71   int i;
72   TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
73   luaC_fix(L, obj2gco(e));  /* never collect this name */
74   for (i=0; i<NUM_RESERVED; i++) {
75     TString *ts = luaS_new(L, luaX_tokens[i]);
76     luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
77     ts->extra = cast_byte(i+1);  /* reserved word */
78   }
79 }
80 
81 
82 const char *luaX_token2str (LexState *ls, int token) {
83   if (token < FIRST_RESERVED) {  /* single-byte symbols? */
84     if (lisprint(token))
85       return luaO_pushfstring(ls->L, "'%c'", token);
86     else  /* control character */
87       return luaO_pushfstring(ls->L, "'<\\%d>'", token);
88   }
89   else {
90     const char *s = luaX_tokens[token - FIRST_RESERVED];
91     if (token < TK_EOS)  /* fixed format (symbols and reserved words)? */
92       return luaO_pushfstring(ls->L, "'%s'", s);
93     else  /* names, strings, and numerals */
94       return s;
95   }
96 }
97 
98 
99 static const char *txtToken (LexState *ls, int token) {
100   switch (token) {
101     case TK_NAME: case TK_STRING:
102     case TK_FLT: case TK_INT:
103       save(ls, '\0');
104       return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
105     default:
106       return luaX_token2str(ls, token);
107   }
108 }
109 
110 
111 static l_noret lexerror (LexState *ls, const char *msg, int token) {
112   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
113   if (token)
114     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
115   luaD_throw(ls->L, LUA_ERRSYNTAX);
116 }
117 
118 
119 l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
120   lexerror(ls, msg, ls->t.token);
121 }
122 
123 
124 /*
125 ** creates a new string and anchors it in scanner's table so that
126 ** it will not be collected until the end of the compilation
127 ** (by that time it should be anchored somewhere)
128 */
129 TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
130   lua_State *L = ls->L;
131   TValue *o;  /* entry for 'str' */
132   TString *ts = luaS_newlstr(L, str, l);  /* create new string */
133   setsvalue2s(L, L->top++, ts);  /* temporarily anchor it in stack */
134   o = luaH_set(L, ls->h, s2v(L->top - 1));
135   if (isempty(o)) {  /* not in use yet? */
136     /* boolean value does not need GC barrier;
137        table is not a metatable, so it does not need to invalidate cache */
138     setbtvalue(o);  /* t[string] = true */
139     luaC_checkGC(L);
140   }
141   else {  /* string already present */
142     ts = keystrval(nodefromval(o));  /* re-use value previously stored */
143   }
144   L->top--;  /* remove string from stack */
145   return ts;
146 }
147 
148 
149 /*
150 ** increment line number and skips newline sequence (any of
151 ** \n, \r, \n\r, or \r\n)
152 */
153 static void inclinenumber (LexState *ls) {
154   int old = ls->current;
155   lua_assert(currIsNewline(ls));
156   next(ls);  /* skip '\n' or '\r' */
157   if (currIsNewline(ls) && ls->current != old)
158     next(ls);  /* skip '\n\r' or '\r\n' */
159   if (++ls->linenumber >= MAX_INT)
160     lexerror(ls, "chunk has too many lines", 0);
161 }
162 
163 
164 void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
165                     int firstchar) {
166   ls->t.token = 0;
167   ls->L = L;
168   ls->current = firstchar;
169   ls->lookahead.token = TK_EOS;  /* no look-ahead token */
170   ls->z = z;
171   ls->fs = NULL;
172   ls->linenumber = 1;
173   ls->lastline = 1;
174   ls->source = source;
175   ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
176   luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
177 }
178 
179 
180 
181 /*
182 ** =======================================================
183 ** LEXICAL ANALYZER
184 ** =======================================================
185 */
186 
187 
188 static int check_next1 (LexState *ls, int c) {
189   if (ls->current == c) {
190     next(ls);
191     return 1;
192   }
193   else return 0;
194 }
195 
196 
197 /*
198 ** Check whether current char is in set 'set' (with two chars) and
199 ** saves it
200 */
201 static int check_next2 (LexState *ls, const char *set) {
202   lua_assert(set[2] == '\0');
203   if (ls->current == set[0] || ls->current == set[1]) {
204     save_and_next(ls);
205     return 1;
206   }
207   else return 0;
208 }
209 
210 
211 /* LUA_NUMBER */
212 /*
213 ** This function is quite liberal in what it accepts, as 'luaO_str2num'
214 ** will reject ill-formed numerals. Roughly, it accepts the following
215 ** pattern:
216 **
217 **   %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))*
218 **
219 ** The only tricky part is to accept [+-] only after a valid exponent
220 ** mark, to avoid reading '3-4' or '0xe+1' as a single number.
221 **
222 ** The caller might have already read an initial dot.
223 */
224 static int read_numeral (LexState *ls, SemInfo *seminfo) {
225   TValue obj;
226   const char *expo = "Ee";
227   int first = ls->current;
228   lua_assert(lisdigit(ls->current));
229   save_and_next(ls);
230   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
231     expo = "Pp";
232   for (;;) {
233     if (check_next2(ls, expo))  /* exponent mark? */
234       check_next2(ls, "-+");  /* optional exponent sign */
235     else if (lisxdigit(ls->current) || ls->current == '.')  /* '%x|%.' */
236       save_and_next(ls);
237     else break;
238   }
239   if (lislalpha(ls->current))  /* is numeral touching a letter? */
240     save_and_next(ls);  /* force an error */
241   save(ls, '\0');
242   if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)  /* format error? */
243     lexerror(ls, "malformed number", TK_FLT);
244   if (ttisinteger(&obj)) {
245     seminfo->i = ivalue(&obj);
246     return TK_INT;
247   }
248   else {
249     lua_assert(ttisfloat(&obj));
250     seminfo->r = fltvalue(&obj);
251     return TK_FLT;
252   }
253 }
254 
255 
256 /*
257 ** read a sequence '[=*[' or ']=*]', leaving the last bracket. If
258 ** sequence is well formed, return its number of '='s + 2; otherwise,
259 ** return 1 if it is a single bracket (no '='s and no 2nd bracket);
260 ** otherwise (an unfinished '[==...') return 0.
261 */
262 static size_t skip_sep (LexState *ls) {
263   size_t count = 0;
264   int s = ls->current;
265   lua_assert(s == '[' || s == ']');
266   save_and_next(ls);
267   while (ls->current == '=') {
268     save_and_next(ls);
269     count++;
270   }
271   return (ls->current == s) ? count + 2
272          : (count == 0) ? 1
273          : 0;
274 }
275 
276 
277 static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
278   int line = ls->linenumber;  /* initial line (for error message) */
279   save_and_next(ls);  /* skip 2nd '[' */
280   if (currIsNewline(ls))  /* string starts with a newline? */
281     inclinenumber(ls);  /* skip it */
282   for (;;) {
283     switch (ls->current) {
284       case EOZ: {  /* error */
285         const char *what = (seminfo ? "string" : "comment");
286         const char *msg = luaO_pushfstring(ls->L,
287                      "unfinished long %s (starting at line %d)", what, line);
288         lexerror(ls, msg, TK_EOS);
289         break;  /* to avoid warnings */
290       }
291       case ']': {
292         if (skip_sep(ls) == sep) {
293           save_and_next(ls);  /* skip 2nd ']' */
294           goto endloop;
295         }
296         break;
297       }
298       case '\n': case '\r': {
299         save(ls, '\n');
300         inclinenumber(ls);
301         if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
302         break;
303       }
304       default: {
305         if (seminfo) save_and_next(ls);
306         else next(ls);
307       }
308     }
309   } endloop:
310   if (seminfo)
311     seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
312                                      luaZ_bufflen(ls->buff) - 2 * sep);
313 }
314 
315 
316 static void esccheck (LexState *ls, int c, const char *msg) {
317   if (!c) {
318     if (ls->current != EOZ)
319       save_and_next(ls);  /* add current to buffer for error message */
320     lexerror(ls, msg, TK_STRING);
321   }
322 }
323 
324 
325 static int gethexa (LexState *ls) {
326   save_and_next(ls);
327   esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
328   return luaO_hexavalue(ls->current);
329 }
330 
331 
332 static int readhexaesc (LexState *ls) {
333   int r = gethexa(ls);
334   r = (r << 4) + gethexa(ls);
335   luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
336   return r;
337 }
338 
339 
340 static unsigned long readutf8esc (LexState *ls) {
341   unsigned long r;
342   int i = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
343   save_and_next(ls);  /* skip 'u' */
344   esccheck(ls, ls->current == '{', "missing '{'");
345   r = gethexa(ls);  /* must have at least one digit */
346   while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
347     i++;
348     esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
349     r = (r << 4) + luaO_hexavalue(ls->current);
350   }
351   esccheck(ls, ls->current == '}', "missing '}'");
352   next(ls);  /* skip '}' */
353   luaZ_buffremove(ls->buff, i);  /* remove saved chars from buffer */
354   return r;
355 }
356 
357 
358 static void utf8esc (LexState *ls) {
359   char buff[UTF8BUFFSZ];
360   int n = luaO_utf8esc(buff, readutf8esc(ls));
361   for (; n > 0; n--)  /* add 'buff' to string */
362     save(ls, buff[UTF8BUFFSZ - n]);
363 }
364 
365 
366 static int readdecesc (LexState *ls) {
367   int i;
368   int r = 0;  /* result accumulator */
369   for (i = 0; i < 3 && lisdigit(ls->current); i++) {  /* read up to 3 digits */
370     r = 10*r + ls->current - '0';
371     save_and_next(ls);
372   }
373   esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
374   luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
375   return r;
376 }
377 
378 
379 static void read_string (LexState *ls, int del, SemInfo *seminfo) {
380   save_and_next(ls);  /* keep delimiter (for error messages) */
381   while (ls->current != del) {
382     switch (ls->current) {
383       case EOZ:
384         lexerror(ls, "unfinished string", TK_EOS);
385         break;  /* to avoid warnings */
386       case '\n':
387       case '\r':
388         lexerror(ls, "unfinished string", TK_STRING);
389         break;  /* to avoid warnings */
390       case '\\': {  /* escape sequences */
391         int c;  /* final character to be saved */
392         save_and_next(ls);  /* keep '\\' for error messages */
393         switch (ls->current) {
394           case 'a': c = '\a'; goto read_save;
395           case 'b': c = '\b'; goto read_save;
396           case 'f': c = '\f'; goto read_save;
397           case 'n': c = '\n'; goto read_save;
398           case 'r': c = '\r'; goto read_save;
399           case 't': c = '\t'; goto read_save;
400           case 'v': c = '\v'; goto read_save;
401           case 'x': c = readhexaesc(ls); goto read_save;
402           case 'u': utf8esc(ls);  goto no_save;
403           case '\n': case '\r':
404             inclinenumber(ls); c = '\n'; goto only_save;
405           case '\\': case '\"': case '\'':
406             c = ls->current; goto read_save;
407           case EOZ: goto no_save;  /* will raise an error next loop */
408           case 'z': {  /* zap following span of spaces */
409             luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
410             next(ls);  /* skip the 'z' */
411             while (lisspace(ls->current)) {
412               if (currIsNewline(ls)) inclinenumber(ls);
413               else next(ls);
414             }
415             goto no_save;
416           }
417           default: {
418             esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
419             c = readdecesc(ls);  /* digital escape '\ddd' */
420             goto only_save;
421           }
422         }
423        read_save:
424          next(ls);
425          /* go through */
426        only_save:
427          luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
428          save(ls, c);
429          /* go through */
430        no_save: break;
431       }
432       default:
433         save_and_next(ls);
434     }
435   }
436   save_and_next(ls);  /* skip delimiter */
437   seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
438                                    luaZ_bufflen(ls->buff) - 2);
439 }
440 
441 
442 static int llex (LexState *ls, SemInfo *seminfo) {
443   luaZ_resetbuffer(ls->buff);
444   for (;;) {
445     switch (ls->current) {
446       case '\n': case '\r': {  /* line breaks */
447         inclinenumber(ls);
448         break;
449       }
450       case ' ': case '\f': case '\t': case '\v': {  /* spaces */
451         next(ls);
452         break;
453       }
454       case '-': {  /* '-' or '--' (comment) */
455         next(ls);
456         if (ls->current != '-') return '-';
457         /* else is a comment */
458         next(ls);
459         if (ls->current == '[') {  /* long comment? */
460           size_t sep = skip_sep(ls);
461           luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
462           if (sep >= 2) {
463             read_long_string(ls, NULL, sep);  /* skip long comment */
464             luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
465             break;
466           }
467         }
468         /* else short comment */
469         while (!currIsNewline(ls) && ls->current != EOZ)
470           next(ls);  /* skip until end of line (or end of file) */
471         break;
472       }
473       case '[': {  /* long string or simply '[' */
474         size_t sep = skip_sep(ls);
475         if (sep >= 2) {
476           read_long_string(ls, seminfo, sep);
477           return TK_STRING;
478         }
479         else if (sep == 0)  /* '[=...' missing second bracket? */
480           lexerror(ls, "invalid long string delimiter", TK_STRING);
481         return '[';
482       }
483       case '=': {
484         next(ls);
485         if (check_next1(ls, '=')) return TK_EQ;  /* '==' */
486         else return '=';
487       }
488       case '<': {
489         next(ls);
490         if (check_next1(ls, '=')) return TK_LE;  /* '<=' */
491         else if (check_next1(ls, '<')) return TK_SHL;  /* '<<' */
492         else return '<';
493       }
494       case '>': {
495         next(ls);
496         if (check_next1(ls, '=')) return TK_GE;  /* '>=' */
497         else if (check_next1(ls, '>')) return TK_SHR;  /* '>>' */
498         else return '>';
499       }
500       case '/': {
501         next(ls);
502         if (check_next1(ls, '/')) return TK_IDIV;  /* '//' */
503         else return '/';
504       }
505       case '~': {
506         next(ls);
507         if (check_next1(ls, '=')) return TK_NE;  /* '~=' */
508         else return '~';
509       }
510       case ':': {
511         next(ls);
512         if (check_next1(ls, ':')) return TK_DBCOLON;  /* '::' */
513         else return ':';
514       }
515       case '"': case '\'': {  /* short literal strings */
516         read_string(ls, ls->current, seminfo);
517         return TK_STRING;
518       }
519       case '.': {  /* '.', '..', '...', or number */
520         save_and_next(ls);
521         if (check_next1(ls, '.')) {
522           if (check_next1(ls, '.'))
523             return TK_DOTS;   /* '...' */
524           else return TK_CONCAT;   /* '..' */
525         }
526         else if (!lisdigit(ls->current)) return '.';
527         else return read_numeral(ls, seminfo);
528       }
529       case '0': case '1': case '2': case '3': case '4':
530       case '5': case '6': case '7': case '8': case '9': {
531         return read_numeral(ls, seminfo);
532       }
533       case EOZ: {
534         return TK_EOS;
535       }
536       default: {
537         if (lislalpha(ls->current)) {  /* identifier or reserved word? */
538           TString *ts;
539           do {
540             save_and_next(ls);
541           } while (lislalnum(ls->current));
542           ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
543                                   luaZ_bufflen(ls->buff));
544           seminfo->ts = ts;
545           if (isreserved(ts))  /* reserved word? */
546             return ts->extra - 1 + FIRST_RESERVED;
547           else {
548             return TK_NAME;
549           }
550         }
551         else {  /* single-char tokens ('+', '*', '%', '{', '}', ...) */
552           int c = ls->current;
553           next(ls);
554           return c;
555         }
556       }
557     }
558   }
559 }
560 
561 
562 void luaX_next (LexState *ls) {
563   ls->lastline = ls->linenumber;
564   if (ls->lookahead.token != TK_EOS) {  /* is there a look-ahead token? */
565     ls->t = ls->lookahead;  /* use this one */
566     ls->lookahead.token = TK_EOS;  /* and discharge it */
567   }
568   else
569     ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
570 }
571 
572 
573 int luaX_lookahead (LexState *ls) {
574   lua_assert(ls->lookahead.token == TK_EOS);
575   ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
576   return ls->lookahead.token;
577 }
578 
579