xref: /freebsd/contrib/lua/src/llex.c (revision 56e53cb8ef000c3ef72337a4095987a932cdedef)
1 /*
2 ** $Id: llex.c,v 2.96 2016/05/02 14:02:12 roberto Exp $
3 ** Lexical Analyzer
4 ** See Copyright Notice in lua.h
5 */
6 
7 #define llex_c
8 #define LUA_CORE
9 
10 #include "lprefix.h"
11 
12 
13 #include <locale.h>
14 #include <string.h>
15 
16 #include "lua.h"
17 
18 #include "lctype.h"
19 #include "ldebug.h"
20 #include "ldo.h"
21 #include "lgc.h"
22 #include "llex.h"
23 #include "lobject.h"
24 #include "lparser.h"
25 #include "lstate.h"
26 #include "lstring.h"
27 #include "ltable.h"
28 #include "lzio.h"
29 
30 
31 
/* advance the lexer: load the next character from stream 'z' into 'current' */
#define next(ls)  ((ls)->current = zgetc((ls)->z))



/* true when the current character is '\n' or '\r' */
#define currIsNewline(ls) \
  ((ls)->current == '\n' || (ls)->current == '\r')
37 
38 
/*
** Textual form of the multi-character tokens, indexed by
** 'token - FIRST_RESERVED'.
** ORDER RESERVED
*/
static const char *const luaX_tokens [] = {
  /* words */
  "and", "break", "do", "else", "elseif", "end", "false", "for",
  "function", "goto", "if", "in", "local", "nil", "not", "or",
  "repeat", "return", "then", "true", "until", "while",
  /* symbols */
  "//", "..", "...", "==", ">=", "<=", "~=", "<<", ">>", "::",
  /* end of stream */
  "<eof>",
  /* numerals, names, and strings */
  "<number>", "<integer>", "<name>", "<string>"
};
49 
50 
51 #define save_and_next(ls) (save(ls, ls->current), next(ls))
52 
53 
54 static l_noret lexerror (LexState *ls, const char *msg, int token);
55 
56 
57 static void save (LexState *ls, int c) {
58   Mbuffer *b = ls->buff;
59   if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
60     size_t newsize;
61     if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
62       lexerror(ls, "lexical element too long", 0);
63     newsize = luaZ_sizebuffer(b) * 2;
64     luaZ_resizebuffer(ls->L, b, newsize);
65   }
66   b->buffer[luaZ_bufflen(b)++] = cast(char, c);
67 }
68 
69 
70 void luaX_init (lua_State *L) {
71   int i;
72   TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
73   luaC_fix(L, obj2gco(e));  /* never collect this name */
74   for (i=0; i<NUM_RESERVED; i++) {
75     TString *ts = luaS_new(L, luaX_tokens[i]);
76     luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
77     ts->extra = cast_byte(i+1);  /* reserved word */
78   }
79 }
80 
81 
82 const char *luaX_token2str (LexState *ls, int token) {
83   if (token < FIRST_RESERVED) {  /* single-byte symbols? */
84     lua_assert(token == cast_uchar(token));
85     return luaO_pushfstring(ls->L, "'%c'", token);
86   }
87   else {
88     const char *s = luaX_tokens[token - FIRST_RESERVED];
89     if (token < TK_EOS)  /* fixed format (symbols and reserved words)? */
90       return luaO_pushfstring(ls->L, "'%s'", s);
91     else  /* names, strings, and numerals */
92       return s;
93   }
94 }
95 
96 
97 static const char *txtToken (LexState *ls, int token) {
98   switch (token) {
99     case TK_NAME: case TK_STRING:
100     case TK_FLT: case TK_INT:
101       save(ls, '\0');
102       return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
103     default:
104       return luaX_token2str(ls, token);
105   }
106 }
107 
108 
109 static l_noret lexerror (LexState *ls, const char *msg, int token) {
110   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
111   if (token)
112     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
113   luaD_throw(ls->L, LUA_ERRSYNTAX);
114 }
115 
116 
117 l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
118   lexerror(ls, msg, ls->t.token);
119 }
120 
121 
122 /*
123 ** creates a new string and anchors it in scanner's table so that
124 ** it will not be collected until the end of the compilation
125 ** (by that time it should be anchored somewhere)
126 */
127 TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
128   lua_State *L = ls->L;
129   TValue *o;  /* entry for 'str' */
130   TString *ts = luaS_newlstr(L, str, l);  /* create new string */
131   setsvalue2s(L, L->top++, ts);  /* temporarily anchor it in stack */
132   o = luaH_set(L, ls->h, L->top - 1);
133   if (ttisnil(o)) {  /* not in use yet? */
134     /* boolean value does not need GC barrier;
135        table has no metatable, so it does not need to invalidate cache */
136     setbvalue(o, 1);  /* t[string] = true */
137     luaC_checkGC(L);
138   }
139   else {  /* string already present */
140     ts = tsvalue(keyfromval(o));  /* re-use value previously stored */
141   }
142   L->top--;  /* remove string from stack */
143   return ts;
144 }
145 
146 
147 /*
148 ** increment line number and skips newline sequence (any of
149 ** \n, \r, \n\r, or \r\n)
150 */
151 static void inclinenumber (LexState *ls) {
152   int old = ls->current;
153   lua_assert(currIsNewline(ls));
154   next(ls);  /* skip '\n' or '\r' */
155   if (currIsNewline(ls) && ls->current != old)
156     next(ls);  /* skip '\n\r' or '\r\n' */
157   if (++ls->linenumber >= MAX_INT)
158     lexerror(ls, "chunk has too many lines", 0);
159 }
160 
161 
162 void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
163                     int firstchar) {
164   ls->t.token = 0;
165   ls->L = L;
166   ls->current = firstchar;
167   ls->lookahead.token = TK_EOS;  /* no look-ahead token */
168   ls->z = z;
169   ls->fs = NULL;
170   ls->linenumber = 1;
171   ls->lastline = 1;
172   ls->source = source;
173   ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
174   luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
175 }
176 
177 
178 
179 /*
180 ** =======================================================
181 ** LEXICAL ANALYZER
182 ** =======================================================
183 */
184 
185 
186 static int check_next1 (LexState *ls, int c) {
187   if (ls->current == c) {
188     next(ls);
189     return 1;
190   }
191   else return 0;
192 }
193 
194 
195 /*
196 ** Check whether current char is in set 'set' (with two chars) and
197 ** saves it
198 */
199 static int check_next2 (LexState *ls, const char *set) {
200   lua_assert(set[2] == '\0');
201   if (ls->current == set[0] || ls->current == set[1]) {
202     save_and_next(ls);
203     return 1;
204   }
205   else return 0;
206 }
207 
208 
209 /* LUA_NUMBER */
210 /*
211 ** this function is quite liberal in what it accepts, as 'luaO_str2num'
212 ** will reject ill-formed numerals.
213 */
214 static int read_numeral (LexState *ls, SemInfo *seminfo) {
215   TValue obj;
216   const char *expo = "Ee";
217   int first = ls->current;
218   lua_assert(lisdigit(ls->current));
219   save_and_next(ls);
220   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
221     expo = "Pp";
222   for (;;) {
223     if (check_next2(ls, expo))  /* exponent part? */
224       check_next2(ls, "-+");  /* optional exponent sign */
225     if (lisxdigit(ls->current))
226       save_and_next(ls);
227     else if (ls->current == '.')
228       save_and_next(ls);
229     else break;
230   }
231   save(ls, '\0');
232   if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)  /* format error? */
233     lexerror(ls, "malformed number", TK_FLT);
234   if (ttisinteger(&obj)) {
235     seminfo->i = ivalue(&obj);
236     return TK_INT;
237   }
238   else {
239     lua_assert(ttisfloat(&obj));
240     seminfo->r = fltvalue(&obj);
241     return TK_FLT;
242   }
243 }
244 
245 
246 /*
247 ** skip a sequence '[=*[' or ']=*]'; if sequence is well formed, return
248 ** its number of '='s; otherwise, return a negative number (-1 iff there
249 ** are no '='s after initial bracket)
250 */
251 static int skip_sep (LexState *ls) {
252   int count = 0;
253   int s = ls->current;
254   lua_assert(s == '[' || s == ']');
255   save_and_next(ls);
256   while (ls->current == '=') {
257     save_and_next(ls);
258     count++;
259   }
260   return (ls->current == s) ? count : (-count) - 1;
261 }
262 
263 
264 static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
265   int line = ls->linenumber;  /* initial line (for error message) */
266   save_and_next(ls);  /* skip 2nd '[' */
267   if (currIsNewline(ls))  /* string starts with a newline? */
268     inclinenumber(ls);  /* skip it */
269   for (;;) {
270     switch (ls->current) {
271       case EOZ: {  /* error */
272         const char *what = (seminfo ? "string" : "comment");
273         const char *msg = luaO_pushfstring(ls->L,
274                      "unfinished long %s (starting at line %d)", what, line);
275         lexerror(ls, msg, TK_EOS);
276         break;  /* to avoid warnings */
277       }
278       case ']': {
279         if (skip_sep(ls) == sep) {
280           save_and_next(ls);  /* skip 2nd ']' */
281           goto endloop;
282         }
283         break;
284       }
285       case '\n': case '\r': {
286         save(ls, '\n');
287         inclinenumber(ls);
288         if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
289         break;
290       }
291       default: {
292         if (seminfo) save_and_next(ls);
293         else next(ls);
294       }
295     }
296   } endloop:
297   if (seminfo)
298     seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep),
299                                      luaZ_bufflen(ls->buff) - 2*(2 + sep));
300 }
301 
302 
303 static void esccheck (LexState *ls, int c, const char *msg) {
304   if (!c) {
305     if (ls->current != EOZ)
306       save_and_next(ls);  /* add current to buffer for error message */
307     lexerror(ls, msg, TK_STRING);
308   }
309 }
310 
311 
312 static int gethexa (LexState *ls) {
313   save_and_next(ls);
314   esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
315   return luaO_hexavalue(ls->current);
316 }
317 
318 
319 static int readhexaesc (LexState *ls) {
320   int r = gethexa(ls);
321   r = (r << 4) + gethexa(ls);
322   luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
323   return r;
324 }
325 
326 
327 static unsigned long readutf8esc (LexState *ls) {
328   unsigned long r;
329   int i = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
330   save_and_next(ls);  /* skip 'u' */
331   esccheck(ls, ls->current == '{', "missing '{'");
332   r = gethexa(ls);  /* must have at least one digit */
333   while ((save_and_next(ls), lisxdigit(ls->current))) {
334     i++;
335     r = (r << 4) + luaO_hexavalue(ls->current);
336     esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
337   }
338   esccheck(ls, ls->current == '}', "missing '}'");
339   next(ls);  /* skip '}' */
340   luaZ_buffremove(ls->buff, i);  /* remove saved chars from buffer */
341   return r;
342 }
343 
344 
345 static void utf8esc (LexState *ls) {
346   char buff[UTF8BUFFSZ];
347   int n = luaO_utf8esc(buff, readutf8esc(ls));
348   for (; n > 0; n--)  /* add 'buff' to string */
349     save(ls, buff[UTF8BUFFSZ - n]);
350 }
351 
352 
353 static int readdecesc (LexState *ls) {
354   int i;
355   int r = 0;  /* result accumulator */
356   for (i = 0; i < 3 && lisdigit(ls->current); i++) {  /* read up to 3 digits */
357     r = 10*r + ls->current - '0';
358     save_and_next(ls);
359   }
360   esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
361   luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
362   return r;
363 }
364 
365 
366 static void read_string (LexState *ls, int del, SemInfo *seminfo) {
367   save_and_next(ls);  /* keep delimiter (for error messages) */
368   while (ls->current != del) {
369     switch (ls->current) {
370       case EOZ:
371         lexerror(ls, "unfinished string", TK_EOS);
372         break;  /* to avoid warnings */
373       case '\n':
374       case '\r':
375         lexerror(ls, "unfinished string", TK_STRING);
376         break;  /* to avoid warnings */
377       case '\\': {  /* escape sequences */
378         int c;  /* final character to be saved */
379         save_and_next(ls);  /* keep '\\' for error messages */
380         switch (ls->current) {
381           case 'a': c = '\a'; goto read_save;
382           case 'b': c = '\b'; goto read_save;
383           case 'f': c = '\f'; goto read_save;
384           case 'n': c = '\n'; goto read_save;
385           case 'r': c = '\r'; goto read_save;
386           case 't': c = '\t'; goto read_save;
387           case 'v': c = '\v'; goto read_save;
388           case 'x': c = readhexaesc(ls); goto read_save;
389           case 'u': utf8esc(ls);  goto no_save;
390           case '\n': case '\r':
391             inclinenumber(ls); c = '\n'; goto only_save;
392           case '\\': case '\"': case '\'':
393             c = ls->current; goto read_save;
394           case EOZ: goto no_save;  /* will raise an error next loop */
395           case 'z': {  /* zap following span of spaces */
396             luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
397             next(ls);  /* skip the 'z' */
398             while (lisspace(ls->current)) {
399               if (currIsNewline(ls)) inclinenumber(ls);
400               else next(ls);
401             }
402             goto no_save;
403           }
404           default: {
405             esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
406             c = readdecesc(ls);  /* digital escape '\ddd' */
407             goto only_save;
408           }
409         }
410        read_save:
411          next(ls);
412          /* go through */
413        only_save:
414          luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
415          save(ls, c);
416          /* go through */
417        no_save: break;
418       }
419       default:
420         save_and_next(ls);
421     }
422   }
423   save_and_next(ls);  /* skip delimiter */
424   seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
425                                    luaZ_bufflen(ls->buff) - 2);
426 }
427 
428 
429 static int llex (LexState *ls, SemInfo *seminfo) {
430   luaZ_resetbuffer(ls->buff);
431   for (;;) {
432     switch (ls->current) {
433       case '\n': case '\r': {  /* line breaks */
434         inclinenumber(ls);
435         break;
436       }
437       case ' ': case '\f': case '\t': case '\v': {  /* spaces */
438         next(ls);
439         break;
440       }
441       case '-': {  /* '-' or '--' (comment) */
442         next(ls);
443         if (ls->current != '-') return '-';
444         /* else is a comment */
445         next(ls);
446         if (ls->current == '[') {  /* long comment? */
447           int sep = skip_sep(ls);
448           luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
449           if (sep >= 0) {
450             read_long_string(ls, NULL, sep);  /* skip long comment */
451             luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
452             break;
453           }
454         }
455         /* else short comment */
456         while (!currIsNewline(ls) && ls->current != EOZ)
457           next(ls);  /* skip until end of line (or end of file) */
458         break;
459       }
460       case '[': {  /* long string or simply '[' */
461         int sep = skip_sep(ls);
462         if (sep >= 0) {
463           read_long_string(ls, seminfo, sep);
464           return TK_STRING;
465         }
466         else if (sep != -1)  /* '[=...' missing second bracket */
467           lexerror(ls, "invalid long string delimiter", TK_STRING);
468         return '[';
469       }
470       case '=': {
471         next(ls);
472         if (check_next1(ls, '=')) return TK_EQ;
473         else return '=';
474       }
475       case '<': {
476         next(ls);
477         if (check_next1(ls, '=')) return TK_LE;
478         else if (check_next1(ls, '<')) return TK_SHL;
479         else return '<';
480       }
481       case '>': {
482         next(ls);
483         if (check_next1(ls, '=')) return TK_GE;
484         else if (check_next1(ls, '>')) return TK_SHR;
485         else return '>';
486       }
487       case '/': {
488         next(ls);
489         if (check_next1(ls, '/')) return TK_IDIV;
490         else return '/';
491       }
492       case '~': {
493         next(ls);
494         if (check_next1(ls, '=')) return TK_NE;
495         else return '~';
496       }
497       case ':': {
498         next(ls);
499         if (check_next1(ls, ':')) return TK_DBCOLON;
500         else return ':';
501       }
502       case '"': case '\'': {  /* short literal strings */
503         read_string(ls, ls->current, seminfo);
504         return TK_STRING;
505       }
506       case '.': {  /* '.', '..', '...', or number */
507         save_and_next(ls);
508         if (check_next1(ls, '.')) {
509           if (check_next1(ls, '.'))
510             return TK_DOTS;   /* '...' */
511           else return TK_CONCAT;   /* '..' */
512         }
513         else if (!lisdigit(ls->current)) return '.';
514         else return read_numeral(ls, seminfo);
515       }
516       case '0': case '1': case '2': case '3': case '4':
517       case '5': case '6': case '7': case '8': case '9': {
518         return read_numeral(ls, seminfo);
519       }
520       case EOZ: {
521         return TK_EOS;
522       }
523       default: {
524         if (lislalpha(ls->current)) {  /* identifier or reserved word? */
525           TString *ts;
526           do {
527             save_and_next(ls);
528           } while (lislalnum(ls->current));
529           ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
530                                   luaZ_bufflen(ls->buff));
531           seminfo->ts = ts;
532           if (isreserved(ts))  /* reserved word? */
533             return ts->extra - 1 + FIRST_RESERVED;
534           else {
535             return TK_NAME;
536           }
537         }
538         else {  /* single-char tokens (+ - / ...) */
539           int c = ls->current;
540           next(ls);
541           return c;
542         }
543       }
544     }
545   }
546 }
547 
548 
549 void luaX_next (LexState *ls) {
550   ls->lastline = ls->linenumber;
551   if (ls->lookahead.token != TK_EOS) {  /* is there a look-ahead token? */
552     ls->t = ls->lookahead;  /* use this one */
553     ls->lookahead.token = TK_EOS;  /* and discharge it */
554   }
555   else
556     ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
557 }
558 
559 
560 int luaX_lookahead (LexState *ls) {
561   lua_assert(ls->lookahead.token == TK_EOS);
562   ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
563   return ls->lookahead.token;
564 }
565 
566