1 /* 2 ** $Id: llex.c $ 3 ** Lexical Analyzer 4 ** See Copyright Notice in lua.h 5 */ 6 7 #define llex_c 8 #define LUA_CORE 9 10 #include "lprefix.h" 11 12 13 #include <locale.h> 14 #include <string.h> 15 16 #include "lua.h" 17 18 #include "lctype.h" 19 #include "ldebug.h" 20 #include "ldo.h" 21 #include "lgc.h" 22 #include "llex.h" 23 #include "lobject.h" 24 #include "lparser.h" 25 #include "lstate.h" 26 #include "lstring.h" 27 #include "ltable.h" 28 #include "lzio.h" 29 30 31 32 #define next(ls) (ls->current = zgetc(ls->z)) 33 34 35 36 #define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r') 37 38 39 /* ORDER RESERVED */ 40 static const char *const luaX_tokens [] = { 41 "and", "break", "do", "else", "elseif", 42 "end", "false", "for", "function", "goto", "if", 43 "in", "local", "nil", "not", "or", "repeat", 44 "return", "then", "true", "until", "while", 45 "//", "..", "...", "==", ">=", "<=", "~=", 46 "<<", ">>", "::", "<eof>", 47 "<number>", "<integer>", "<name>", "<string>" 48 }; 49 50 51 #define save_and_next(ls) (save(ls, ls->current), next(ls)) 52 53 54 static l_noret lexerror (LexState *ls, const char *msg, int token); 55 56 57 static void save (LexState *ls, int c) { 58 Mbuffer *b = ls->buff; 59 if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) { 60 size_t newsize; 61 if (luaZ_sizebuffer(b) >= MAX_SIZE/2) 62 lexerror(ls, "lexical element too long", 0); 63 newsize = luaZ_sizebuffer(b) * 2; 64 luaZ_resizebuffer(ls->L, b, newsize); 65 } 66 b->buffer[luaZ_bufflen(b)++] = cast_char(c); 67 } 68 69 70 void luaX_init (lua_State *L) { 71 int i; 72 TString *e = luaS_newliteral(L, LUA_ENV); /* create env name */ 73 luaC_fix(L, obj2gco(e)); /* never collect this name */ 74 for (i=0; i<NUM_RESERVED; i++) { 75 TString *ts = luaS_new(L, luaX_tokens[i]); 76 luaC_fix(L, obj2gco(ts)); /* reserved words are never collected */ 77 ts->extra = cast_byte(i+1); /* reserved word */ 78 } 79 } 80 81 82 const char *luaX_token2str (LexState *ls, int token) { 83 if (token < FIRST_RESERVED) { /* single-byte symbols? */ 84 if (lisprint(token)) 85 return luaO_pushfstring(ls->L, "'%c'", token); 86 else /* control character */ 87 return luaO_pushfstring(ls->L, "'<\\%d>'", token); 88 } 89 else { 90 const char *s = luaX_tokens[token - FIRST_RESERVED]; 91 if (token < TK_EOS) /* fixed format (symbols and reserved words)? */ 92 return luaO_pushfstring(ls->L, "'%s'", s); 93 else /* names, strings, and numerals */ 94 return s; 95 } 96 } 97 98 99 static const char *txtToken (LexState *ls, int token) { 100 switch (token) { 101 case TK_NAME: case TK_STRING: 102 case TK_FLT: case TK_INT: 103 save(ls, '\0'); 104 return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff)); 105 default: 106 return luaX_token2str(ls, token); 107 } 108 } 109 110 111 static l_noret lexerror (LexState *ls, const char *msg, int token) { 112 msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber); 113 if (token) 114 luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token)); 115 luaD_throw(ls->L, LUA_ERRSYNTAX); 116 } 117 118 119 l_noret luaX_syntaxerror (LexState *ls, const char *msg) { 120 lexerror(ls, msg, ls->t.token); 121 } 122 123 124 /* 125 ** creates a new string and anchors it in scanner's table so that 126 ** it will not be collected until the end of the compilation 127 ** (by that time it should be anchored somewhere) 128 */ 129 TString *luaX_newstring (LexState *ls, const char *str, size_t l) { 130 lua_State *L = ls->L; 131 TValue *o; /* entry for 'str' */ 132 TString *ts = luaS_newlstr(L, str, l); /* create new string */ 133 setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */ 134 o = luaH_set(L, ls->h, s2v(L->top - 1)); 135 if (isempty(o)) { /* not in use yet? */ 136 /* boolean value does not need GC barrier; 137 table is not a metatable, so it does not need to invalidate cache */ 138 setbtvalue(o); /* t[string] = true */ 139 luaC_checkGC(L); 140 } 141 else { /* string already present */ 142 ts = keystrval(nodefromval(o)); /* re-use value previously stored */ 143 } 144 L->top--; /* remove string from stack */ 145 return ts; 146 } 147 148 149 /* 150 ** increment line number and skips newline sequence (any of 151 ** \n, \r, \n\r, or \r\n) 152 */ 153 static void inclinenumber (LexState *ls) { 154 int old = ls->current; 155 lua_assert(currIsNewline(ls)); 156 next(ls); /* skip '\n' or '\r' */ 157 if (currIsNewline(ls) && ls->current != old) 158 next(ls); /* skip '\n\r' or '\r\n' */ 159 if (++ls->linenumber >= MAX_INT) 160 lexerror(ls, "chunk has too many lines", 0); 161 } 162 163 164 void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source, 165 int firstchar) { 166 ls->t.token = 0; 167 ls->L = L; 168 ls->current = firstchar; 169 ls->lookahead.token = TK_EOS; /* no look-ahead token */ 170 ls->z = z; 171 ls->fs = NULL; 172 ls->linenumber = 1; 173 ls->lastline = 1; 174 ls->source = source; 175 ls->envn = luaS_newliteral(L, LUA_ENV); /* get env name */ 176 luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */ 177 } 178 179 180 181 /* 182 ** ======================================================= 183 ** LEXICAL ANALYZER 184 ** ======================================================= 185 */ 186 187 188 static int check_next1 (LexState *ls, int c) { 189 if (ls->current == c) { 190 next(ls); 191 return 1; 192 } 193 else return 0; 194 } 195 196 197 /* 198 ** Check whether current char is in set 'set' (with two chars) and 199 ** saves it 200 */ 201 static int check_next2 (LexState *ls, const char *set) { 202 lua_assert(set[2] == '\0'); 203 if (ls->current == set[0] || ls->current == set[1]) { 204 save_and_next(ls); 205 return 1; 206 } 207 else return 0; 208 } 209 210 211 /* LUA_NUMBER */ 212 /* 213 ** This function is quite liberal in what it accepts, as 'luaO_str2num' 214 ** will reject ill-formed numerals. Roughly, it accepts the following 215 ** pattern: 216 ** 217 ** %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))* 218 ** 219 ** The only tricky part is to accept [+-] only after a valid exponent 220 ** mark, to avoid reading '3-4' or '0xe+1' as a single number. 221 ** 222 ** The caller might have already read an initial dot. 223 */ 224 static int read_numeral (LexState *ls, SemInfo *seminfo) { 225 TValue obj; 226 const char *expo = "Ee"; 227 int first = ls->current; 228 lua_assert(lisdigit(ls->current)); 229 save_and_next(ls); 230 if (first == '0' && check_next2(ls, "xX")) /* hexadecimal? */ 231 expo = "Pp"; 232 for (;;) { 233 if (check_next2(ls, expo)) /* exponent mark? */ 234 check_next2(ls, "-+"); /* optional exponent sign */ 235 else if (lisxdigit(ls->current) || ls->current == '.') /* '%x|%.' */ 236 save_and_next(ls); 237 else break; 238 } 239 if (lislalpha(ls->current)) /* is numeral touching a letter? */ 240 save_and_next(ls); /* force an error */ 241 save(ls, '\0'); 242 if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0) /* format error? */ 243 lexerror(ls, "malformed number", TK_FLT); 244 if (ttisinteger(&obj)) { 245 seminfo->i = ivalue(&obj); 246 return TK_INT; 247 } 248 else { 249 lua_assert(ttisfloat(&obj)); 250 seminfo->r = fltvalue(&obj); 251 return TK_FLT; 252 } 253 } 254 255 256 /* 257 ** read a sequence '[=*[' or ']=*]', leaving the last bracket. If 258 ** sequence is well formed, return its number of '='s + 2; otherwise, 259 ** return 1 if it is a single bracket (no '='s and no 2nd bracket); 260 ** otherwise (an unfinished '[==...') return 0. 261 */ 262 static size_t skip_sep (LexState *ls) { 263 size_t count = 0; 264 int s = ls->current; 265 lua_assert(s == '[' || s == ']'); 266 save_and_next(ls); 267 while (ls->current == '=') { 268 save_and_next(ls); 269 count++; 270 } 271 return (ls->current == s) ? count + 2 272 : (count == 0) ? 1 273 : 0; 274 } 275 276 277 static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) { 278 int line = ls->linenumber; /* initial line (for error message) */ 279 save_and_next(ls); /* skip 2nd '[' */ 280 if (currIsNewline(ls)) /* string starts with a newline? */ 281 inclinenumber(ls); /* skip it */ 282 for (;;) { 283 switch (ls->current) { 284 case EOZ: { /* error */ 285 const char *what = (seminfo ? "string" : "comment"); 286 const char *msg = luaO_pushfstring(ls->L, 287 "unfinished long %s (starting at line %d)", what, line); 288 lexerror(ls, msg, TK_EOS); 289 break; /* to avoid warnings */ 290 } 291 case ']': { 292 if (skip_sep(ls) == sep) { 293 save_and_next(ls); /* skip 2nd ']' */ 294 goto endloop; 295 } 296 break; 297 } 298 case '\n': case '\r': { 299 save(ls, '\n'); 300 inclinenumber(ls); 301 if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */ 302 break; 303 } 304 default: { 305 if (seminfo) save_and_next(ls); 306 else next(ls); 307 } 308 } 309 } endloop: 310 if (seminfo) 311 seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep, 312 luaZ_bufflen(ls->buff) - 2 * sep); 313 } 314 315 316 static void esccheck (LexState *ls, int c, const char *msg) { 317 if (!c) { 318 if (ls->current != EOZ) 319 save_and_next(ls); /* add current to buffer for error message */ 320 lexerror(ls, msg, TK_STRING); 321 } 322 } 323 324 325 static int gethexa (LexState *ls) { 326 save_and_next(ls); 327 esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected"); 328 return luaO_hexavalue(ls->current); 329 } 330 331 332 static int readhexaesc (LexState *ls) { 333 int r = gethexa(ls); 334 r = (r << 4) + gethexa(ls); 335 luaZ_buffremove(ls->buff, 2); /* remove saved chars from buffer */ 336 return r; 337 } 338 339 340 static unsigned long readutf8esc (LexState *ls) { 341 unsigned long r; 342 int i = 4; /* chars to be removed: '\', 'u', '{', and first digit */ 343 save_and_next(ls); /* skip 'u' */ 344 esccheck(ls, ls->current == '{', "missing '{'"); 345 r = gethexa(ls); /* must have at least one digit */ 346 while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) { 347 i++; 348 esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large"); 349 r = (r << 4) + luaO_hexavalue(ls->current); 350 } 351 esccheck(ls, ls->current == '}', "missing '}'"); 352 next(ls); /* skip '}' */ 353 luaZ_buffremove(ls->buff, i); /* remove saved chars from buffer */ 354 return r; 355 } 356 357 358 static void utf8esc (LexState *ls) { 359 char buff[UTF8BUFFSZ]; 360 int n = luaO_utf8esc(buff, readutf8esc(ls)); 361 for (; n > 0; n--) /* add 'buff' to string */ 362 save(ls, buff[UTF8BUFFSZ - n]); 363 } 364 365 366 static int readdecesc (LexState *ls) { 367 int i; 368 int r = 0; /* result accumulator */ 369 for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */ 370 r = 10*r + ls->current - '0'; 371 save_and_next(ls); 372 } 373 esccheck(ls, r <= UCHAR_MAX, "decimal escape too large"); 374 luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */ 375 return r; 376 } 377 378 379 static void read_string (LexState *ls, int del, SemInfo *seminfo) { 380 save_and_next(ls); /* keep delimiter (for error messages) */ 381 while (ls->current != del) { 382 switch (ls->current) { 383 case EOZ: 384 lexerror(ls, "unfinished string", TK_EOS); 385 break; /* to avoid warnings */ 386 case '\n': 387 case '\r': 388 lexerror(ls, "unfinished string", TK_STRING); 389 break; /* to avoid warnings */ 390 case '\\': { /* escape sequences */ 391 int c; /* final character to be saved */ 392 save_and_next(ls); /* keep '\\' for error messages */ 393 switch (ls->current) { 394 case 'a': c = '\a'; goto read_save; 395 case 'b': c = '\b'; goto read_save; 396 case 'f': c = '\f'; goto read_save; 397 case 'n': c = '\n'; goto read_save; 398 case 'r': c = '\r'; goto read_save; 399 case 't': c = '\t'; goto read_save; 400 case 'v': c = '\v'; goto read_save; 401 case 'x': c = readhexaesc(ls); goto read_save; 402 case 'u': utf8esc(ls); goto no_save; 403 case '\n': case '\r': 404 inclinenumber(ls); c = '\n'; goto only_save; 405 case '\\': case '\"': case '\'': 406 c = ls->current; goto read_save; 407 case EOZ: goto no_save; /* will raise an error next loop */ 408 case 'z': { /* zap following span of spaces */ 409 luaZ_buffremove(ls->buff, 1); /* remove '\\' */ 410 next(ls); /* skip the 'z' */ 411 while (lisspace(ls->current)) { 412 if (currIsNewline(ls)) inclinenumber(ls); 413 else next(ls); 414 } 415 goto no_save; 416 } 417 default: { 418 esccheck(ls, lisdigit(ls->current), "invalid escape sequence"); 419 c = readdecesc(ls); /* digital escape '\ddd' */ 420 goto only_save; 421 } 422 } 423 read_save: 424 next(ls); 425 /* go through */ 426 only_save: 427 luaZ_buffremove(ls->buff, 1); /* remove '\\' */ 428 save(ls, c); 429 /* go through */ 430 no_save: break; 431 } 432 default: 433 save_and_next(ls); 434 } 435 } 436 save_and_next(ls); /* skip delimiter */ 437 seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1, 438 luaZ_bufflen(ls->buff) - 2); 439 } 440 441 442 static int llex (LexState *ls, SemInfo *seminfo) { 443 luaZ_resetbuffer(ls->buff); 444 for (;;) { 445 switch (ls->current) { 446 case '\n': case '\r': { /* line breaks */ 447 inclinenumber(ls); 448 break; 449 } 450 case ' ': case '\f': case '\t': case '\v': { /* spaces */ 451 next(ls); 452 break; 453 } 454 case '-': { /* '-' or '--' (comment) */ 455 next(ls); 456 if (ls->current != '-') return '-'; 457 /* else is a comment */ 458 next(ls); 459 if (ls->current == '[') { /* long comment? */ 460 size_t sep = skip_sep(ls); 461 luaZ_resetbuffer(ls->buff); /* 'skip_sep' may dirty the buffer */ 462 if (sep >= 2) { 463 read_long_string(ls, NULL, sep); /* skip long comment */ 464 luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */ 465 break; 466 } 467 } 468 /* else short comment */ 469 while (!currIsNewline(ls) && ls->current != EOZ) 470 next(ls); /* skip until end of line (or end of file) */ 471 break; 472 } 473 case '[': { /* long string or simply '[' */ 474 size_t sep = skip_sep(ls); 475 if (sep >= 2) { 476 read_long_string(ls, seminfo, sep); 477 return TK_STRING; 478 } 479 else if (sep == 0) /* '[=...' missing second bracket? */ 480 lexerror(ls, "invalid long string delimiter", TK_STRING); 481 return '['; 482 } 483 case '=': { 484 next(ls); 485 if (check_next1(ls, '=')) return TK_EQ; /* '==' */ 486 else return '='; 487 } 488 case '<': { 489 next(ls); 490 if (check_next1(ls, '=')) return TK_LE; /* '<=' */ 491 else if (check_next1(ls, '<')) return TK_SHL; /* '<<' */ 492 else return '<'; 493 } 494 case '>': { 495 next(ls); 496 if (check_next1(ls, '=')) return TK_GE; /* '>=' */ 497 else if (check_next1(ls, '>')) return TK_SHR; /* '>>' */ 498 else return '>'; 499 } 500 case '/': { 501 next(ls); 502 if (check_next1(ls, '/')) return TK_IDIV; /* '//' */ 503 else return '/'; 504 } 505 case '~': { 506 next(ls); 507 if (check_next1(ls, '=')) return TK_NE; /* '~=' */ 508 else return '~'; 509 } 510 case ':': { 511 next(ls); 512 if (check_next1(ls, ':')) return TK_DBCOLON; /* '::' */ 513 else return ':'; 514 } 515 case '"': case '\'': { /* short literal strings */ 516 read_string(ls, ls->current, seminfo); 517 return TK_STRING; 518 } 519 case '.': { /* '.', '..', '...', or number */ 520 save_and_next(ls); 521 if (check_next1(ls, '.')) { 522 if (check_next1(ls, '.')) 523 return TK_DOTS; /* '...' */ 524 else return TK_CONCAT; /* '..' */ 525 } 526 else if (!lisdigit(ls->current)) return '.'; 527 else return read_numeral(ls, seminfo); 528 } 529 case '0': case '1': case '2': case '3': case '4': 530 case '5': case '6': case '7': case '8': case '9': { 531 return read_numeral(ls, seminfo); 532 } 533 case EOZ: { 534 return TK_EOS; 535 } 536 default: { 537 if (lislalpha(ls->current)) { /* identifier or reserved word? */ 538 TString *ts; 539 do { 540 save_and_next(ls); 541 } while (lislalnum(ls->current)); 542 ts = luaX_newstring(ls, luaZ_buffer(ls->buff), 543 luaZ_bufflen(ls->buff)); 544 seminfo->ts = ts; 545 if (isreserved(ts)) /* reserved word? */ 546 return ts->extra - 1 + FIRST_RESERVED; 547 else { 548 return TK_NAME; 549 } 550 } 551 else { /* single-char tokens ('+', '*', '%', '{', '}', ...) */ 552 int c = ls->current; 553 next(ls); 554 return c; 555 } 556 } 557 } 558 } 559 } 560 561 562 void luaX_next (LexState *ls) { 563 ls->lastline = ls->linenumber; 564 if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */ 565 ls->t = ls->lookahead; /* use this one */ 566 ls->lookahead.token = TK_EOS; /* and discharge it */ 567 } 568 else 569 ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */ 570 } 571 572 573 int luaX_lookahead (LexState *ls) { 574 lua_assert(ls->lookahead.token == TK_EOS); 575 ls->lookahead.token = llex(ls, &ls->lookahead.seminfo); 576 return ls->lookahead.token; 577 } 578 579